From c17f9d9ab357f9ceafe7ac7b56fceab60753d1c9 Mon Sep 17 00:00:00 2001 From: scosman Date: Wed, 12 Feb 2025 14:55:21 -0500 Subject: [PATCH 001/102] New datamodel for evals w tests. --- libs/core/kiln_ai/datamodel/__init__.py | 1 + libs/core/kiln_ai/datamodel/eval.py | 99 ++++++++++ libs/core/kiln_ai/datamodel/eval_datamodel.py | 10 - libs/core/kiln_ai/datamodel/task.py | 5 + .../core/kiln_ai/datamodel/test_eval_model.py | 179 ++++++++++++++++++ 5 files changed, 284 insertions(+), 10 deletions(-) create mode 100644 libs/core/kiln_ai/datamodel/eval.py delete mode 100644 libs/core/kiln_ai/datamodel/eval_datamodel.py create mode 100644 libs/core/kiln_ai/datamodel/test_eval_model.py diff --git a/libs/core/kiln_ai/datamodel/__init__.py b/libs/core/kiln_ai/datamodel/__init__.py index 0d622418..fe377f54 100644 --- a/libs/core/kiln_ai/datamodel/__init__.py +++ b/libs/core/kiln_ai/datamodel/__init__.py @@ -63,4 +63,5 @@ "TaskOutputRating", "StructuredOutputMode", "FinetuneDataStrategy", + "Eval", ] diff --git a/libs/core/kiln_ai/datamodel/eval.py b/libs/core/kiln_ai/datamodel/eval.py new file mode 100644 index 00000000..8af2b97d --- /dev/null +++ b/libs/core/kiln_ai/datamodel/eval.py @@ -0,0 +1,99 @@ +import json +from enum import Enum +from typing import TYPE_CHECKING, Any, Union + +from pydantic import Field, model_validator +from typing_extensions import Self + +from kiln_ai.datamodel.basemodel import ( + ID_TYPE, + NAME_FIELD, + KilnParentedModel, + KilnParentModel, +) +from kiln_ai.datamodel.task_output import DataSource, DataSourceType + +if TYPE_CHECKING: + from kiln_ai.datamodel.task import Task + + +class EvalState(str, Enum): + enabled = "enabled" + disabled = "disabled" + + +class EvalConfigType(str, Enum): + g_eval = "g_eval" + + +class EvalConfig(KilnParentedModel): + """ + A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc. + + A eval might have many configs, example running the same eval with 2 different models. Comparing eval results is only valid when the same eval is run with the same config. + """ + + name: str = NAME_FIELD + model: DataSource = Field(description="The model to use for this eval config.") + config_type: EvalConfigType = Field( + default=EvalConfigType.g_eval, + description="This is used to determine the type of eval to run.", + ) + properties: dict[str, Any] = Field( + default={}, + description="Properties to be used to execute the eval config. 
This is config_type specific and should serialize to a json dict.", + ) + + def parent_eval(self) -> "Eval": + if self.parent is None or self.parent.__class__.__name__ != "Eval": + raise ValueError("parent must be an Eval") + return self.parent # type: ignore + + @model_validator(mode="after") + def validate_properties(self) -> Self: + if self.config_type == EvalConfigType.g_eval: + if "g_eval_steps" not in self.properties or not isinstance( + self.properties["g_eval_steps"], list + ): + raise ValueError( + "g_eval_steps is required and must be a list for g_eval" + ) + return self + else: + raise ValueError(f"Invalid eval config type: {self.config_type}") + + @model_validator(mode="after") + def validate_model(self) -> Self: + if self.model.type != DataSourceType.synthetic: + raise ValueError("model must be a synthetic model for an eval config") + return self + + @model_validator(mode="after") + def validate_json_serializable(self) -> "EvalConfig": + try: + # This will raise a TypeError if the dict contains non-JSON-serializable objects + json.dumps(self.properties) + except TypeError as e: + raise ValueError(f"Properties must be JSON serializable: {str(e)}") + return self + + +class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}): + name: str = NAME_FIELD + description: str | None = Field( + default=None, description="The description of the eval" + ) + state: EvalState = Field( + default=EvalState.enabled, + description="The state of the eval: enabled or disabled.", + ) + current_config_id: ID_TYPE = Field( + default=None, + description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.", + ) + + # Workaround to return typed parent without importing Task + def parent_task(self) -> Union["Task", None]: + if self.parent is None or self.parent.__class__.__name__ != "Task": + return None + return self.parent # type: ignore diff --git a/libs/core/kiln_ai/datamodel/eval_datamodel.py b/libs/core/kiln_ai/datamodel/eval_datamodel.py deleted file mode 100644 index 6cf4a23b..00000000 --- a/libs/core/kiln_ai/datamodel/eval_datamodel.py +++ /dev/null @@ -1,10 +0,0 @@ -from pydantic import Field - -from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnBaseModel - - -class Eval(KilnBaseModel): - name: str = NAME_FIELD - description: str | None = Field( - default=None, description="The description of the eval" - ) diff --git a/libs/core/kiln_ai/datamodel/task.py b/libs/core/kiln_ai/datamodel/task.py index 38ac7885..37a32768 100644 --- a/libs/core/kiln_ai/datamodel/task.py +++ b/libs/core/kiln_ai/datamodel/task.py @@ -13,6 +13,7 @@ ) from kiln_ai.datamodel.datamodel_enums import Priority, TaskOutputRatingType from kiln_ai.datamodel.dataset_split import DatasetSplit +from kiln_ai.datamodel.eval import Eval from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str from kiln_ai.datamodel.prompt import Prompt from kiln_ai.datamodel.task_run import TaskRun @@ -42,6 +43,7 @@ class Task( "dataset_splits": DatasetSplit, "finetunes": Finetune, "prompts": Prompt, + "evals": Eval, }, ): """ @@ -90,3 +92,6 @@ def finetunes(self, readonly: bool = False) -> list[Finetune]: def prompts(self, readonly: bool = False) -> list[Prompt]: return super().prompts(readonly=readonly) # type: ignore + + def evals(self, readonly: bool = False) -> list[Eval]: + return super().evals(readonly=readonly) # type: ignore diff --git a/libs/core/kiln_ai/datamodel/test_eval_model.py 
b/libs/core/kiln_ai/datamodel/test_eval_model.py new file mode 100644 index 00000000..d54bc8c8 --- /dev/null +++ b/libs/core/kiln_ai/datamodel/test_eval_model.py @@ -0,0 +1,179 @@ +import pytest + +from kiln_ai.datamodel.basemodel import KilnParentModel +from kiln_ai.datamodel.eval import ( + Eval, + EvalConfig, + EvalConfigType, + EvalState, +) +from kiln_ai.datamodel.task import Task +from kiln_ai.datamodel.task_output import DataSource, DataSourceType + + +@pytest.fixture +def mock_task(): + return Task(name="Test Task", instruction="Test instruction") + + +@pytest.fixture +def valid_eval_config_data(): + return { + "name": "Test Config", + "model_provider": "openai", + "model_name": "gpt-4", + "config_type": EvalConfigType.g_eval, + "properties": {"g_eval_steps": ["step1", "step2"]}, + } + + +def test_eval_state_values(): + assert EvalState.enabled == "enabled" + assert EvalState.disabled == "disabled" + assert len(EvalState) == 2 + + +def test_eval_config_type_values(): + assert EvalConfigType.g_eval == "g_eval" + assert len(EvalConfigType) == 1 + + +@pytest.fixture +def valid_eval_config_data(): + return { + "name": "Test Config", + "config_type": EvalConfigType.g_eval, + "properties": {"g_eval_steps": ["step1", "step2"]}, + "model": DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt-4", + "model_provider": "openai", + "adapter_name": "openai_compatible", + }, + ), + } + + +@pytest.fixture +def valid_eval_config(valid_eval_config_data): + return EvalConfig(**valid_eval_config_data) + + +def test_eval_config_valid(valid_eval_config): + assert valid_eval_config.name == "Test Config" + assert valid_eval_config.config_type == EvalConfigType.g_eval + assert valid_eval_config.properties["g_eval_steps"] == ["step1", "step2"] + assert valid_eval_config.model.type == DataSourceType.synthetic + assert valid_eval_config.model.properties["model_name"] == "gpt-4" + assert valid_eval_config.model.properties["model_provider"] == "openai" + assert valid_eval_config.model.properties["adapter_name"] == "openai_compatible" + + +def test_eval_config_missing_g_eval_steps(valid_eval_config): + with pytest.raises( + ValueError, match="g_eval_steps is required and must be a list for g_eval" + ): + valid_eval_config.properties = {} + + +def test_eval_config_invalid_json(valid_eval_config): + class InvalidClass: + pass + + with pytest.raises(ValueError, match="Properties must be JSON serializable"): + valid_eval_config.properties = { + "g_eval_steps": [], + "invalid_key": InvalidClass(), + } + + +def test_eval_config_invalid_g_eval_steps_type(valid_eval_config): + with pytest.raises( + ValueError, match="g_eval_steps is required and must be a list for g_eval" + ): + valid_eval_config.properties = {"g_eval_steps": "not a list"} + + +def test_eval_config_invalid_config_type(valid_eval_config): + # Create an invalid config type using string + with pytest.raises(ValueError): + valid_eval_config.config_type = "invalid_type" + + +def test_human_datasource(valid_eval_config): + with pytest.raises(ValueError): + valid_eval_config.model.type = DataSourceType.human + # Not ideal - error isn'd caught until we try to save or set a root field + valid_eval_config.name = "Test Config" + + +def test_eval_basic_properties(): + eval = Eval( + name="Test Eval", + description="Test Description", + state=EvalState.enabled, + current_config_id="config123", + ) + + assert eval.name == "Test Eval" + assert eval.description == "Test Description" + assert eval.state == EvalState.enabled + assert 
eval.current_config_id == "config123" + + +def test_eval_default_values(): + eval = Eval(name="Test Eval") + + assert eval.description is None + assert eval.state == EvalState.enabled + assert eval.current_config_id is None + + +def test_eval_parent_task_relationship(mock_task, valid_eval_config_data): + eval = Eval(name="Test Eval", parent=mock_task) + config = EvalConfig(parent=eval, **valid_eval_config_data) + + assert eval.parent_task() == mock_task + assert eval.parent == mock_task + assert config.parent == eval + assert config.parent_eval() == eval + + +def test_eval_parent_task_none(): + eval = Eval(name="Test Eval") + assert eval.parent_task() is None + + +def test_eval_parent_task_wrong_type(): + # Create a non-Task parent + class DummyParent(KilnParentModel, parent_of={}): + pass + + with pytest.raises(ValueError): + Eval(name="Test Eval", parent=DummyParent()) + + +def test_eval_with_configs(mock_task, valid_eval_config_data, tmp_path): + task_path = tmp_path / "task.kiln" + mock_task.path = task_path + mock_task.save_to_file() + + eval = Eval(name="Test Eval", parent=mock_task) + eval.save_to_file() + + # Add config using the parent relationship + config = EvalConfig(parent=eval, **valid_eval_config_data) + config.save_to_file() + + # Test configs can be retrieved from disk + evals = mock_task.evals() + assert len(evals) == 1 + assert evals[0].name == "Test Eval" + configs = evals[0].configs() + assert len(configs) == 1 + assert configs[0].name == "Test Config" + assert configs[0].model.properties["model_provider"] == "openai" + + # and back up + assert configs[0].parent_eval().parent_task().path == task_path From f5596e21735311e9548d56b1be8e32bc042ac5ae Mon Sep 17 00:00:00 2001 From: scosman Date: Thu, 13 Feb 2025 12:21:38 -0500 Subject: [PATCH 002/102] title to json key function w tests --- libs/core/kiln_ai/datamodel/json_schema.py | 6 +++++ .../core/kiln_ai/datamodel/test_eval_model.py | 11 --------- .../kiln_ai/datamodel/test_json_schema.py | 23 +++++++++++++++++++ 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/libs/core/kiln_ai/datamodel/json_schema.py b/libs/core/kiln_ai/datamodel/json_schema.py index ffa1267e..146e4ca3 100644 --- a/libs/core/kiln_ai/datamodel/json_schema.py +++ b/libs/core/kiln_ai/datamodel/json_schema.py @@ -1,4 +1,5 @@ import json +import re from typing import Annotated, Dict import jsonschema @@ -83,3 +84,8 @@ def schema_from_json_str(v: str) -> Dict: raise ValueError(f"Invalid JSON: {v}\n {e}") except Exception as e: raise ValueError(f"Unexpected error parsing JSON schema: {v}\n {e}") + + +def string_to_json_key(s: str) -> str: + """Convert a string to a valid JSON key.""" + return re.sub(r"[^a-z0-9_]", "", s.strip().lower().replace(" ", "_")) diff --git a/libs/core/kiln_ai/datamodel/test_eval_model.py b/libs/core/kiln_ai/datamodel/test_eval_model.py index d54bc8c8..b374a007 100644 --- a/libs/core/kiln_ai/datamodel/test_eval_model.py +++ b/libs/core/kiln_ai/datamodel/test_eval_model.py @@ -16,17 +16,6 @@ def mock_task(): return Task(name="Test Task", instruction="Test instruction") -@pytest.fixture -def valid_eval_config_data(): - return { - "name": "Test Config", - "model_provider": "openai", - "model_name": "gpt-4", - "config_type": EvalConfigType.g_eval, - "properties": {"g_eval_steps": ["step1", "step2"]}, - } - - def test_eval_state_values(): assert EvalState.enabled == "enabled" assert EvalState.disabled == "disabled" diff --git a/libs/core/kiln_ai/datamodel/test_json_schema.py b/libs/core/kiln_ai/datamodel/test_json_schema.py 
index 1f574aa7..f2300078 100644 --- a/libs/core/kiln_ai/datamodel/test_json_schema.py +++ b/libs/core/kiln_ai/datamodel/test_json_schema.py @@ -4,6 +4,7 @@ from kiln_ai.datamodel.json_schema import ( JsonObjectSchema, schema_from_json_str, + string_to_json_key, validate_schema, ) @@ -123,3 +124,25 @@ def test_triangle_schema(): validate_schema({"a": 1, "b": 2, "c": 3}, json_triangle_schema) with pytest.raises(Exception): validate_schema({"a": 1, "b": 2, "c": "3"}, json_triangle_schema) + + +@pytest.mark.parametrize( + "input_str,expected", + [ + ("hello world", "hello_world"), + ("Hello World", "hello_world"), + ("hello_world", "hello_world"), + ("HELLO WORLD", "hello_world"), + ("hello123", "hello123"), + ("hello-world", "helloworld"), + ("hello!@#$%^&*()world", "helloworld"), + (" hello world ", "hello__world"), + ("hello__world", "hello__world"), + ("", ""), + ("!@#$%", ""), + ("snake_case_string", "snake_case_string"), + ("camelCaseString", "camelcasestring"), + ], +) +def test_string_to_json_key(input_str: str, expected: str): + assert string_to_json_key(input_str) == expected From 3aa608e9f2060db563c80395f7fbce9949d015ed Mon Sep 17 00:00:00 2001 From: scosman Date: Thu, 13 Feb 2025 12:40:45 -0500 Subject: [PATCH 003/102] checkpoint of g_eval work, has working json_schema output, and initial framework. Not up and running yet --- libs/core/kiln_ai/datamodel/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libs/core/kiln_ai/datamodel/__init__.py b/libs/core/kiln_ai/datamodel/__init__.py index fe377f54..0c276aaa 100644 --- a/libs/core/kiln_ai/datamodel/__init__.py +++ b/libs/core/kiln_ai/datamodel/__init__.py @@ -11,6 +11,7 @@ from __future__ import annotations +from kiln_ai.datamodel import dataset_split, eval, strict_mode from kiln_ai.datamodel.datamodel_enums import ( FinetuneDataStrategy, FineTuneStatusType, @@ -43,6 +44,7 @@ __all__ = [ "strict_mode", "dataset_split", + "eval", "Task", "Project", "TaskRun", @@ -63,5 +65,4 @@ "TaskOutputRating", "StructuredOutputMode", "FinetuneDataStrategy", - "Eval", ] From 92f7bccdb189c7e95fd423ee2c9b907b885faf70 Mon Sep 17 00:00:00 2001 From: scosman Date: Fri, 14 Feb 2025 16:52:58 -0500 Subject: [PATCH 004/102] Refactor our prompt ID system to 1 uniform ID, with a pydantic type and validator --- app/desktop/studio_server/data_gen_api.py | 6 +- app/desktop/studio_server/finetune_api.py | 4 +- app/desktop/studio_server/prompt_api.py | 4 +- .../studio_server/test_data_gen_api.py | 6 +- .../studio_server/test_finetune_api.py | 2 +- app/desktop/studio_server/test_prompt_api.py | 22 +--- app/web_ui/src/lib/api_schema.d.ts | 41 +++--- .../[task_id]/[run_id]/run/+page.svelte | 3 +- .../[task_id]/create_finetune/+page.svelte | 2 +- .../[project_id]/[task_id]/+page.svelte | 2 +- .../[generator_id]/+page.svelte | 3 +- app/web_ui/src/routes/(app)/run/+page.svelte | 4 +- libs/core/kiln_ai/adapters/prompt_builders.py | 108 +++++++++++---- .../kiln_ai/adapters/repair/repair_task.py | 30 ++--- .../kiln_ai/adapters/test_prompt_builders.py | 123 +++++++++++++++--- libs/core/kiln_ai/datamodel/__init__.py | 3 +- libs/core/kiln_ai/datamodel/prompt.py | 20 ++- libs/server/kiln_server/prompt_api.py | 23 ++-- libs/server/kiln_server/run_api.py | 6 +- libs/server/kiln_server/test_prompt_api.py | 20 +-- 20 files changed, 285 insertions(+), 147 deletions(-) diff --git a/app/desktop/studio_server/data_gen_api.py b/app/desktop/studio_server/data_gen_api.py index a4f05315..2d93b60b 100644 --- a/app/desktop/studio_server/data_gen_api.py +++ 
b/app/desktop/studio_server/data_gen_api.py @@ -6,7 +6,7 @@ DataGenSampleTask, DataGenSampleTaskInput, ) -from kiln_ai.adapters.prompt_builders import prompt_builder_from_ui_name +from kiln_ai.adapters.prompt_builders import PromptId, prompt_builder_from_id from kiln_ai.datamodel import DataSource, DataSourceType, TaskRun from kiln_server.run_api import model_provider_from_string from kiln_server.task_api import task_from_id @@ -60,7 +60,7 @@ class DataGenSaveSamplesApiInput(BaseModel): ) output_model_name: str = Field(description="The name of the model to use") output_provider: str = Field(description="The provider of the model to use") - prompt_method: str = Field( + prompt_method: PromptId = Field( description="The prompt method used to generate the output" ) @@ -122,7 +122,7 @@ async def save_sample( ) -> TaskRun: task = task_from_id(project_id, task_id) - prompt_builder = prompt_builder_from_ui_name(sample.prompt_method, task) + prompt_builder = prompt_builder_from_id(sample.prompt_method, task) tags = ["synthetic"] if session_id: diff --git a/app/desktop/studio_server/finetune_api.py b/app/desktop/studio_server/finetune_api.py index f4e09a43..82744ed8 100644 --- a/app/desktop/studio_server/finetune_api.py +++ b/app/desktop/studio_server/finetune_api.py @@ -11,7 +11,7 @@ ) from kiln_ai.adapters.prompt_builders import ( chain_of_thought_prompt, - prompt_builder_from_ui_name, + prompt_builder_from_id, ) from kiln_ai.adapters.provider_tools import ( provider_enabled, @@ -340,7 +340,7 @@ def system_message_from_request( detail="System message generator is required when custom system message is not provided", ) try: - prompt_builder = prompt_builder_from_ui_name(system_message_generator, task) + prompt_builder = prompt_builder_from_id(system_message_generator, task) system_message = prompt_builder.build_prompt( include_json_instructions=False ) diff --git a/app/desktop/studio_server/prompt_api.py b/app/desktop/studio_server/prompt_api.py index d43b8760..6a494cdb 100644 --- a/app/desktop/studio_server/prompt_api.py +++ b/app/desktop/studio_server/prompt_api.py @@ -1,5 +1,5 @@ from fastapi import FastAPI, HTTPException -from kiln_ai.adapters.prompt_builders import prompt_builder_from_ui_name +from kiln_ai.adapters.prompt_builders import prompt_builder_from_id from kiln_server.task_api import task_from_id from pydantic import BaseModel @@ -18,7 +18,7 @@ async def generate_prompt( task = task_from_id(project_id, task_id) try: - prompt_builder = prompt_builder_from_ui_name(prompt_generator, task) + prompt_builder = prompt_builder_from_id(prompt_generator, task) prompt = prompt_builder.build_prompt_for_ui() except Exception as e: raise HTTPException(status_code=400, detail=str(e)) diff --git a/app/desktop/studio_server/test_data_gen_api.py b/app/desktop/studio_server/test_data_gen_api.py index 1bb39875..80d9dcaf 100644 --- a/app/desktop/studio_server/test_data_gen_api.py +++ b/app/desktop/studio_server/test_data_gen_api.py @@ -160,7 +160,7 @@ def test_save_sample_success_paid_run( input_provider="openai", output_model_name="gpt_4o_mini", output_provider="openai", - prompt_method="basic", + prompt_method="simple_prompt_builder", topic_path=[], # No topic path ) @@ -215,7 +215,7 @@ def test_save_sample_success_with_mock_invoke( input_provider="openai", output_model_name="gpt_4o_mini", output_provider="openai", - prompt_method="basic", + prompt_method="simple_prompt_builder", topic_path=["AI", "Machine Learning", "Deep Learning"], ) @@ -270,7 +270,7 @@ def test_save_sample_success_with_topic_path( 
input_provider="openai", output_model_name="gpt_4o_mini", output_provider="openai", - prompt_method="basic", + prompt_method="simple_prompt_builder", ) # Act diff --git a/app/desktop/studio_server/test_finetune_api.py b/app/desktop/studio_server/test_finetune_api.py index 4e99fe4c..087e73a9 100644 --- a/app/desktop/studio_server/test_finetune_api.py +++ b/app/desktop/studio_server/test_finetune_api.py @@ -660,7 +660,7 @@ def mock_prompt_builder(): builder.build_prompt.return_value = "Generated system message" with unittest.mock.patch( - "app.desktop.studio_server.finetune_api.prompt_builder_from_ui_name", + "app.desktop.studio_server.finetune_api.prompt_builder_from_id", return_value=builder, ) as mock: yield mock, builder diff --git a/app/desktop/studio_server/test_prompt_api.py b/app/desktop/studio_server/test_prompt_api.py index 35c0f17c..f9cfcf6c 100644 --- a/app/desktop/studio_server/test_prompt_api.py +++ b/app/desktop/studio_server/test_prompt_api.py @@ -37,10 +37,8 @@ def mock_task(): @pytest.fixture -def mock_prompt_builder_from_ui_name(mock_task): - with patch( - "app.desktop.studio_server.prompt_api.prompt_builder_from_ui_name" - ) as mock: +def mock_prompt_builder_from_id(mock_task): + with patch("app.desktop.studio_server.prompt_api.prompt_builder_from_id") as mock: mock.return_value = MockPromptBuilder(mock_task) yield mock @@ -53,7 +51,7 @@ def mock_task_from_id(mock_task): def test_generate_prompt_success( - client, mock_task, mock_prompt_builder_from_ui_name, mock_task_from_id + client, mock_task, mock_prompt_builder_from_id, mock_task_from_id ): response = client.get( "/api/projects/project123/task/task456/gen_prompt/mock_generator" @@ -68,17 +66,13 @@ def test_generate_prompt_success( } mock_task_from_id.assert_called_once_with("project123", "task456") - mock_prompt_builder_from_ui_name.assert_called_once_with( - "mock_generator", mock_task - ) + mock_prompt_builder_from_id.assert_called_once_with("mock_generator", mock_task) def test_generate_prompt_exception( - client, mock_task, mock_prompt_builder_from_ui_name, mock_task_from_id + client, mock_task, mock_prompt_builder_from_id, mock_task_from_id ): - mock_prompt_builder_from_ui_name.side_effect = ValueError( - "Invalid prompt generator" - ) + mock_prompt_builder_from_id.side_effect = ValueError("Invalid prompt generator") response = client.get( "/api/projects/project123/task/task456/gen_prompt/invalid_generator" @@ -89,6 +83,4 @@ def test_generate_prompt_exception( assert data == {"detail": "Invalid prompt generator"} mock_task_from_id.assert_called_once_with("project123", "task456") - mock_prompt_builder_from_ui_name.assert_called_once_with( - "invalid_generator", mock_task - ) + mock_prompt_builder_from_id.assert_called_once_with("invalid_generator", mock_task) diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index be6777a5..f32f1cb3 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -1179,7 +1179,7 @@ export interface components { * Where models have instruct and raw versions, instruct is default and raw is specified. 
* @enum {string} */ - ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b"; + ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b" | "deepseek_r1_distill_qwen_1p5b" | "deepseek_r1_distill_qwen_7b" | "deepseek_r1_distill_llama_8b"; /** OllamaConnection */ OllamaConnection: { /** Message */ @@ -1269,9 +1269,29 @@ export interface components { }; /** * Prompt - * @description A prompt for a task. + * @description A prompt for a task. This is the custom prompt parented by a task. */ Prompt: { + /** + * Name + * @description A name for this entity. + */ + name: string; + /** + * Generator Id + * @description The id of the generator that created this prompt. + */ + generator_id?: string | null; + /** + * Prompt + * @description The prompt for the task. + */ + prompt: string; + /** + * Chain Of Thought Instructions + * @description Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided. + */ + chain_of_thought_instructions?: string | null; /** * V * @default 1 @@ -1288,21 +1308,6 @@ export interface components { created_at?: string; /** Created By */ created_by?: string; - /** - * Name - * @description A name for this entity. - */ - name: string; - /** - * Prompt - * @description The prompt for the task. - */ - prompt: string; - /** - * Chain Of Thought Instructions - * @description Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided. 
- */ - chain_of_thought_instructions?: string | null; /** Model Type */ readonly model_type: string; }; @@ -1328,8 +1333,6 @@ export interface components { PromptGenerator: { /** Id */ id: string; - /** Ui Id */ - ui_id: string; /** Short Description */ short_description: string; /** Description */ diff --git a/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte b/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte index d923870c..49f015d9 100644 --- a/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte +++ b/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte @@ -36,8 +36,7 @@ )?.name let prompt_generator_name = $current_task_prompts?.generators.find( (generator) => - generator.ui_id === - run?.output?.source?.properties?.prompt_builder_name, + generator.id === run?.output?.source?.properties?.prompt_builder_name, )?.name // Special case for fine-tuned prompts diff --git a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte index 1e7100aa..83064af0 100644 --- a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte +++ b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte @@ -31,7 +31,7 @@ let finetune_custom_system_prompt = "" let finetune_custom_thinking_instructions = "Think step by step, explaining your reasoning." - let system_prompt_method = "basic" + let system_prompt_method = "simple_prompt_builder" $: project_id = $page.params.project_id $: task_id = $page.params.task_id diff --git a/app/web_ui/src/routes/(app)/generate/[project_id]/[task_id]/+page.svelte b/app/web_ui/src/routes/(app)/generate/[project_id]/[task_id]/+page.svelte index 5dac9699..0b38a966 100644 --- a/app/web_ui/src/routes/(app)/generate/[project_id]/[task_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/generate/[project_id]/[task_id]/+page.svelte @@ -28,7 +28,7 @@ $: project_id = $page.params.project_id $: task_id = $page.params.task_id - let prompt_method = "basic" + let prompt_method = "simple_prompt_builder" let model: string = $ui_state.selected_model // Shared vars for all nodes, so UI saves last used value diff --git a/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/generator_details/[generator_id]/+page.svelte b/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/generator_details/[generator_id]/+page.svelte index 329cf1fc..ee84ebc6 100644 --- a/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/generator_details/[generator_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/generator_details/[generator_id]/+page.svelte @@ -81,7 +81,8 @@ To improve the quality of this prompt, edit the task instructions or requirements, or add more data to your dataset by running the task, or add ratings and repairs to your diff --git a/app/web_ui/src/routes/(app)/run/+page.svelte b/app/web_ui/src/routes/(app)/run/+page.svelte index e0c3c57e..c7324078 100644 --- a/app/web_ui/src/routes/(app)/run/+page.svelte +++ b/app/web_ui/src/routes/(app)/run/+page.svelte @@ -20,7 +20,7 @@ let input_form: RunInputForm - let prompt_method = "basic" + let prompt_method = "simple_prompt_builder" let model: string = $ui_state.selected_model $: model_name = model ? 
model.split("/").slice(1).join("/") : "" @@ -107,7 +107,7 @@ } else { if (prompt_method == "custom") { // Reset to basic, since custom is no longer available - prompt_method = "basic" + prompt_method = "simple_prompt_builder" } } } diff --git a/libs/core/kiln_ai/adapters/prompt_builders.py b/libs/core/kiln_ai/adapters/prompt_builders.py index 94fbdb59..62c27b58 100644 --- a/libs/core/kiln_ai/adapters/prompt_builders.py +++ b/libs/core/kiln_ai/adapters/prompt_builders.py @@ -1,8 +1,12 @@ import json from abc import ABCMeta, abstractmethod -from typing import Dict +from enum import StrEnum +from typing import Annotated, Dict + +from pydantic import AfterValidator from kiln_ai.datamodel import Task, TaskRun +from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error from kiln_ai.utils.formatting import snake_case @@ -337,25 +341,67 @@ def chain_of_thought_prompt(self) -> str | None: return self.fine_tune_model.thinking_instructions -# TODO P2: we end up with 2 IDs for these: the keys here (ui_name) and the prompt_builder_name from the class -# We end up maintaining this in _prompt_generators as well. -prompt_builder_registry = { - "simple_prompt_builder": SimplePromptBuilder, - "multi_shot_prompt_builder": MultiShotPromptBuilder, - "few_shot_prompt_builder": FewShotPromptBuilder, - "repairs_prompt_builder": RepairsPromptBuilder, - "simple_chain_of_thought_prompt_builder": SimpleChainOfThoughtPromptBuilder, - "few_shot_chain_of_thought_prompt_builder": FewShotChainOfThoughtPromptBuilder, - "multi_shot_chain_of_thought_prompt_builder": MultiShotChainOfThoughtPromptBuilder, -} +# Generators that can take any task and build a prompt +class PromptGenerators(StrEnum): + SIMPLE = "simple_prompt_builder" + MULTI_SHOT = "multi_shot_prompt_builder" + FEW_SHOT = "few_shot_prompt_builder" + REPAIRS = "repairs_prompt_builder" + SIMPLE_CHAIN_OF_THOUGHT = "simple_chain_of_thought_prompt_builder" + FEW_SHOT_CHAIN_OF_THOUGHT = "few_shot_chain_of_thought_prompt_builder" + MULTI_SHOT_CHAIN_OF_THOUGHT = "multi_shot_chain_of_thought_prompt_builder" + + +prompt_generator_values = [pg.value for pg in PromptGenerators] + + +# Our prompt ID can be one of: +# - A saved prompt ID +# - A fine-tune prompt ID +# - A prompt generator name +PromptId = Annotated[ + str, + AfterValidator(lambda v: _check_prompt_id(v)), +] +""" +A pydantic type that validates strings containing a valid prompt ID. +""" + + +def _check_prompt_id(id: str) -> str: + """ + Check that the prompt ID is valid. + """ + if id in prompt_generator_values: + return id + + if id.startswith("id::"): + # check it has 4 parts divided by :: -- 'id::project_id::task_id::prompt_id' + parts = id.split("::") + if len(parts) != 4: + raise ValueError( + f"Invalid saved prompt ID: {id}. Expected format: 'id::[project_id]::[task_id]::[prompt_id]'." + ) + return id + + if id.startswith("fine_tune_prompt::"): + # check it had a fine_tune_id after the :: -- 'fine_tune_prompt::fine_tune_id' + fine_tune_id = id[18:] + if len(fine_tune_id) == 0: + raise ValueError( + f"Invalid fine-tune prompt ID: {id}. Expected format: 'fine_tune_prompt::[fine_tune_id]'." + ) + return id + + raise ValueError(f"Invalid prompt ID: {id}") # Our UI has some names that are not the same as the class names, which also hint parameters. -def prompt_builder_from_ui_name(ui_name: str, task: Task) -> BasePromptBuilder: +def prompt_builder_from_id(prompt_id: str, task: Task) -> BasePromptBuilder: """Convert a name used in the UI to the corresponding prompt builder class. 
Args: - ui_name (str): The UI name for the prompt builder type. + prompt_id (str): The prompt ID. Returns: type[BasePromptBuilder]: The corresponding prompt builder class. @@ -365,29 +411,35 @@ def prompt_builder_from_ui_name(ui_name: str, task: Task) -> BasePromptBuilder: """ # Saved prompts are prefixed with "id::" - if ui_name.startswith("id::"): - prompt_id = ui_name[4:] + if prompt_id.startswith("id::"): + prompt_id = prompt_id[4:] return SavedPromptBuilder(task, prompt_id) # Fine-tune prompts are prefixed with "fine_tune_prompt::" - if ui_name.startswith("fine_tune_prompt::"): - fine_tune_id = ui_name[18:] - return FineTunePromptBuilder(task, fine_tune_id) + if prompt_id.startswith("fine_tune_prompt::"): + prompt_id = prompt_id[18:] + return FineTunePromptBuilder(task, prompt_id) + + # Check if the prompt_id matches any enum value + if prompt_id not in [member.value for member in PromptGenerators]: + raise ValueError(f"Unknown prompt generator: {prompt_id}") + typed_prompt_generator = PromptGenerators(prompt_id) - match ui_name: - case "basic": + match typed_prompt_generator: + case PromptGenerators.SIMPLE: return SimplePromptBuilder(task) - case "few_shot": + case PromptGenerators.FEW_SHOT: return FewShotPromptBuilder(task) - case "many_shot": + case PromptGenerators.MULTI_SHOT: return MultiShotPromptBuilder(task) - case "repairs": + case PromptGenerators.REPAIRS: return RepairsPromptBuilder(task) - case "simple_chain_of_thought": + case PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT: return SimpleChainOfThoughtPromptBuilder(task) - case "few_shot_chain_of_thought": + case PromptGenerators.FEW_SHOT_CHAIN_OF_THOUGHT: return FewShotChainOfThoughtPromptBuilder(task) - case "multi_shot_chain_of_thought": + case PromptGenerators.MULTI_SHOT_CHAIN_OF_THOUGHT: return MultiShotChainOfThoughtPromptBuilder(task) case _: - raise ValueError(f"Unknown prompt builder: {ui_name}") + # Type checking will find missing cases + raise_exhaustive_enum_error(typed_prompt_generator) diff --git a/libs/core/kiln_ai/adapters/repair/repair_task.py b/libs/core/kiln_ai/adapters/repair/repair_task.py index 43690935..e140b812 100644 --- a/libs/core/kiln_ai/adapters/repair/repair_task.py +++ b/libs/core/kiln_ai/adapters/repair/repair_task.py @@ -6,7 +6,7 @@ from kiln_ai.adapters.prompt_builders import ( BasePromptBuilder, SavedPromptBuilder, - prompt_builder_registry, + prompt_builder_from_id, ) from kiln_ai.datamodel import Priority, Project, Task, TaskRequirement, TaskRun @@ -49,28 +49,16 @@ def _original_prompt(cls, run: TaskRun, task: Task) -> str: if run.output.source is None or run.output.source.properties is None: raise ValueError("No source properties found") - # Try ID first, then builder name - prompt_id = run.output.source.properties.get("prompt_id", None) + # Get the prompt builder - stored in 2 fields, mutually exclusive + prompt_id = run.output.source.properties.get( + "prompt_id" + ) or run.output.source.properties.get("prompt_builder_name", None) if prompt_id is not None and isinstance(prompt_id, str): - static_prompt_builder = SavedPromptBuilder(task, prompt_id) - return static_prompt_builder.build_prompt(include_json_instructions=False) + prompt_builder = prompt_builder_from_id(prompt_id, task) + if isinstance(prompt_builder, BasePromptBuilder): + return prompt_builder.build_prompt(include_json_instructions=False) - prompt_builder_class: Type[BasePromptBuilder] | None = None - prompt_builder_name = run.output.source.properties.get( - "prompt_builder_name", None - ) - if prompt_builder_name is not None 
and isinstance(prompt_builder_name, str): - prompt_builder_class = prompt_builder_registry.get( - prompt_builder_name, None - ) - if prompt_builder_class is None: - raise ValueError(f"No prompt builder found for name: {prompt_builder_name}") - prompt_builder = prompt_builder_class(task=task) - if not isinstance(prompt_builder, BasePromptBuilder): - raise ValueError( - f"Prompt builder {prompt_builder_name} is not a valid prompt builder" - ) - return prompt_builder.build_prompt(include_json_instructions=False) + raise ValueError(f"Prompt builder '{prompt_id}' is not a valid prompt builder") @classmethod def build_repair_task_input( diff --git a/libs/core/kiln_ai/adapters/test_prompt_builders.py b/libs/core/kiln_ai/adapters/test_prompt_builders.py index 161f3d0c..f792d579 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_builders.py +++ b/libs/core/kiln_ai/adapters/test_prompt_builders.py @@ -1,6 +1,7 @@ import json import pytest +from pydantic import BaseModel, ValidationError from kiln_ai.adapters.model_adapters.base_adapter import AdapterInfo, BaseAdapter from kiln_ai.adapters.model_adapters.test_structured_output import ( @@ -12,12 +13,14 @@ FineTunePromptBuilder, MultiShotChainOfThoughtPromptBuilder, MultiShotPromptBuilder, + PromptGenerators, + PromptId, RepairsPromptBuilder, SavedPromptBuilder, SimpleChainOfThoughtPromptBuilder, SimplePromptBuilder, chain_of_thought_prompt, - prompt_builder_from_ui_name, + prompt_builder_from_id, ) from kiln_ai.adapters.test_prompt_adaptors import build_test_task from kiln_ai.datamodel import ( @@ -320,48 +323,53 @@ def test_prompt_builder_name(): assert RepairsPromptBuilder.prompt_builder_name() == "repairs_prompt_builder" -def test_prompt_builder_from_ui_name(task_with_examples): +def test_prompt_builder_from_id(task_with_examples): task = task_with_examples - assert isinstance(prompt_builder_from_ui_name("basic", task), SimplePromptBuilder) assert isinstance( - prompt_builder_from_ui_name("few_shot", task), FewShotPromptBuilder + prompt_builder_from_id("simple_prompt_builder", task), SimplePromptBuilder ) assert isinstance( - prompt_builder_from_ui_name("many_shot", task), MultiShotPromptBuilder + prompt_builder_from_id("few_shot_prompt_builder", task), + FewShotPromptBuilder, ) assert isinstance( - prompt_builder_from_ui_name("repairs", task), RepairsPromptBuilder + prompt_builder_from_id("multi_shot_prompt_builder", task), + MultiShotPromptBuilder, ) assert isinstance( - prompt_builder_from_ui_name("simple_chain_of_thought", task), + prompt_builder_from_id("repairs_prompt_builder", task), + RepairsPromptBuilder, + ) + assert isinstance( + prompt_builder_from_id("simple_chain_of_thought_prompt_builder", task), SimpleChainOfThoughtPromptBuilder, ) assert isinstance( - prompt_builder_from_ui_name("few_shot_chain_of_thought", task), + prompt_builder_from_id("few_shot_chain_of_thought_prompt_builder", task), FewShotChainOfThoughtPromptBuilder, ) assert isinstance( - prompt_builder_from_ui_name("multi_shot_chain_of_thought", task), + prompt_builder_from_id("multi_shot_chain_of_thought_prompt_builder", task), MultiShotChainOfThoughtPromptBuilder, ) - with pytest.raises(ValueError, match="Unknown prompt builder: invalid_name"): - prompt_builder_from_ui_name("invalid_name", task) + with pytest.raises(ValueError, match="Unknown prompt generator: invalid_name"): + prompt_builder_from_id("invalid_name", task) with pytest.raises(ValueError, match="Prompt ID not found: 123"): - prompt_builder_from_ui_name("id::123", task) + prompt_builder_from_id("id::123", 
task) with pytest.raises( ValueError, match="Invalid fine-tune ID format. Expected 'project_id::task_id::fine_tune_id'", ): - prompt_builder_from_ui_name("fine_tune_prompt::123", task) + prompt_builder_from_id("fine_tune_prompt::123", task) with pytest.raises( ValueError, match="Fine-tune ID not found", ): - prompt_builder_from_ui_name("fine_tune_prompt::123::456::789", task) + prompt_builder_from_id("fine_tune_prompt::123::456::789", task) prompt = Prompt( name="test_prompt_name", @@ -370,7 +378,7 @@ def test_prompt_builder_from_ui_name(task_with_examples): parent=task, ) prompt.save_to_file() - pb = prompt_builder_from_ui_name("id::" + prompt.id, task) + pb = prompt_builder_from_id("id::" + prompt.id, task) assert isinstance(pb, SavedPromptBuilder) assert pb.prompt_id() == prompt.id assert pb.build_prompt(include_json_instructions=False) == "test_prompt" @@ -390,7 +398,7 @@ def test_prompt_builder_from_ui_name(task_with_examples): nested_fine_tune_id = ( task_with_examples.parent.id + "::" + task_with_examples.id + "::" + finetune.id ) - pb = prompt_builder_from_ui_name( + pb = prompt_builder_from_id( "fine_tune_prompt::" + nested_fine_tune_id, task_with_examples, ) @@ -587,3 +595,86 @@ def test_build_prompt_with_json_instructions(tmp_path): assert task.instruction in prompt_with_json for requirement in task.requirements: assert requirement.instruction in prompt_with_json + + +# Test model to validate the PromptId type +class TestModel(BaseModel): + prompt_id: PromptId + + +def test_valid_prompt_generator_names(): + """Test that valid prompt generator names are accepted""" + for generator in PromptGenerators: + model = TestModel(prompt_id=generator.value) + assert model.prompt_id == generator.value + + +def test_valid_saved_prompt_id(): + """Test that valid saved prompt IDs are accepted""" + valid_id = "id::project_123::task_456::prompt_789" + model = TestModel(prompt_id=valid_id) + assert model.prompt_id == valid_id + + +def test_valid_fine_tune_prompt_id(): + """Test that valid fine-tune prompt IDs are accepted""" + valid_id = "fine_tune_prompt::ft_123456" + model = TestModel(prompt_id=valid_id) + assert model.prompt_id == valid_id + + +@pytest.mark.parametrize( + "invalid_id", + [ + pytest.param("id::project_123::task_456", id="missing_prompt_id"), + pytest.param( + "id::project_123::task_456::prompt_789::extra", id="too_many_parts" + ), + pytest.param("id::", id="empty_parts"), + pytest.param("id::project_123", id="too_few_parts"), + ], +) +def test_invalid_saved_prompt_id_format(invalid_id): + """Test that invalid saved prompt ID formats are rejected""" + with pytest.raises(ValidationError, match="Invalid saved prompt ID"): + TestModel(prompt_id=invalid_id) + + +@pytest.mark.parametrize( + "invalid_id,expected_error", + [ + ("fine_tune_prompt::", "Invalid fine-tune prompt ID: fine_tune_prompt::"), + ("fine_tune_prompt", "Invalid prompt ID: fine_tune_prompt"), + ], +) +def test_invalid_fine_tune_prompt_id_format(invalid_id, expected_error): + """Test that invalid fine-tune prompt ID formats are rejected""" + with pytest.raises(ValidationError, match=expected_error): + TestModel(prompt_id=invalid_id) + + +def test_completely_invalid_formats(): + """Test that completely invalid formats are rejected""" + invalid_ids = [ + "", # Empty string + "invalid_format", # Random string + "id:wrong_format", # Almost correct but wrong separator + "fine_tune:wrong_format", # Almost correct but wrong prefix + ":::", # Just separators + ] + + for invalid_id in invalid_ids: + with 
pytest.raises(ValidationError, match="Invalid prompt ID"): + TestModel(prompt_id=invalid_id) + + +def test_prompt_generator_case_sensitivity(): + """Test that prompt generator names are case sensitive""" + # Take first generator and modify its case + first_generator = next(iter(PromptGenerators)).value + wrong_case = first_generator.upper() + if wrong_case == first_generator: + wrong_case = first_generator.lower() + + with pytest.raises(ValidationError): + TestModel(prompt_id=wrong_case) diff --git a/libs/core/kiln_ai/datamodel/__init__.py b/libs/core/kiln_ai/datamodel/__init__.py index 0c276aaa..09a33e51 100644 --- a/libs/core/kiln_ai/datamodel/__init__.py +++ b/libs/core/kiln_ai/datamodel/__init__.py @@ -27,7 +27,7 @@ Finetune, ) from kiln_ai.datamodel.project import Project -from kiln_ai.datamodel.prompt import Prompt +from kiln_ai.datamodel.prompt import BasePrompt, Prompt from kiln_ai.datamodel.task import Task, TaskRequirement from kiln_ai.datamodel.task_output import ( DataSource, @@ -61,6 +61,7 @@ "DatasetSplit", "RequirementRating", "TaskRequirement", + "BasePrompt", "Prompt", "TaskOutputRating", "StructuredOutputMode", diff --git a/libs/core/kiln_ai/datamodel/prompt.py b/libs/core/kiln_ai/datamodel/prompt.py index c4ec7d5e..650712d9 100644 --- a/libs/core/kiln_ai/datamodel/prompt.py +++ b/libs/core/kiln_ai/datamodel/prompt.py @@ -1,14 +1,20 @@ -from pydantic import Field +from pydantic import BaseModel, Field from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel -class Prompt(KilnParentedModel): +class BasePrompt(BaseModel): """ - A prompt for a task. + A prompt for a task. This is the basic data storage format which can be used throughout a project. + + The "Prompt" model name is reserved for the custom prompts parented by a task. """ name: str = NAME_FIELD + generator_id: str | None = Field( + default=None, + description="The id of the generator that created this prompt.", + ) prompt: str = Field( description="The prompt for the task.", min_length=1, @@ -17,3 +23,11 @@ class Prompt(KilnParentedModel): default=None, description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided.", ) + + +class Prompt(KilnParentedModel, BasePrompt): + """ + A prompt for a task. This is the custom prompt parented by a task. + """ + + pass diff --git a/libs/server/kiln_server/prompt_api.py b/libs/server/kiln_server/prompt_api.py index a032ef6c..0b17cbb1 100644 --- a/libs/server/kiln_server/prompt_api.py +++ b/libs/server/kiln_server/prompt_api.py @@ -13,7 +13,6 @@ class PromptCreateRequest(BaseModel): class PromptGenerator(BaseModel): id: str - ui_id: str short_description: str description: str name: str @@ -50,58 +49,52 @@ async def get_prompts(project_id: str, task_id: str) -> PromptResponse: ) +# User friendly descriptions of the prompt generators _prompt_generators = [ PromptGenerator( - id="basic", - ui_id="simple_prompt_builder", + id="simple_prompt_builder", name="Basic (Zero Shot)", short_description="Includes the instructions and requirements from your task definition.", description="A basic prompt generator. It will include the instructions and requirements from your task definition. 
It won't include any examples from your runs (zero-shot).", chain_of_thought=False, ), PromptGenerator( - id="few_shot", - ui_id="few_shot_prompt_builder", + id="few_shot_prompt_builder", name="Few-Shot", short_description="Includes up to 4 examples from your dataset.", description="A multi-shot prompt generator that includes up to 4 examples from your dataset (few-shot). It also includes the instructions and requirements from your task definition.", chain_of_thought=False, ), PromptGenerator( - id="many_shot", - ui_id="multi_shot_prompt_builder", + id="multi_shot_prompt_builder", name="Many-Shot", short_description="Includes up to 25 examples from your dataset.", description="A multi-shot prompt generator that includes up to 25 examples from your dataset (many-shot). It also includes the instructions and requirements from your task definition.", chain_of_thought=False, ), PromptGenerator( - id="repairs", - ui_id="repairs_prompt_builder", + id="repairs_prompt_builder", name="Repair Multi-Shot", short_description="Includes examples from your dataset, including human feedback about mistakes and how to correct them.", description="A multi-shot prompt that will include up to 25 examples from your dataset. This prompt will use repaired examples to show 1) the generated content which had issues, 2) the human feedback about what was incorrect, 3) the corrected and approved content. This gives the LLM examples of common errors to avoid. It also includes the instructions and requirements from your task definition.", chain_of_thought=False, ), PromptGenerator( - id="simple_chain_of_thought", - ui_id="simple_chain_of_thought_prompt_builder", + id="simple_chain_of_thought_prompt_builder", name="Chain of Thought", short_description="Gives the LLM time to 'think' before replying.", description="A chain of thought prompt generator that gives the LLM time to 'think' before replying. It will use the thinking_instruction from your task definition if it exists, or a standard 'step by step' instruction. The result will only include the final answer, not the 'thinking' tokens. The 'thinking' tokens will be available in the data model. 
It also includes the instructions and requirements from your task definition.", chain_of_thought=True, ), PromptGenerator( - id="few_shot_chain_of_thought", - ui_id="few_shot_chain_of_thought_prompt_builder", + id="few_shot_chain_of_thought_prompt_builder", name="Chain of Thought - Few Shot", short_description="Combines our 'Chain of Thought' generator with our 'Few-Shot' generator.", description="Combines our 'Chain of Thought' generator with our 'Few-Shot' generator, for both the thinking and the few shot examples.", chain_of_thought=True, ), PromptGenerator( - id="multi_shot_chain_of_thought", - ui_id="multi_shot_chain_of_thought_prompt_builder", + id="multi_shot_chain_of_thought_prompt_builder", name="Chain of Thought - Many Shot", short_description="Combines our 'Chain of Thought' generator with our 'Many-Shot' generator.", description="Combines our 'Chain of Thought' generator with our 'Many-Shot' generator, for both the thinking and the many shot examples.", diff --git a/libs/server/kiln_server/run_api.py b/libs/server/kiln_server/run_api.py index bd43c157..7c02ae19 100644 --- a/libs/server/kiln_server/run_api.py +++ b/libs/server/kiln_server/run_api.py @@ -5,7 +5,7 @@ from fastapi import FastAPI, HTTPException from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.ml_model_list import ModelProviderName -from kiln_ai.adapters.prompt_builders import prompt_builder_from_ui_name +from kiln_ai.adapters.prompt_builders import prompt_builder_from_id from kiln_ai.datamodel import Task, TaskOutputRating, TaskOutputRatingType, TaskRun from kiln_ai.datamodel.basemodel import ID_TYPE from pydantic import BaseModel, ConfigDict @@ -188,8 +188,8 @@ async def run_task( ) -> TaskRun: task = task_from_id(project_id, task_id) - prompt_builder = prompt_builder_from_ui_name( - request.ui_prompt_method or "basic", + prompt_builder = prompt_builder_from_id( + request.ui_prompt_method or "simple_prompt_builder", task, ) if prompt_builder is None: diff --git a/libs/server/kiln_server/test_prompt_api.py b/libs/server/kiln_server/test_prompt_api.py index 68f62497..a855af92 100644 --- a/libs/server/kiln_server/test_prompt_api.py +++ b/libs/server/kiln_server/test_prompt_api.py @@ -3,7 +3,7 @@ import pytest from fastapi import FastAPI from fastapi.testclient import TestClient -from kiln_ai.adapters.prompt_builders import prompt_builder_registry +from kiln_ai.adapters.prompt_builders import PromptGenerators from kiln_ai.datamodel import Project, Prompt, Task from kiln_server.custom_errors import connect_custom_errors @@ -116,18 +116,22 @@ def test_prompt_generators_content(): from kiln_server.prompt_api import _prompt_generators # Test a few key generators - basic = next(g for g in _prompt_generators if g.id == "basic") + basic = next(g for g in _prompt_generators if g.id == "simple_prompt_builder") assert basic.chain_of_thought is False assert "zero-shot" in basic.description.lower() - cot = next(g for g in _prompt_generators if g.id == "simple_chain_of_thought") + cot = next( + g + for g in _prompt_generators + if g.id == "simple_chain_of_thought_prompt_builder" + ) assert cot.chain_of_thought is True assert "Chain of Thought" in cot.name -# If we fix the TODO about maintaining these in 2 places we can remove this test, but this ensures we don't mess it up until then -def test_all_ui_ids_are_covered(): - generator_keys = prompt_builder_registry.keys() - api_list = [g.ui_id for g in _prompt_generators] +# Check our nice UI list with descriptions covers all our generators +def 
test_all_ids_are_covered(): + generators = [e.value for e in PromptGenerators] + api_list = [g.id for g in _prompt_generators] - assert set(api_list) == set(generator_keys) + assert set(api_list) == set(generators) From 0055af9e0dbe6a95fa97481c664d86a197bcde8e Mon Sep 17 00:00:00 2001 From: scosman Date: Fri, 14 Feb 2025 22:01:46 -0500 Subject: [PATCH 005/102] Add a prompt serialization in the eval config model. I might move this to the EvalRun but working with tests for now. --- .../adapters/model_adapters/base_adapter.py | 7 +- .../test_saving_adapter_results.py | 18 +++ libs/core/kiln_ai/adapters/prompt_builders.py | 70 ++++++++++- .../kiln_ai/adapters/test_prompt_builders.py | 119 ++++++++++++++++++ libs/core/kiln_ai/datamodel/eval.py | 5 + libs/core/kiln_ai/datamodel/task.py | 11 +- .../core/kiln_ai/datamodel/test_eval_model.py | 14 +++ 7 files changed, 241 insertions(+), 3 deletions(-) diff --git a/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py index 9ae8f9a2..e9f7fa32 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py @@ -94,6 +94,7 @@ async def invoke( self, input: Dict | str, input_source: DataSource | None = None, + allow_saving: bool = True, ) -> TaskRun: # validate input if self.input_schema is not None: @@ -128,7 +129,11 @@ async def invoke( run = self.generate_run(input, input_source, parsed_output) # Save the run if configured to do so, and we have a path to save to - if Config.shared().autosave_runs and self.kiln_task.path is not None: + if ( + allow_saving + and Config.shared().autosave_runs + and self.kiln_task.path is not None + ): run.save_to_file() else: # Clear the ID to indicate it's not persisted diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py b/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py index 64a9b6fd..64a36121 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py @@ -178,6 +178,24 @@ async def test_autosave_false(test_task, adapter): assert run.id is None +@pytest.mark.asyncio +async def test_autosave_true_with_disabled(test_task, adapter): + with patch("kiln_ai.utils.config.Config.shared") as mock_shared: + mock_config = mock_shared.return_value + mock_config.autosave_runs = True + mock_config.user_id = "test_user" + + input_data = "Test input" + + run = await adapter.invoke(input_data, allow_saving=False) + + # Check that no runs were saved + assert len(test_task.runs()) == 0 + + # Check that the run ID is not set + assert run.id is None + + @pytest.mark.asyncio async def test_autosave_true(test_task, adapter): with patch("kiln_ai.utils.config.Config.shared") as mock_shared: diff --git a/libs/core/kiln_ai/adapters/prompt_builders.py b/libs/core/kiln_ai/adapters/prompt_builders.py index 62c27b58..9402d3d6 100644 --- a/libs/core/kiln_ai/adapters/prompt_builders.py +++ b/libs/core/kiln_ai/adapters/prompt_builders.py @@ -5,7 +5,7 @@ from pydantic import AfterValidator -from kiln_ai.datamodel import Task, TaskRun +from kiln_ai.datamodel import BasePrompt, Task, TaskRun from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error from kiln_ai.utils.formatting import snake_case @@ -304,6 +304,61 @@ def chain_of_thought_prompt(self) -> str | None: return self.prompt_model.chain_of_thought_instructions +class 
EvalPromptBuilder(BasePromptBuilder): + """A prompt builder that looks up a static prompt in an eval config.""" + + def __init__(self, task: Task, eval_config_prompt_id: str): + parts = eval_config_prompt_id.split("::") + if len(parts) != 5: + raise ValueError( + f"Invalid eval prompt ID: {eval_config_prompt_id}. Expected format: 'eval_prompt::[project_id]::[task_id]::[eval_id]::[eval_config_id]'." + ) + + task_id = parts[2] + if task_id != task.id: + raise ValueError( + f"Eval prompt ID: {eval_config_prompt_id}. Task ID mismatch. Expected: {task.id}, got: {task_id}." + ) + + eval_id = parts[3] + eval = next( + (eval for eval in task.evals(readonly=True) if eval.id == eval_id), + None, + ) + if not eval: + raise ValueError( + f"Eval ID not found: {eval_id} for prompt id {eval_config_prompt_id}" + ) + + eval_config_id = parts[4] + eval_config = next( + ( + eval_config + for eval_config in eval.configs(readonly=True) + if eval_config.id == eval_config_id + ), + None, + ) + if not eval_config: + raise ValueError( + f"Eval config ID not found: {eval_config_id} for prompt id {eval_config_prompt_id}" + ) + + self.prompt_model = eval_config.prompt + self.id = eval_config_prompt_id + + super().__init__(task) + + def prompt_id(self) -> str | None: + return self.id + + def build_base_prompt(self) -> str: + return self.prompt_model.prompt + + def chain_of_thought_prompt(self) -> str | None: + return self.prompt_model.chain_of_thought_instructions + + class FineTunePromptBuilder(BasePromptBuilder): """A prompt builder that looks up a fine-tune prompt.""" @@ -384,6 +439,15 @@ def _check_prompt_id(id: str) -> str: ) return id + if id.startswith("eval_prompt::"): + # check it had a eval_id after the :: -- 'project_id::task_id::eval_id::eval_config_id' + parts = id.split("::") + if len(parts) != 5: + raise ValueError( + f"Invalid eval prompt ID: {id}. Expected format: 'eval_prompt::[project_id]::[task_id]::[eval_id]'." 
+ ) + return id + if id.startswith("fine_tune_prompt::"): # check it had a fine_tune_id after the :: -- 'fine_tune_prompt::fine_tune_id' fine_tune_id = id[18:] @@ -415,6 +479,10 @@ def prompt_builder_from_id(prompt_id: str, task: Task) -> BasePromptBuilder: prompt_id = prompt_id[4:] return SavedPromptBuilder(task, prompt_id) + # Eval prompts are prefixed with "eval_prompt::" + if prompt_id.startswith("eval_prompt::"): + return EvalPromptBuilder(task, prompt_id) + # Fine-tune prompts are prefixed with "fine_tune_prompt::" if prompt_id.startswith("fine_tune_prompt::"): prompt_id = prompt_id[18:] diff --git a/libs/core/kiln_ai/adapters/test_prompt_builders.py b/libs/core/kiln_ai/adapters/test_prompt_builders.py index f792d579..2112b958 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_builders.py +++ b/libs/core/kiln_ai/adapters/test_prompt_builders.py @@ -8,6 +8,7 @@ build_structured_output_test_task, ) from kiln_ai.adapters.prompt_builders import ( + EvalPromptBuilder, FewShotChainOfThoughtPromptBuilder, FewShotPromptBuilder, FineTunePromptBuilder, @@ -35,6 +36,7 @@ TaskOutputRating, TaskRun, ) +from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType def test_simple_prompt_builder(tmp_path): @@ -678,3 +680,120 @@ def test_prompt_generator_case_sensitivity(): with pytest.raises(ValidationError): TestModel(prompt_id=wrong_case) + + +@pytest.fixture +def valid_eval_config_datasource(): + return DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt-4", + "model_provider": "openai", + "adapter_name": "openai_compatible", + }, + ) + + +def test_eval_prompt_builder(tmp_path, valid_eval_config_datasource): + task = build_test_task(tmp_path) + + # Create an eval and eval config + eval = Eval( + name="test_eval", + parent=task, + ) + eval.save_to_file() + + eval_config = EvalConfig( + name="test_eval_config", + parent=eval, + config_type=EvalConfigType.g_eval, + model=valid_eval_config_datasource, + prompt=Prompt( + name="test_prompt", + prompt="test_eval_prompt", + chain_of_thought_instructions="Think carefully", + ), + properties={"g_eval_steps": ["step1", "step2"]}, + ) + eval_config.save_to_file() + + # Construct the eval prompt ID + eval_prompt_id = ( + f"eval_prompt::{task.parent.id}::{task.id}::{eval.id}::{eval_config.id}" + ) + + # Test successful creation, constructor and ID creation + builders = [ + EvalPromptBuilder(task=task, eval_config_prompt_id=eval_prompt_id), + prompt_builder_from_id(eval_prompt_id, task), + ] + + for builder in builders: + assert ( + builder.build_prompt(include_json_instructions=False) == "test_eval_prompt" + ) + assert builder.chain_of_thought_prompt() == "Think carefully" + assert builder.prompt_id() == eval_prompt_id + + # test accessor + + +def test_eval_prompt_builder_validation_errors(tmp_path): + task = build_test_task(tmp_path) + + # Test invalid format + with pytest.raises(ValueError, match="Invalid eval prompt ID"): + EvalPromptBuilder(task=task, eval_config_prompt_id="eval_prompt::wrong::format") + + # Test task ID mismatch + wrong_task_id = f"eval_prompt::{task.parent.id}::wrong_task_id::eval_id::config_id" + with pytest.raises(ValueError, match="Task ID mismatch"): + EvalPromptBuilder(task=task, eval_config_prompt_id=wrong_task_id) + + # Test eval not found + nonexistent_eval = ( + f"eval_prompt::{task.parent.id}::{task.id}::nonexistent_eval::config_id" + ) + with pytest.raises(ValueError, match="Eval ID not found"): + EvalPromptBuilder(task=task, eval_config_prompt_id=nonexistent_eval) + + # Create eval but 
test config not found + eval = Eval( + name="test_eval", + parent=task, + ) + eval.save_to_file() + + nonexistent_config = ( + f"eval_prompt::{task.parent.id}::{task.id}::{eval.id}::nonexistent_config" + ) + with pytest.raises(ValueError, match="Eval config ID not found"): + EvalPromptBuilder(task=task, eval_config_prompt_id=nonexistent_config) + + +@pytest.mark.parametrize( + "valid_id", + [ + "eval_prompt::project_123::task_456::eval_789::config_012", # Valid eval prompt ID + ], +) +def test_valid_eval_prompt_id(valid_id): + """Test that valid eval prompt IDs are accepted""" + model = TestModel(prompt_id=valid_id) + assert model.prompt_id == valid_id + + +@pytest.mark.parametrize( + "invalid_id,expected_error", + [ + ("eval_prompt::", "Invalid eval prompt ID"), + ("eval_prompt::p1::t1", "Invalid eval prompt ID"), + ("eval_prompt::p1::t1::e1", "Invalid eval prompt ID"), + ("eval_prompt::p1::t1::e1::c1::extra", "Invalid eval prompt ID"), + ], +) +def test_invalid_eval_prompt_id_format(invalid_id, expected_error): + """Test that invalid eval prompt ID formats are rejected""" + with pytest.raises(ValidationError, match=expected_error): + TestModel(prompt_id=invalid_id) diff --git a/libs/core/kiln_ai/datamodel/eval.py b/libs/core/kiln_ai/datamodel/eval.py index 8af2b97d..f9408754 100644 --- a/libs/core/kiln_ai/datamodel/eval.py +++ b/libs/core/kiln_ai/datamodel/eval.py @@ -11,6 +11,7 @@ KilnParentedModel, KilnParentModel, ) +from kiln_ai.datamodel.prompt import BasePrompt from kiln_ai.datamodel.task_output import DataSource, DataSourceType if TYPE_CHECKING: @@ -43,6 +44,7 @@ class EvalConfig(KilnParentedModel): default={}, description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.", ) + prompt: BasePrompt = Field(description="The prompt to use for this eval config.") def parent_eval(self) -> "Eval": if self.parent is None or self.parent.__class__.__name__ != "Eval": @@ -97,3 +99,6 @@ def parent_task(self) -> Union["Task", None]: if self.parent is None or self.parent.__class__.__name__ != "Task": return None return self.parent # type: ignore + + def configs(self, readonly: bool = False) -> list[EvalConfig]: + return super().configs(readonly=readonly) # type: ignore diff --git a/libs/core/kiln_ai/datamodel/task.py b/libs/core/kiln_ai/datamodel/task.py index 37a32768..6af3dc4f 100644 --- a/libs/core/kiln_ai/datamodel/task.py +++ b/libs/core/kiln_ai/datamodel/task.py @@ -1,4 +1,4 @@ -from typing import Dict, List +from typing import TYPE_CHECKING, Dict, List, Union from pydantic import BaseModel, Field @@ -18,6 +18,9 @@ from kiln_ai.datamodel.prompt import Prompt from kiln_ai.datamodel.task_run import TaskRun +if TYPE_CHECKING: + from kiln_ai.datamodel.project import Project + class TaskRequirement(BaseModel): """ @@ -95,3 +98,9 @@ def prompts(self, readonly: bool = False) -> list[Prompt]: def evals(self, readonly: bool = False) -> list[Eval]: return super().evals(readonly=readonly) # type: ignore + + # Workaround to return typed parent without importing Task + def parent_project(self) -> Union["Project", None]: + if self.parent is None or self.parent.__class__.__name__ != "Project": + return None + return self.parent # type: ignore diff --git a/libs/core/kiln_ai/datamodel/test_eval_model.py b/libs/core/kiln_ai/datamodel/test_eval_model.py index b374a007..0889dcde 100644 --- a/libs/core/kiln_ai/datamodel/test_eval_model.py +++ b/libs/core/kiln_ai/datamodel/test_eval_model.py @@ -1,5 +1,6 @@ import pytest +from kiln_ai.datamodel 
import BasePrompt from kiln_ai.datamodel.basemodel import KilnParentModel from kiln_ai.datamodel.eval import ( Eval, @@ -41,6 +42,10 @@ def valid_eval_config_data(): "adapter_name": "openai_compatible", }, ), + "prompt": BasePrompt( + name="Test Prompt", + prompt="Test prompt", + ), } @@ -57,6 +62,15 @@ def test_eval_config_valid(valid_eval_config): assert valid_eval_config.model.properties["model_name"] == "gpt-4" assert valid_eval_config.model.properties["model_provider"] == "openai" assert valid_eval_config.model.properties["adapter_name"] == "openai_compatible" + assert valid_eval_config.prompt.name == "Test Prompt" + assert valid_eval_config.prompt.prompt == "Test prompt" + + +def test_eval_config_missing_prompt(valid_eval_config): + with pytest.raises( + ValueError, match="Input should be a valid dictionary or instance of BasePromp" + ): + valid_eval_config.prompt = None def test_eval_config_missing_g_eval_steps(valid_eval_config): From 56f7e083199e28c0d4bd8ac42addba9da82b1a03 Mon Sep 17 00:00:00 2001 From: scosman Date: Fri, 14 Feb 2025 22:18:12 -0500 Subject: [PATCH 006/102] Add in progress eval adaptor, and g_eval implementation --- libs/core/kiln_ai/adapters/eval/base_eval.py | 127 +++++++++++ libs/core/kiln_ai/adapters/eval/g_eval.py | 97 ++++++++ libs/core/kiln_ai/adapters/eval/registry.py | 13 ++ .../kiln_ai/adapters/eval/test_base_eval.py | 212 ++++++++++++++++++ .../core/kiln_ai/adapters/eval/test_g_eval.py | 144 ++++++++++++ 5 files changed, 593 insertions(+) create mode 100644 libs/core/kiln_ai/adapters/eval/base_eval.py create mode 100644 libs/core/kiln_ai/adapters/eval/g_eval.py create mode 100644 libs/core/kiln_ai/adapters/eval/registry.py create mode 100644 libs/core/kiln_ai/adapters/eval/test_base_eval.py create mode 100644 libs/core/kiln_ai/adapters/eval/test_g_eval.py diff --git a/libs/core/kiln_ai/adapters/eval/base_eval.py b/libs/core/kiln_ai/adapters/eval/base_eval.py new file mode 100644 index 00000000..c2be4fbd --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/base_eval.py @@ -0,0 +1,127 @@ +import json +from abc import abstractmethod +from typing import Dict + +from kiln_ai.adapters.adapter_registry import adapter_for_task +from kiln_ai.adapters.ml_model_list import ModelProviderName +from kiln_ai.datamodel.eval import EvalConfig +from kiln_ai.datamodel.json_schema import string_to_json_key, validate_schema +from kiln_ai.datamodel.task import Task, TaskOutputRatingType, TaskRun +from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error + + +class BaseEval: + def __init__(self, eval_config: EvalConfig): + self.eval_config = eval_config + eval = eval_config.parent_eval() + if not eval: + raise ValueError("Eval config must have a parent eval") + self.eval = eval + task = self.eval.parent_task() + if not task: + raise ValueError("Eval must have a parent task") + self.target_task = task + self.score_schema = BaseEval.build_score_schema(task, allow_float_scores=True) + + def model_and_provider(self) -> tuple[str, ModelProviderName]: + model_name = self.eval_config.model.properties.get("model_name") + provider = self.eval_config.model.properties.get("model_provider") + if ( + not model_name + or not provider + or not isinstance(model_name, str) + or not isinstance(provider, str) + or provider not in ModelProviderName.__members__ + ): + raise ValueError( + "Model name and provider must be set in the eval config model properties" + ) + + return model_name, ModelProviderName(provider) + + async def run(self, input: Dict | str) -> Dict[str, int | float | 
str]: + run_adapter = adapter_for_task( + self.target_task, + # TODO: take these from evalRun + "llama_3_1_8b", + ModelProviderName.groq, + ) + + # we don't save by default here. We'll save manually after validating the output + run_output = await run_adapter.invoke(input, allow_saving=False) + + eval_output = await self.run_eval(run_output) + validate_schema(eval_output, self.score_schema) + + return eval_output + + @abstractmethod + # Runs the eval on the given task run and returns a dictionary of scores which should conform to the score schema + async def run_eval(self, task_run: TaskRun) -> Dict[str, int | float | str]: + pass + + @classmethod + def build_score_schema(cls, task: Task, allow_float_scores: bool = False) -> str: + """ + Build a JSON schema for the scoring output of the task requirements + """ + + # Note: python maintains order, which is good as we want the user defined order, and overall last + properties = {} + for requirement in task.requirements: + property_key = string_to_json_key(requirement.name) + if property_key in properties or property_key == "overall_rating": + raise ValueError( + f"Duplicate requirement name: {requirement.name}. Can not be used as unique JSON schema key." + ) + if len(property_key) == 0: + raise ValueError( + f"Invalid requirement name: {requirement.name}. Can not be used as JSON schema key." + ) + property: dict[str, str | int | float | list[str]] = { + "title": requirement.name, + } + match requirement.type: + case TaskOutputRatingType.five_star: + if allow_float_scores: + property["type"] = "number" + else: + property["type"] = "integer" + + property["minimum"] = 1 + property["maximum"] = 5 + property["description"] = ( + f"{requirement.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best." + ) + case TaskOutputRatingType.pass_fail: + property["enum"] = ["pass", "fail"] + property["description"] = ( + f"{requirement.instruction}\n\nThe rating should be either 'pass' or 'fail'." + ) + case TaskOutputRatingType.pass_fail_critical: + property["enum"] = ["pass", "fail", "critical"] + property["description"] = ( + f"{requirement.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure." 
+ ) + case TaskOutputRatingType.custom: + # Skip custom rating types in evals + continue + case _: + raise_exhaustive_enum_error(requirement.type) + + properties[property_key] = property + + properties["overall_rating"] = { + "type": "integer", + "minimum": 1, + "maximum": 5, + "title": "Overall Rating", + "description": "The overall rating for the task output.\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best.", + } + + schema = { + "type": "object", + "properties": properties, + "required": list(properties.keys()), + } + return json.dumps(schema, indent=2, ensure_ascii=False) diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py new file mode 100644 index 00000000..f1b0ef49 --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -0,0 +1,97 @@ +import json +from typing import Dict + +from kiln_ai.adapters.adapter_registry import adapter_for_task +from kiln_ai.adapters.eval.base_eval import BaseEval +from kiln_ai.adapters.prompt_builders import SimpleChainOfThoughtPromptBuilder +from kiln_ai.datamodel import Project, Task, TaskRun +from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType + +# better prompts +# https://github.com/microsoft/promptflow/tree/main/examples/flows/evaluation/eval-summarization + + +class GEvalTask(Task, parent_of={}): + """ + Kiln task for executing a G-Eval. Can be run on any Kiln adapter. + """ + + def __init__(self, eval_config: EvalConfig, target_task: Task): + # This keep the typechecker happy. TODO: shouldn't need this or parent_of above. + tmp_project = Project(name="GEval") + + system_instruction = f""" +Your job to evaluate a model's performance on a task. Blocks will be marked with tags. + +The task the model was given is as follows: + +{eval_config.prompt.prompt} + +""" + # TODO allow over riding of system instruction via config + + # Build the COT eval instructions + cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n" + steps = eval_config.properties["g_eval_steps"] + if not steps or not isinstance(steps, list): + raise ValueError("g_eval_steps must be a list") + for i, step in enumerate(steps): + cot_instructions += f"{i + 1}) {step}\n" + + # We restrict the LLM scoring to integer scores (see later logprob calculation, which requires integer scores) + # However, the overall score we output can be a float. + output_schema = BaseEval.build_score_schema( + target_task, allow_float_scores=False + ) + + super().__init__( + name="GEval Task", + parent=tmp_project, + instruction=system_instruction, + thinking_instruction=cot_instructions, + output_json_schema=output_schema, + ) + + +class GEval(BaseEval): + def __init__(self, eval_config: EvalConfig): + if not eval_config.config_type == EvalConfigType.g_eval: + raise ValueError("GEval must be initialized with a GEval Config") + + super().__init__(eval_config) + + self.geval_task = GEvalTask(eval_config, self.target_task) + + async def run_eval(self, task_run: TaskRun) -> Dict[str, int | float | str]: + """ + Run this G-Eval on the given task run. + """ + + model_name, provider = self.model_and_provider() + # We always use Simple COT for G-Eval + prompt_builder = SimpleChainOfThoughtPromptBuilder(self.geval_task) + + adapter = adapter_for_task( + self.geval_task, + model_name, + provider, + prompt_builder, + ) + + # TODO: does eval see intermediate output? I don't think so, but think about it. 
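+        # The GEvalTask used above sets output_json_schema via
+        # BaseEval.build_score_schema(target_task, allow_float_scores=False), so the
+        # judge model must return one discrete rating per task requirement plus an
+        # "overall_rating" key; that JSON is what gets parsed at the end of this method.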
+ input = f"""The model was given the following input for the task: + +{task_run.input} + + +The model produced the following output for the task: + +{task_run.output} + +""" + + result = await adapter.invoke(input) + + # TODO g_eval logprobs + parsed_output = json.loads(result.output.output) + return parsed_output diff --git a/libs/core/kiln_ai/adapters/eval/registry.py b/libs/core/kiln_ai/adapters/eval/registry.py new file mode 100644 index 00000000..a8b66d96 --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/registry.py @@ -0,0 +1,13 @@ +from kiln_ai.adapters.eval.base_eval import BaseEval +from kiln_ai.adapters.eval.g_eval import GEval +from kiln_ai.datamodel.eval import EvalConfigType +from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error + + +def eval_adapter_from_type(eval_config_type: EvalConfigType) -> type[BaseEval]: + match eval_config_type: + case EvalConfigType.g_eval: + return GEval + case _: + # type checking will catch missing cases + raise_exhaustive_enum_error(eval_config_type) diff --git a/libs/core/kiln_ai/adapters/eval/test_base_eval.py b/libs/core/kiln_ai/adapters/eval/test_base_eval.py new file mode 100644 index 00000000..a6b1ddc9 --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/test_base_eval.py @@ -0,0 +1,212 @@ +import json + +import pytest +from kiln_ai.adapters.eval.base_eval import BaseEval +from kiln_ai.datamodel.task import Task, TaskOutputRatingType, TaskRequirement + + +def test_score_schema_five_star(): + # Create a task with a five-star requirement + task = Task( + name="Test Task", + instruction="Test instruction", + requirements=[ + TaskRequirement( + name="Quality Score", + instruction="Rate the quality", + type=TaskOutputRatingType.five_star, + ) + ], + ) + + schema_str = BaseEval.build_score_schema(task) + schema = json.loads(schema_str) + + # Check basic schema structure + assert schema["type"] == "object" + assert schema["required"] == ["quality_score", "overall_rating"] + + # Check requirement property + req_prop = schema["properties"]["quality_score"] + assert req_prop["type"] == "integer" + assert req_prop["minimum"] == 1 + assert req_prop["maximum"] == 5 + assert "Quality Score" in req_prop["title"] + assert "Rate the quality" in req_prop["description"] + assert "between 1 and 5" in req_prop["description"] + + # Check overall rating property + assert "overall_rating" in schema["properties"] + overall = schema["properties"]["overall_rating"] + assert overall["type"] == "integer" + assert overall["minimum"] == 1 + assert overall["maximum"] == 5 + assert "Overall Rating" in overall["title"] + assert "The overall rating for the task output" in overall["description"] + assert "between 1 and 5" in overall["description"] + + +def test_score_schema_five_star_float(): + # Create a task with a five-star requirement + task = Task( + name="Test Task", + instruction="Test instruction", + requirements=[ + TaskRequirement( + name="Quality Score", + instruction="Rate the quality", + type=TaskOutputRatingType.five_star, + ) + ], + ) + + schema_str = BaseEval.build_score_schema(task, allow_float_scores=True) + schema = json.loads(schema_str) + + # Check basic schema structure + assert schema["type"] == "object" + assert schema["required"] == ["quality_score", "overall_rating"] + + # Check requirement property + req_prop = schema["properties"]["quality_score"] + assert req_prop["type"] == "number" + assert req_prop["minimum"] == 1 + assert req_prop["maximum"] == 5 + assert "Quality Score" in req_prop["title"] + assert "Rate the quality" in 
req_prop["description"] + assert "between 1 and 5" in req_prop["description"] + + # Check overall rating property + assert "overall_rating" in schema["properties"] + overall = schema["properties"]["overall_rating"] + assert overall["type"] == "integer" + assert overall["minimum"] == 1 + assert overall["maximum"] == 5 + assert "Overall Rating" in overall["title"] + assert "The overall rating for the task output" in overall["description"] + assert "between 1 and 5" in overall["description"] + + +def test_score_schema_pass_fail(): + task = Task( + name="Test Task", + instruction="Test instruction", + requirements=[ + TaskRequirement( + name="Pass Fail Test", + instruction="Check if it passes", + type=TaskOutputRatingType.pass_fail, + ) + ], + ) + + schema_str = BaseEval.build_score_schema(task) + schema = json.loads(schema_str) + + req_prop = schema["properties"]["pass_fail_test"] + assert req_prop["enum"] == ["pass", "fail"] + assert "Pass Fail Test" in req_prop["title"] + assert "Check if it passes" in req_prop["description"] + assert "'pass' or 'fail'" in req_prop["description"] + + assert schema["properties"]["overall_rating"] is not None + + +def test_score_schema_pass_fail_critical(): + task = Task( + name="Test Task", + instruction="Test instruction", + requirements=[ + TaskRequirement( + name="Critical Test", + instruction="Check for critical issues", + type=TaskOutputRatingType.pass_fail_critical, + ) + ], + ) + + schema_str = BaseEval.build_score_schema(task) + schema = json.loads(schema_str) + + req_prop = schema["properties"]["critical_test"] + assert "enum" in req_prop + assert req_prop["enum"] == ["pass", "fail", "critical"] + assert "'pass', 'fail', or 'critical'" in req_prop["description"] + + assert schema["properties"]["overall_rating"] is not None + + +def test_score_schema_multiple_requirements(): + task = Task( + name="Test Task", + instruction="Test instruction", + requirements=[ + TaskRequirement( + name="Quality", + instruction="Rate quality", + type=TaskOutputRatingType.five_star, + ), + TaskRequirement( + name="Pass Check", + instruction="Basic pass check", + type=TaskOutputRatingType.pass_fail, + ), + TaskRequirement( + name="Security", + instruction="Check security", + type=TaskOutputRatingType.pass_fail_critical, + ), + ], + ) + + schema_str = BaseEval.build_score_schema(task) + schema = json.loads(schema_str) + + # Verify order is maintained + assert list(schema["properties"].keys()) == [ + "quality", + "pass_check", + "security", + "overall_rating", + ] + + +def test_score_schema_custom_type_skipped(): + task = Task( + name="Test Task", + instruction="Test instruction", + requirements=[ + TaskRequirement( + name="Custom Rating", + instruction="Custom rating", + type=TaskOutputRatingType.custom, + ), + TaskRequirement( + name="Quality", + instruction="Rate quality", + type=TaskOutputRatingType.five_star, + ), + ], + ) + + schema_str = BaseEval.build_score_schema(task) + schema = json.loads(schema_str) + + # Custom type should be skipped + assert len(schema["properties"]) == 2 # one requirement + overall_rating + + # Verify only non-custom requirement and overall_rating are present + props = list(schema["properties"].keys()) + assert "quality" in props + assert "overall_rating" in props + + +def test_score_schema_no_requirements(): + task = Task(name="Test Task", instruction="Test instruction", requirements=[]) + + schema_str = BaseEval.build_score_schema(task) + schema = json.loads(schema_str) + + # Should only have overall_rating + assert 
len(schema["properties"]) == 1 + assert "overall_rating" in schema["properties"] diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval.py b/libs/core/kiln_ai/adapters/eval/test_g_eval.py new file mode 100644 index 00000000..618a7303 --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval.py @@ -0,0 +1,144 @@ +import pytest +from kiln_ai.adapters.eval.g_eval import GEval +from kiln_ai.datamodel import ( + BasePrompt, + DataSource, + DataSourceType, + Project, + Task, + TaskOutput, + TaskOutputRatingType, + TaskRequirement, + TaskRun, +) +from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType + + +@pytest.fixture +def test_task(tmp_path): + project = Project(name="Test Project", path=tmp_path / "project.kiln") + project.save_to_file() + + task = Task( + name="Joke Generator", + instruction="Generate a joke, given a topic", + parent=project, + requirements=[ + TaskRequirement( + name="Topic alignment", + instruction="Rate how aligned the joke is to the provided topic", + type=TaskOutputRatingType.five_star, + ), + TaskRequirement( + name="Appropriateness", + instruction="Check if the content is appropriate for all audiences", + type=TaskOutputRatingType.pass_fail, + ), + ], + ) + task.save_to_file() + return task + + +@pytest.fixture +def test_eval_config(test_task): + eval = Eval(name="Joke Quality Eval", parent=test_task) + eval.save_to_file() + + config = EvalConfig( + name="Llama 8b Joke Generator Eval", + parent=eval, + config_type=EvalConfigType.g_eval, + model=DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt_4o_mini", + "model_provider": "openai", + "adapter_name": "openai_compatible", + }, + ), + prompt=BasePrompt( + # TODO ensure it's called with the frozen prompt + name="Joke Generator Frozen Prompt", + prompt=test_task.instruction, + ), + properties={ + "g_eval_steps": [ + "Is the joke funny?", + "Is the content appropriate for all audiences?", + "Is the joke culturally sensitive?", + "Is the joke politically correct?", + "Is the joke aligned with the provided topic?", + ] + }, + ) + config.save_to_file() + return config + + +@pytest.fixture +def test_task_run(test_task): + task_run = TaskRun( + parent=test_task, + input="Tell me a chicken joke", + input_source=DataSource( + type=DataSourceType.human, properties={"created_by": "test_user"} + ), + output=TaskOutput( + output="Why did the chicken cross the road? 
To get to the other side!", + source=DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "llama_3_1_8b", + "model_provider": "groq", + "adapter_name": "langchain", + }, + ), + ), + ) + task_run.save_to_file() + return task_run + + +@pytest.mark.paid +async def test_run_g_eval(test_task, test_eval_config, test_task_run): + # Create G-Eval instance + g_eval = GEval(test_eval_config) + + # Run the evaluation + eval_result = await g_eval.run_eval(test_task_run) + + # Verify the evaluation results + assert isinstance(eval_result, dict) + assert "topic_alignment" in eval_result + assert isinstance(eval_result["topic_alignment"], int) + assert 1 <= eval_result["topic_alignment"] <= 5 + + assert "appropriateness" in eval_result + assert eval_result["appropriateness"] in ["pass", "fail"] + + assert "overall_rating" in eval_result + assert isinstance(eval_result["overall_rating"], int) + assert 1 <= eval_result["overall_rating"] <= 5 + + +@pytest.mark.paid +async def test_run_g_eval_e2e(test_task, test_eval_config, test_task_run): + # Create G-Eval instance + g_eval = GEval(test_eval_config) + + # Run the evaluation + eval_result = await g_eval.run("chickens") + + # Verify the evaluation results + assert isinstance(eval_result, dict) + assert "topic_alignment" in eval_result + assert isinstance(eval_result["topic_alignment"], int) + assert 1 <= eval_result["topic_alignment"] <= 5 + + assert "appropriateness" in eval_result + assert eval_result["appropriateness"] in ["pass", "fail"] + + assert "overall_rating" in eval_result + assert isinstance(eval_result["overall_rating"], int) + assert 1 <= eval_result["overall_rating"] <= 5 From 8c015f3b7a3d039fdb5e02db55e31a98ddca9c51 Mon Sep 17 00:00:00 2001 From: scosman Date: Sat, 15 Feb 2025 13:29:16 -0500 Subject: [PATCH 007/102] G-evals are working with tests!! I need to re-read the paper to check my math, but this is the right framework. Lots of tests because of all the potential edge cases. 
I've already seen some cool results averaging several values (but t=gpt 4o mini is certain quite a bit) --- .../core/kiln_ai/adapters/adapter_registry.py | 7 +- libs/core/kiln_ai/adapters/eval/base_eval.py | 74 ++++-- libs/core/kiln_ai/adapters/eval/g_eval.py | 220 +++++++++++++++++- .../kiln_ai/adapters/eval/test_base_eval.py | 37 ++- .../core/kiln_ai/adapters/eval/test_g_eval.py | 218 +++++++++++++++-- .../kiln_ai/adapters/eval/test_g_eval_data.py | 4 + .../adapters/model_adapters/base_adapter.py | 21 +- .../model_adapters/langchain_adapters.py | 8 + .../model_adapters/openai_model_adapter.py | 21 +- .../test_saving_adapter_results.py | 9 +- libs/core/kiln_ai/adapters/run_output.py | 3 + 11 files changed, 560 insertions(+), 62 deletions(-) create mode 100644 libs/core/kiln_ai/adapters/eval/test_g_eval_data.py diff --git a/libs/core/kiln_ai/adapters/adapter_registry.py b/libs/core/kiln_ai/adapters/adapter_registry.py index aea617af..508bd4f9 100644 --- a/libs/core/kiln_ai/adapters/adapter_registry.py +++ b/libs/core/kiln_ai/adapters/adapter_registry.py @@ -2,7 +2,7 @@ from kiln_ai import datamodel from kiln_ai.adapters.ml_model_list import ModelProviderName -from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter +from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, BaseAdapter from kiln_ai.adapters.model_adapters.langchain_adapters import LangchainAdapter from kiln_ai.adapters.model_adapters.openai_model_adapter import ( OpenAICompatibleAdapter, @@ -20,6 +20,7 @@ def adapter_for_task( provider: ModelProviderName, prompt_builder: BasePromptBuilder | None = None, tags: list[str] | None = None, + base_adapter_config: AdapterConfig | None = None, ) -> BaseAdapter: # Get the provider to run. For things like the fine-tune provider, we want to run the underlying provider core_provider_name = core_provider(model_name, provider) @@ -42,6 +43,7 @@ def adapter_for_task( ), prompt_builder=prompt_builder, tags=tags, + base_adapter_config=base_adapter_config, ) case ModelProviderName.openai: return OpenAICompatibleAdapter( @@ -53,6 +55,7 @@ def adapter_for_task( ), prompt_builder=prompt_builder, tags=tags, + base_adapter_config=base_adapter_config, ) case ModelProviderName.openai_compatible: config = openai_compatible_config(model_name) @@ -61,6 +64,7 @@ def adapter_for_task( config=config, prompt_builder=prompt_builder, tags=tags, + base_adapter_config=base_adapter_config, ) # Use LangchainAdapter for the rest case ModelProviderName.groq: @@ -90,4 +94,5 @@ def adapter_for_task( provider=provider, prompt_builder=prompt_builder, tags=tags, + base_adapter_config=base_adapter_config, ) diff --git a/libs/core/kiln_ai/adapters/eval/base_eval.py b/libs/core/kiln_ai/adapters/eval/base_eval.py index c2be4fbd..50a1031b 100644 --- a/libs/core/kiln_ai/adapters/eval/base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/base_eval.py @@ -4,6 +4,7 @@ from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.ml_model_list import ModelProviderName +from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig from kiln_ai.datamodel.eval import EvalConfig from kiln_ai.datamodel.json_schema import string_to_json_key, validate_schema from kiln_ai.datamodel.task import Task, TaskOutputRatingType, TaskRun @@ -39,16 +40,17 @@ def model_and_provider(self) -> tuple[str, ModelProviderName]: return model_name, ModelProviderName(provider) - async def run(self, input: Dict | str) -> Dict[str, int | float | str]: + async def run(self, input: Dict | str) -> Dict[str, 
float]: run_adapter = adapter_for_task( self.target_task, # TODO: take these from evalRun "llama_3_1_8b", ModelProviderName.groq, + base_adapter_config=AdapterConfig(allow_saving=False), ) # we don't save by default here. We'll save manually after validating the output - run_output = await run_adapter.invoke(input, allow_saving=False) + run_output = await run_adapter.invoke(input) eval_output = await self.run_eval(run_output) validate_schema(eval_output, self.score_schema) @@ -57,13 +59,18 @@ async def run(self, input: Dict | str) -> Dict[str, int | float | str]: @abstractmethod # Runs the eval on the given task run and returns a dictionary of scores which should conform to the score schema - async def run_eval(self, task_run: TaskRun) -> Dict[str, int | float | str]: + async def run_eval(self, task_run: TaskRun) -> Dict[str, float]: pass @classmethod def build_score_schema(cls, task: Task, allow_float_scores: bool = False) -> str: """ Build a JSON schema for the scoring output of the task requirements + + We allow 2 modes: allow_float_scores=True and allow_float_scores=False. + + allow_float_scores=False is used for the call to the model, and forces the model into selecting into discrete rating options (int 1-5, pass-fail, etc). + allow_float_scores=True is used after we take a g-eval weighting of the model's logprobs. For example, a pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75. """ # Note: python maintains order, which is good as we want the user defined order, and overall last @@ -78,31 +85,47 @@ def build_score_schema(cls, task: Task, allow_float_scores: bool = False) -> str raise ValueError( f"Invalid requirement name: {requirement.name}. Can not be used as JSON schema key." ) - property: dict[str, str | int | float | list[str]] = { + property: dict[str, str | int | float | list[str] | list[int]] = { "title": requirement.name, } match requirement.type: case TaskOutputRatingType.five_star: if allow_float_scores: property["type"] = "number" + property["minimum"] = 1 + property["maximum"] = 5 else: - property["type"] = "integer" + property["enum"] = [1, 2, 3, 4, 5] - property["minimum"] = 1 - property["maximum"] = 5 property["description"] = ( f"{requirement.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best." ) case TaskOutputRatingType.pass_fail: - property["enum"] = ["pass", "fail"] - property["description"] = ( - f"{requirement.instruction}\n\nThe rating should be either 'pass' or 'fail'." - ) + if allow_float_scores: + property["type"] = "number" + property["minimum"] = 0 + property["maximum"] = 1 + property["description"] = ( + f"{requirement.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass." + ) + else: + property["enum"] = ["pass", "fail"] + property["description"] = ( + f"{requirement.instruction}\n\nThe rating should be either 'pass' or 'fail'." + ) case TaskOutputRatingType.pass_fail_critical: - property["enum"] = ["pass", "fail", "critical"] - property["description"] = ( - f"{requirement.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure." - ) + if allow_float_scores: + property["type"] = "number" + property["minimum"] = -1 + property["maximum"] = 1 + property["description"] = ( + f"{requirement.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)." 
+ ) + else: + property["enum"] = ["pass", "fail", "critical"] + property["description"] = ( + f"{requirement.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure." + ) case TaskOutputRatingType.custom: # Skip custom rating types in evals continue @@ -111,13 +134,20 @@ def build_score_schema(cls, task: Task, allow_float_scores: bool = False) -> str properties[property_key] = property - properties["overall_rating"] = { - "type": "integer", - "minimum": 1, - "maximum": 5, - "title": "Overall Rating", - "description": "The overall rating for the task output.\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best.", - } + if allow_float_scores: + properties["overall_rating"] = { + "type": "number", + "minimum": 1, + "maximum": 5, + "title": "Overall Rating", + "description": "The overall rating for the task output.\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best.", + } + else: + properties["overall_rating"] = { + "enum": [1, 2, 3, 4, 5], + "title": "Overall Rating", + "description": "The overall rating for the task output.\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best.", + } schema = { "type": "object", diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py index f1b0ef49..24256de0 100644 --- a/libs/core/kiln_ai/adapters/eval/g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -1,15 +1,30 @@ import json -from typing import Dict +import math +from typing import Dict, List, Tuple from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.eval.base_eval import BaseEval +from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput from kiln_ai.adapters.prompt_builders import SimpleChainOfThoughtPromptBuilder from kiln_ai.datamodel import Project, Task, TaskRun from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType +from openai.types.chat import ChatCompletionTokenLogprob # better prompts # https://github.com/microsoft/promptflow/tree/main/examples/flows/evaluation/eval-summarization +# all the tokens we score for, and their float scores. +TOKEN_TO_SCORE_MAP: Dict[str, float] = { + "1": 1.0, + "2": 2.0, + "3": 3.0, + "4": 4.0, + "5": 5.0, + "pass": 1.0, + "fail": 0.0, + "critical": -1.0, +} + class GEvalTask(Task, parent_of={}): """ @@ -62,7 +77,7 @@ def __init__(self, eval_config: EvalConfig): self.geval_task = GEvalTask(eval_config, self.target_task) - async def run_eval(self, task_run: TaskRun) -> Dict[str, int | float | str]: + async def run_eval(self, task_run: TaskRun) -> Dict[str, float]: """ Run this G-Eval on the given task run. """ @@ -76,6 +91,11 @@ async def run_eval(self, task_run: TaskRun) -> Dict[str, int | float | str]: model_name, provider, prompt_builder, + base_adapter_config=AdapterConfig( + allow_saving=False, + # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely + top_logprobs=10, + ), ) # TODO: does eval see intermediate output? I don't think so, but think about it. 
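For reference, the scoring math the next hunk implements reduces to a probability-weighted average over the candidate rating tokens: each top-logprob entry is converted back to a probability with exp(logprob), multiplied by its numeric score, and the sum is normalized by the total probability of valid rating tokens only. A minimal, self-contained sketch of that weighting (the helper name and the token/logprob pairs are illustrative, and only the 1-5 subset of the patch's TOKEN_TO_SCORE_MAP is shown):

import math
from typing import Dict, List, Tuple

RATING_SCORES: Dict[str, float] = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0}

def weighted_rating(top_logprobs: List[Tuple[str, float]]) -> float:
    # Weight each valid rating token's score by exp(logprob), then normalize over the
    # valid tokens only, so non-rating tokens (quotes, whitespace, etc.) don't skew the result.
    total_score = 0.0
    total_probability = 0.0
    for token, logprob in top_logprobs:
        score = RATING_SCORES.get(token.strip().strip('"').lower())
        if score is None:
            continue
        probability = math.exp(logprob)
        total_score += score * probability
        total_probability += probability
    if total_probability <= 0.0:
        raise RuntimeError("No valid rating tokens found")
    return total_score / total_probability

# A 60%/40% split between "4" and "5" weights to 4 * 0.6 + 5 * 0.4 = 4.4,
# matching the expectation in test_rating_token_to_score later in this patch.
print(weighted_rating([("4", math.log(0.6)), ("5", math.log(0.4))]))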
@@ -90,8 +110,196 @@ async def run_eval(self, task_run: TaskRun) -> Dict[str, int | float | str]: """ - result = await adapter.invoke(input) + # We don't need the run, but invoke_returning_run_output() runs validations for us over _run() + _, run_output = await adapter.invoke_returning_run_output(input) + + return self.build_g_eval_score(run_output) + + def build_g_eval_score(self, run_output: RunOutput) -> Dict[str, float]: + """ + Build the G-Eval score for the given run and run output. + + We create a weighted average of each rating using the logprobs. + + @misc{liu2023gevalnlgevaluationusing, + title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, + author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, + year={2023}, + eprint={2303.16634}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2303.16634}, + } + """ + # We use structured output + outputs = run_output.output + assert isinstance(outputs, dict) + + # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit + raw_output = self.raw_output_from_logprobs(run_output) + + # find the offset the start of each metric in the raw output json + metrics: List[str] = list(outputs.keys()) + metric_offsets = self.metric_offsets(raw_output, metrics) + + final_scores: Dict[str, float] = {} + for metric in metrics: + score = self.g_eval_single_metric( + run_output, metric, metric_offsets, raw_output + ) + if score is None: + raise ValueError(f"No score found for metric: {metric}") + final_scores[metric] = score + + return final_scores + + def g_eval_single_metric( + self, + run_output: RunOutput, + metric: str, + metric_offsets: Dict[str, int], + raw_output: str, + ) -> float | None: + """ + Run the G-Eval for a single metric. + + Scan the logprobs for the metric and return the weighted score of the rating token. + """ + + start_offset, end_offset = self.token_search_range( + raw_output, metric, metric_offsets + ) + + offset = 0 + + if ( + run_output.output_logprobs is None + or run_output.output_logprobs.content is None + ): + raise RuntimeError( + "No logprobs found for output - can not calculate g-eval" + ) + + # scan the tokens in the range, looking for the rating token + for i, chat_logprob in enumerate(run_output.output_logprobs.content): + if offset >= end_offset: + break + if offset >= start_offset: + score = self.rating_token_to_score(chat_logprob) + if score is not None: + return score + offset += len(chat_logprob.token) + + return None + + def raw_output_from_logprobs(self, run_output: RunOutput) -> str: + """ + Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets + """ + if ( + run_output.output_logprobs is None + or run_output.output_logprobs.content is None + ): + raise RuntimeError( + "No logprobs found for output - can not calculate g-eval" + ) + + raw = "" + for chat_logprob in run_output.output_logprobs.content: + raw += chat_logprob.token + return raw + + def token_search_range( + self, raw_output: str, metric: str, metric_offsets: Dict[str, int] + ) -> Tuple[int, int]: + """ + Find the start and end offsets of the metric in the raw output. + + Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score"). 
+ """ + start_offset = metric_offsets[metric] + len(metric) + + # Find the lowest end offset that is greater than the start offset + end_offset = len(raw_output) + for v in list(metric_offsets.values()): + if v < end_offset and v > start_offset: + end_offset = v + + return start_offset, end_offset + + def rating_token_to_score( + self, token_logprob: ChatCompletionTokenLogprob + ) -> float | None: + """ + Convert a rating token to a score using weighted average of top logprobs. + + Only includes tokens that have valid scores. + + Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent. + """ + primary_token_score = self.score_from_token_string(token_logprob.token) + # check this is a real rating token, it could just be the ": ", "," or whitespace + if not primary_token_score: + return None + + total_score = 0.0 + total_probability = 0.0 + + # Process all valid scoring tokens + for top_logprob in token_logprob.top_logprobs: + token_score = self.score_from_token_string(top_logprob.token) + if token_score is not None: + # Convert logprob to probability + probability = math.exp(top_logprob.logprob) + total_score += token_score * probability + total_probability += probability + + if total_probability <= 0.0: + raise RuntimeError( + f"No valid scoring tokens found for {token_logprob.token}. This should never happen. Please file a bug if you see this." + ) + + # Normalize by total probability of valid tokens (LLM may have wanted to generate other non-rating tokens, these shouldn't lower score of rating tokens) + weighted_score = total_score / total_probability + + return weighted_score + + def score_from_token_string(self, token: str) -> float | None: + if token in TOKEN_TO_SCORE_MAP: + return TOKEN_TO_SCORE_MAP[token] + + # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS' + unquoted_token = token.strip().strip('"').lower() + if unquoted_token in TOKEN_TO_SCORE_MAP: + return TOKEN_TO_SCORE_MAP[unquoted_token] + + return None + + def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]: + """ + Find the offset to the start of each metric in the raw output json + + For the example json: `{"overall_rating": 1}` == 1 + + should return: + { + "overall_rating": 1 # it's 1 character into the json string + } + """ + metric_offsets: Dict[str, int] = {} + for metric in metrics: + # the quoted metric name is expected in the json: `{"overall_rating": 1}` == 1 + metric_name = f'"{metric}"' + + # we expect it exactly once + count = raw_output.count(metric_name) + if count != 1: + raise ValueError( + f"Metric {metric} should appear exactly once in the output. 
Found {count} times" + ) - # TODO g_eval logprobs - parsed_output = json.loads(result.output.output) - return parsed_output + offset = raw_output.find(metric_name) + if offset == -1: + raise ValueError(f"Metric {metric} not found in raw output") + metric_offsets[metric] = offset + return metric_offsets diff --git a/libs/core/kiln_ai/adapters/eval/test_base_eval.py b/libs/core/kiln_ai/adapters/eval/test_base_eval.py index a6b1ddc9..7772758d 100644 --- a/libs/core/kiln_ai/adapters/eval/test_base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/test_base_eval.py @@ -26,21 +26,17 @@ def test_score_schema_five_star(): assert schema["type"] == "object" assert schema["required"] == ["quality_score", "overall_rating"] - # Check requirement property + # Check requirement property, and that it's an enum of 1-5 req_prop = schema["properties"]["quality_score"] - assert req_prop["type"] == "integer" - assert req_prop["minimum"] == 1 - assert req_prop["maximum"] == 5 + assert req_prop["enum"] == [1, 2, 3, 4, 5] assert "Quality Score" in req_prop["title"] assert "Rate the quality" in req_prop["description"] assert "between 1 and 5" in req_prop["description"] - # Check overall rating property + # Check overall rating property, and that it's an enum of 1-5 assert "overall_rating" in schema["properties"] overall = schema["properties"]["overall_rating"] - assert overall["type"] == "integer" - assert overall["minimum"] == 1 - assert overall["maximum"] == 5 + assert overall["enum"] == [1, 2, 3, 4, 5] assert "Overall Rating" in overall["title"] assert "The overall rating for the task output" in overall["description"] assert "between 1 and 5" in overall["description"] @@ -79,7 +75,7 @@ def test_score_schema_five_star_float(): # Check overall rating property assert "overall_rating" in schema["properties"] overall = schema["properties"]["overall_rating"] - assert overall["type"] == "integer" + assert overall["type"] == "number" assert overall["minimum"] == 1 assert overall["maximum"] == 5 assert "Overall Rating" in overall["title"] @@ -111,6 +107,19 @@ def test_score_schema_pass_fail(): assert schema["properties"]["overall_rating"] is not None + # Now check that we can allow float scores with the proper float structure + schema_str = BaseEval.build_score_schema(task, allow_float_scores=True) + schema = json.loads(schema_str) + + req_prop = schema["properties"]["pass_fail_test"] + assert req_prop["type"] == "number" + assert req_prop["minimum"] == 0 + assert req_prop["maximum"] == 1 + assert ( + "between 0 and 1, with 0 being a failure and 1 being a pass" + in req_prop["description"] + ) + def test_score_schema_pass_fail_critical(): task = Task( @@ -135,6 +144,16 @@ def test_score_schema_pass_fail_critical(): assert schema["properties"]["overall_rating"] is not None + # Now check that we can allow float scores with the proper float structure + schema_str = BaseEval.build_score_schema(task, allow_float_scores=True) + schema = json.loads(schema_str) + + req_prop = schema["properties"]["critical_test"] + assert req_prop["type"] == "number" + assert req_prop["minimum"] == -1 + assert req_prop["maximum"] == 1 + assert "between -1 and 1, with 1 being a pass" in req_prop["description"] + def test_score_schema_multiple_requirements(): task = Task( diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval.py b/libs/core/kiln_ai/adapters/eval/test_g_eval.py index 618a7303..787bb92a 100644 --- a/libs/core/kiln_ai/adapters/eval/test_g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval.py @@ -1,5 +1,10 @@ +import math 
+import pickle + import pytest -from kiln_ai.adapters.eval.g_eval import GEval +from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP, GEval +from kiln_ai.adapters.eval.test_g_eval_data import serialized_run_output +from kiln_ai.adapters.model_adapters.base_adapter import RunOutput from kiln_ai.datamodel import ( BasePrompt, DataSource, @@ -108,18 +113,20 @@ async def test_run_g_eval(test_task, test_eval_config, test_task_run): # Run the evaluation eval_result = await g_eval.run_eval(test_task_run) - # Verify the evaluation results - assert isinstance(eval_result, dict) assert "topic_alignment" in eval_result - assert isinstance(eval_result["topic_alignment"], int) - assert 1 <= eval_result["topic_alignment"] <= 5 + topic_alignment = eval_result["topic_alignment"] + assert isinstance(topic_alignment, float) + assert 1 <= topic_alignment <= 5 assert "appropriateness" in eval_result - assert eval_result["appropriateness"] in ["pass", "fail"] + appropriateness = eval_result["appropriateness"] + assert isinstance(appropriateness, float) + assert appropriateness >= 0.0 and appropriateness <= 1.0 assert "overall_rating" in eval_result - assert isinstance(eval_result["overall_rating"], int) - assert 1 <= eval_result["overall_rating"] <= 5 + overall = eval_result["overall_rating"] + assert isinstance(overall, float) + assert 1.0 <= overall <= 5.0 @pytest.mark.paid @@ -132,13 +139,198 @@ async def test_run_g_eval_e2e(test_task, test_eval_config, test_task_run): # Verify the evaluation results assert isinstance(eval_result, dict) + assert "topic_alignment" in eval_result - assert isinstance(eval_result["topic_alignment"], int) - assert 1 <= eval_result["topic_alignment"] <= 5 + topic_alignment = eval_result["topic_alignment"] + assert isinstance(topic_alignment, float) + assert 1 <= topic_alignment <= 5 assert "appropriateness" in eval_result - assert eval_result["appropriateness"] in ["pass", "fail"] + appropriateness = eval_result["appropriateness"] + assert isinstance(appropriateness, float) + assert appropriateness >= 0.0 and appropriateness <= 1.0 assert "overall_rating" in eval_result - assert isinstance(eval_result["overall_rating"], int) - assert 1 <= eval_result["overall_rating"] <= 5 + overall = eval_result["overall_rating"] + assert isinstance(overall, float) + assert 1.0 <= overall <= 5.0 + + +async def test_g_eval_logprobs(test_task, test_eval_config, test_task_run): + # Create G-Eval instance + run_output = pickle.loads(serialized_run_output) + assert isinstance(run_output, RunOutput) + assert run_output.output_logprobs is not None + g_eval = GEval(test_eval_config) + result = g_eval.build_g_eval_score(run_output) + + assert "overall_rating" in result + overall = result["overall_rating"] + assert isinstance(overall, float) + assert overall >= 1.0 and overall <= 5.0 + # Confirm weighted value, and confirm the approx isn't why it's passing + assert pytest.approx(overall) == 3.99752802363598 + assert pytest.approx(overall) != 4.0 + + # Check topic_alignment + assert "topic_alignment" in result + topic_alignment = result["topic_alignment"] + assert isinstance(topic_alignment, float) + assert topic_alignment >= 1.0 and topic_alignment <= 5.0 + # Confirm weighted value, and confirm the approx isn't why it's passing + assert pytest.approx(topic_alignment) == 4.999983298485167 + assert pytest.approx(topic_alignment) != 5.0 + + # Check appropriateness + assert "appropriateness" in result + appropriateness = result["appropriateness"] + assert isinstance(appropriateness, float) + assert 
appropriateness >= 0.0 and appropriateness <= 1.0 + # Fail chance so low, we need to specify the precision + assert pytest.approx(appropriateness, 1e-12) == 0.9999999999572222 + assert pytest.approx(appropriateness, 1e-12) != 1.0 + + +def test_token_case(): + # we assume the token is lower case in the logprobs token fuzzy matching code. This will catch if we ever add a token that's not. + for token in TOKEN_TO_SCORE_MAP.keys(): + assert token.lower() == token + + +def test_metric_offsets_and_search_ranges(test_eval_config): + g_eval = GEval(test_eval_config) + raw_output = ( + '{"topic_alignment": 4, "appropriateness": "pass", "overall_rating": 5}' + ) + metrics = ["topic_alignment", "appropriateness", "overall_rating"] + + offsets = g_eval.metric_offsets(raw_output, metrics) + + assert len(offsets) == 3 + assert offsets["topic_alignment"] == 1 # Position after opening { + assert offsets["appropriateness"] == 23 # Position after "appropriateness": + assert offsets["overall_rating"] == 50 # Position after "overall_rating": + + # Test search ranges + + # Test first metric + start, end = g_eval.token_search_range(raw_output, "topic_alignment", offsets) + assert start == 16 # Position after "topic_alignment" + assert end == 23 # Position after "appropriateness" + + # Test middle metric + start, end = g_eval.token_search_range(raw_output, "appropriateness", offsets) + assert start == 38 # Position after "appropriateness" + assert end == 50 # Position after "overall_rating" + + # Test last metric + start, end = g_eval.token_search_range(raw_output, "overall_rating", offsets) + assert start == 64 # Position after "overall_rating" + assert end == len(raw_output) # end of string + + +def test_metric_offsets_invalid(test_eval_config): + g_eval = GEval(test_eval_config) + raw_output = '{"topic_alignment": 4, "topic_alignment": 5}' + metrics = ["topic_alignment"] + + with pytest.raises(ValueError, match="should appear exactly once"): + g_eval.metric_offsets(raw_output, metrics) + + raw_output = '{"something_else": 4}' + with pytest.raises(ValueError, match="should appear exactly once"): + g_eval.metric_offsets(raw_output, metrics) + + +@pytest.mark.parametrize( + "token_string,expected_score", + [ + # Direct matches + ("1", 1.0), + ("5", 5.0), + ("pass", 1.0), + ("fail", 0.0), + ("critical", -1.0), + # Variations with quotes and spacing + ('"1"', 1.0), + (" pass ", 1.0), + ("PASS", 1.0), + ('"FAIL"', 0.0), + ('"pAss"', 1.0), + # Invalid tokens + ("invalid", None), + ("6", None), + ("0", None), + ("", None), + ], +) +def test_score_from_token_string(test_eval_config, token_string, expected_score): + g_eval = GEval(test_eval_config) + assert g_eval.score_from_token_string(token_string) == expected_score + + +def test_raw_output_from_logprobs(test_eval_config): + g_eval = GEval(test_eval_config) + + # Create a minimal RunOutput with some logprobs + class MockLogprob: + def __init__(self, token): + self.token = token + + class MockLogprobs: + def __init__(self): + self.content = [ + MockLogprob('{"'), + MockLogprob("score"), + MockLogprob('": '), + MockLogprob("5"), + MockLogprob("}"), + ] + + run_output = RunOutput( + output={"score": 5}, + output_logprobs=MockLogprobs(), + intermediate_outputs={}, + ) + + raw = g_eval.raw_output_from_logprobs(run_output) + assert raw == '{"score": 5}' + + +def test_rating_token_to_score(test_eval_config): + g_eval = GEval(test_eval_config) + + class MockTopLogprob: + def __init__(self, token, logprob): + self.token = token + self.logprob = logprob + + class 
MockTokenLogprob: + def __init__(self, token, top_logprobs): + self.token = token + self.top_logprobs = [MockTopLogprob(t, lp) for t, lp in top_logprobs] + + # Test single token case + token_logprob = MockTokenLogprob("5", [("5", 0.0)]) # log(1) = 0 + score = g_eval.rating_token_to_score(token_logprob) + assert score == 5.0 + + # Test weighted average case + token_logprob = MockTokenLogprob( + "4", + [ + ("4", math.log(0.6)), # 60% probability + ("5", math.log(0.4)), # 40% probability + ], + ) + score = g_eval.rating_token_to_score(token_logprob) + assert pytest.approx(score) == 4.4 # (4 * 0.6 + 5 * 0.4) + + # Test invalid token + token_logprob = MockTokenLogprob(":", [(":", 0.0)]) + assert g_eval.rating_token_to_score(token_logprob) is None + + # Test no valid scoring tokens + token_logprob = MockTokenLogprob("5", []) + with pytest.raises(RuntimeError, match="No valid scoring tokens found"): + g_eval.rating_token_to_score(token_logprob) diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval_data.py b/libs/core/kiln_ai/adapters/eval/test_g_eval_data.py new file mode 100644 index 00000000..a36bdc49 --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval_data.py @@ -0,0 +1,4 @@ +# Saved a real RunOutput, with real logprobs via: +# po = pickle.dumps(result) +# print(f"\n\nPickled result: \n{po}\n\n") +serialized_run_output = b"\x80\x04\x95\xe8:\x00\x00\x00\x00\x00\x00\x8c\x1bkiln_ai.adapters.run_output\x94\x8c\tRunOutput\x94\x93\x94)\x81\x94}\x94(\x8c\x06output\x94}\x94(\x8c\x0ftopic_alignment\x94K\x05\x8c\x0fappropriateness\x94\x8c\x04pass\x94\x8c\x0eoverall_rating\x94K\x04u\x8c\x14intermediate_outputs\x94}\x94\x8c\x10chain_of_thought\x94X\x08\x06\x00\x001) **Is the joke funny?**\n The joke \"Why did the chicken cross the road? To get to the other side!\" is a classic joke that many consider to be humorous due to its simplicity and unexpected nature. However, as it's a very well-known punchline, some may find it less amusing for being overly familiar. Overall, it can elicit a chuckle, but it may not be considered original or particularly funny by everyone.\n\n2) **Is the content appropriate for all audiences?**\n Yes, the joke is appropriate for all audiences. It does not contain any offensive language or themes, making it suitable for children and adults alike.\n\n3) **Is the joke culturally sensitive?**\n Yes, the joke is culturally sensitive. It does not touch on any potentially sensitive topics or stereotypes. It\xe2\x80\x99s a universal humor that transcends cultural boundaries.\n\n4) **Is the joke politically correct?**\n Yes, the joke is politically correct. It does not make any political statements or discriminatory remarks. It simply presents a light-hearted situation involving a chicken, which is neutral and inoffensive.\n\n5) **Is the joke aligned with the provided topic?**\n Yes, the joke is aligned with the provided topic of a \"chicken joke.\" It directly references a chicken and is structured as a joke, fulfilling the prompt's requirements.\n\nIn summary, while the joke may lack originality, it is appropriate, sensitive, politically correct, and aligns well with the topic. 
The humor level can vary depending on personal taste, but overall, it meets the evaluation criteria.\x94s\x8c\x0foutput_logprobs\x94\x8c!openai.types.chat.chat_completion\x94\x8c\x0eChoiceLogprobs\x94\x93\x94)\x81\x94}\x94(\x8c\x08__dict__\x94}\x94(\x8c\x07content\x94]\x94(\x8c/openai.types.chat.chat_completion_token_logprob\x94\x8c\x1aChatCompletionTokenLogprob\x94\x93\x94)\x81\x94}\x94(h\x15}\x94(\x8c\x05token\x94\x8c\x02{\"\x94\x8c\x05bytes\x94]\x94(K{K\"e\x8c\x07logprob\x94G\xbf5\xfe.\xba\x97\xb1\xde\x8c\x0ctop_logprobs\x94]\x94(h\x19\x8c\nTopLogprob\x94\x93\x94)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02{\"\x94h!]\x94(K{K\"eh#G\xbf5\xfe.\xba\x97\xb1\xdeu\x8c\x12__pydantic_extra__\x94}\x94\x8c\x17__pydantic_fields_set__\x94\x8f\x94(h\x1fh#h!\x90\x8c\x14__pydantic_private__\x94Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02{\n\x94h!]\x94(K{K\neh#G\xc0 \x00,\nJ\x05\xdeuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01{\x94h!]\x94K{ah#G\xc0/\x80,\nJ\x05\xdeuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03{\r\n\x94h!]\x94(K{K\rK\neh#G\xc01@\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03{\n\n\x94h!]\x94(K{K\nK\neh#G\xc03\xc0\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 {\"\x94h!]\x94(K K{K\"eh#G\xc05\x00\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 {\n\x94h!]\x94(K K{K\neh#G\xc06\xe0\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01\n\x94h!]\x94K\nah#G\xc07\xe0\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02{}\x94h!]\x94(K{K}eh#G\xc08 \x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05topic\x94h!]\x94(KtKoKpKiKceh#G\xbfS\x8a+<\x99\xb9Oh$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05topic\x94h!]\x94(KtKoKpKiKceh#G\xbfS\x8a+<\x99\xb9Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07overall\x94h!]\x94(KoKvKeKrKaKlKleh#G\xc0\x1b\x818\xa2\x07\xfd%uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04type\x94h!]\x94(KtKyKpKeeh#G\xc0!\x80\x9c^o\xf7\xe0uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03top\x94h!]\x94(KtKoKpeh#G\xc0-\x00\x9c^o\xf7\xe0uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05theme\x94h!]\x94(KtKhKeKmKeeh#G\xc0.\x00\x9c^o\xf7\xe0uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05total\x94h!]\x94(KtKoKtKaKleh#G\xc00\x00N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06 topic\x94h!]\x94(K 
KtKoKpKiKceh#G\xc00@N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05Topic\x94h!]\x94(KTKoKpKiKceh#G\xc00\xa0N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x0bappropriate\x94h!]\x94(KaKpKpKrKoKpKrKiKaKtKeeh#G\xc00\xa0N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05title\x94h!]\x94(KtKiKtKlKeeh#G\xc00\xc0N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n_alignment\x94h!]\x94(K_KaKlKiKgKnKmKeKnKteh#G\xbe\xc1\x9f\x96D1\x8b\xf2h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n_alignment\x94h!]\x94(K_KaKlKiKgKnKmKeKnKteh#G\xbe\xc1\x9f\x96D1\x8b\xf2uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n alignment\x94h!]\x94(K KaKlKiKgKnKmKeKnKteh#G\xc0+\x00\x00C\x1b\xde\x83uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06_align\x94h!]\x94(K_KaKlKiKgKneh#G\xc0.@\x00C\x1b\xde\x83uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n_ALIGNMENT\x94h!]\x94(K_KAKLKIKGKNKMKEKNKTeh#G\xc0.\x80\x00C\x1b\xde\x83uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\tAlignment\x94h!]\x94(KAKlKiKgKnKmKeKnKteh#G\xc00\xc0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x0b_assignment\x94h!]\x94(K_KaKsKsKiKgKnKmKeKnKteh#G\xc01@\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n Alignment\x94h!]\x94(K KAKlKiKgKnKmKeKnKteh#G\xc01@\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03_al\x94h!]\x94(K_KaKleh#G\xc01\xa0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x0b_similarity\x94h!]\x94(K_KsKiKmKiKlKaKrKiKtKyeh#G\xc01\xe0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07_rating\x94h!]\x94(K_KrKaKtKiKnKgeh#G\xc02 \x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\xe2\x80\x9d:\x94h!]\x94(K\xe2K\x80K\x9dK:eh#G\xc02@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\\\":\x94h!]\x94(K\\K\"K:eh#G\xc03\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02':\x94h!]\x94(K'K:eh#G\xc04 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":\"\x94h!]\x94(K\"K:K\"eh#G\xc04\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02`:\x94h!]\x94(K`K:eh#G\xc05\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06\xe2\x80\x9d\xef\xbc\x9a\x94h!]\x94(K\xe2K\x80K\x9dK\xefK\xbcK\x9aeh#G\xc06`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\xc2\xbb:\x94h!]\x94(K\xc2K\xbbK:eh#G\xc07 
\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03+\":\x94h!]\x94(K+K\"K:eh#G\xc07@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":[\x94h!]\x94(K\"K:K[eh#G\xc07\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x015\x94h!]\x94K5ah#G\xbe\xf1\x93\xc3:x\xd77h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1fjY\x01\x00\x00h!]\x94K5ah#G\xbe\xf1\x93\xc3:x\xd77uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x014\x94h!]\x94K4ah#G\xc0&\x00\x02:l\xe3Xuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01 \x94h!]\x94K ah#G\xc01\xc0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x013\x94h!]\x94K3ah#G\xc07\xc0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02 \x94h!]\x94(K K eh#G\xc08\xa0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01-\x94h!]\x94K-ah#G\xc0; \x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01f\x94h!]\x94Kfah#G\xc0;0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01\t\x94h!]\x94K\tah#G\xc0;0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 \x94h!]\x94(K K K eh#G\xc0;@\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01\"\x94h!]\x94K\"ah#G\xc0;p\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\"\x94h!]\x94(K,K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\"\x94h!]\x94(K,K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01,\x94h!]\x94K,ah#G\xc05\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 ,\"\x94h!]\x94(K K,K\"eh#G\xc06`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03,\"\\\x94h!]\x94(K,K\"K\\eh#G\xc07`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03,\"%\x94h!]\x94(K,K\"K%eh#G\xc07\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03,\",\x94h!]\x94(K,K\"K,eh#G\xc0:\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\n\x94h!]\x94(K,K\neh#G\xc0:\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03,\r\n\x94h!]\x94(K,K\rK\neh#G\xc0< 
\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fj\x8f\x01\x00\x00h!]\x94K\tah#G\xc0=p\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01.\x94h!]\x94K.ah#G\xc0>@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07appropr\x94h!]\x94(KaKpKpKrKoKpKreh#G\xbf\x1d\x1c\xa4[(\x97\x91h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07appropr\x94h!]\x94(KaKpKpKrKoKpKreh#G\xbf\x1d\x1c\xa4[(\x97\x91uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05appro\x94h!]\x94(KaKpKpKrKoeh#G\xc0\"\x80\x0e\x8c\x8a\xbd^uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x0bappropriate\x94h!]\x94(KaKpKpKrKoKpKrKiKaKtKeeh#G\xc0&\x80\x0e\x8c\x8a\xbd^uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\t appropri\x94h!]\x94(K KaKpKpKrKoKpKrKieh#G\xc0*\x80\x0e\x8c\x8a\xbd^uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02in\x94h!]\x94(KiKneh#G\xc00\xe0\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05Appro\x94h!]\x94(KAKpKpKrKoeh#G\xc02\x80\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06 Appro\x94h!]\x94(K KAKpKpKrKoeh#G\xc02\xa0\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07overall\x94h!]\x94(KoKvKeKrKaKlKleh#G\xc02\xe0\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04apro\x94h!]\x94(KaKpKrKoeh#G\xc03\xe0\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\rapproximately\x94h!]\x94(KaKpKpKrKoKxKiKmKaKtKeKlKyeh#G\xc04@\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01i\x94h!]\x94Kiah#G\xbe\xaa~\xe0\xee\xab\x86\xb2h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1fjA\x02\x00\x00h!]\x94Kiah#G\xbe\xaa~\xe0\xee\xab\x86\xb2uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06iation\x94h!]\x94(KiKaKtKiKoKneh#G\xc0.\xc0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03iat\x94h!]\x94(KiKaKteh#G\xc0.\xc0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07ateness\x94h!]\x94(KaKtKeKnKeKsKseh#G\xc00 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04iten\x94h!]\x94(KiKtKeKneh#G\xc00`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04iann\x94h!]\x94(KiKaKnKneh#G\xc01\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\t appropri\x94h!]\x94(K 
KaKpKpKrKoKpKrKieh#G\xc01\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02ri\x94h!]\x94(KrKieh#G\xc01\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06iately\x94h!]\x94(KiKaKtKeKlKyeh#G\xc01\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05laten\x94h!]\x94(KlKaKtKeKneh#G\xc01\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07ateness\x94h!]\x94(KaKtKeKnKeKsKseh#G\xbe\x89\xfcz\xe12u\x9dh$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07ateness\x94h!]\x94(KaKtKeKnKeKsKseh#G\xbe\x89\xfcz\xe12u\x9duh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04aten\x94h!]\x94(KaKtKeKneh#G\xc0/@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05ensen\x94h!]\x94(KeKnKsKeKneh#G\xc05@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04ated\x94h!]\x94(KaKtKeKdeh#G\xc06 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06teness\x94h!]\x94(KtKeKnKeKsKseh#G\xc06@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04ates\x94h!]\x94(KaKtKeKseh#G\xc06`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05eness\x94h!]\x94(KeKnKeKsKseh#G\xc06\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04onen\x94h!]\x94(KoKnKeKneh#G\xc06\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04uten\x94h!]\x94(KuKtKeKneh#G\xc07\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06enness\x94h!]\x94(KeKnKnKeKsKseh#G\xc07\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":\"\x94h!]\x94(K\"K:K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":\"\x94h!]\x94(K\"K:K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\":\"'\x94h!]\x94(K\"K:K\"K'eh#G\xc02\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04 \":\"\x94h!]\x94(K K\"K:K\"eh#G\xc04 
\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06\":\"\",\"\x94h!]\x94(K\"K:K\"K\"K,K\"eh#G\xc04\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\":[\"\x94h!]\x94(K\"K:K[K\"eh#G\xc05\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07<|end|>\x94h!Nh#G\xc05\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\":\"+\x94h!]\x94(K\"K:K\"K+eh#G\xc05\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\":{\"\x94h!]\x94(K\"K:K{K\"eh#G\xc06@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03':'\x94h!]\x94(K'K:K'eh#G\xc06\xf0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\xc07\xf0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04pass\x94h!]\x94(KpKaKsKseh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04pass\x94h!]\x94(KpKaKsKseh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05 pass\x94h!]\x94(K KpKaKsKseh#G\xc03 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04fail\x94h!]\x94(KfKaKiKleh#G\xc07\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03pas\x94h!]\x94(KpKaKseh#G\xc08\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05.pass\x94h!]\x94(K.KpKaKsKseh#G\xc08\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04Pass\x94h!]\x94(KPKaKsKseh#G\xc09\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04PASS\x94h!]\x94(KPKAKSKSeh#G\xc09 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06passed\x94h!]\x94(KpKaKsKsKeKdeh#G\xc09\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05-pass\x94h!]\x94(K-KpKaKsKseh#G\xc09\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06passes\x94h!]\x94(KpKaKsKsKeKseh#G\xc0: \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\",\"\x94h!]\x94(K\"K,K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\",\"\x94h!]\x94(K\"K,K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04 \",\"\x94h!]\x94(K K\"K,K\"eh#G\xc02\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\"\x94h!]\x94(K,K\"eh#G\xc04\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04.\",\"\x94h!]\x94(K.K\"K,K\"eh#G\xc04@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07<|end|>\x94h!Nh#G\xc05\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03','\x94h!]\x94(K'K,K'eh#G\xc06 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\",\"#\x94h!]\x94(K\"K,K\"K#eh#G\xc07 
\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\",\"+\x94h!]\x94(K\"K,K\"K+eh#G\xc07\xf0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05\\\",\\\"\x94h!]\x94(K\\K\"K,K\\K\"eh#G\xc08@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\",\"\\\x94h!]\x94(K\"K,K\"K\\eh#G\xc08\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07overall\x94h!]\x94(KoKvKeKrKaKlKleh#G\xbe\x89\xfcz\xe12u\x9dh$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07overall\x94h!]\x94(KoKvKeKrKaKlKleh#G\xbe\x89\xfcz\xe12u\x9duh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07Overall\x94h!]\x94(KOKvKeKrKaKlKleh#G\xc00\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x08 overall\x94h!]\x94(K KoKvKeKrKaKlKleh#G\xc02@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01c\x94h!]\x94Kcah#G\xc06\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x08overview\x94h!]\x94(KoKvKeKrKvKiKeKweh#G\xc08\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05total\x94h!]\x94(KtKoKtKaKleh#G\xc08@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04over\x94h!]\x94(KoKvKeKreh#G\xc08\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x08 Overall\x94h!]\x94(K KOKvKeKrKaKlKleh#G\xc09 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06\xe6\x95\xb4\xe4\xbd\x93\x94h!]\x94(K\xe6K\x95K\xb4K\xe4K\xbdK\x93eh#G\xc09`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05polit\x94h!]\x94(KpKoKlKiKteh#G\xc0:\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07_rating\x94h!]\x94(K_KrKaKtKiKnKgeh#G\xbe\x94\xfe$\xc4\xceLIh$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07_rating\x94h!]\x94(K_KrKaKtKiKnKgeh#G\xbe\x94\xfe$\xc4\xceLIuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07 rating\x94h!]\x94(K KrKaKtKiKnKgeh#G\xc0/@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06rating\x94h!]\x94(KrKaKtKiKnKgeh#G\xc01\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07 Rating\x94h!]\x94(K KRKaKtKiKnKgeh#G\xc01\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06Rating\x94h!]\x94(KRKaKtKiKnKgeh#G\xc01\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07-rating\x94h!]\x94(K-KrKaKtKiKnKgeh#G\xc01\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07.rating\x94h!]\x94(K.KrKaKtKiKnKgeh#G\xc02\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05_rate\x94h!]\x94(K_KrKaKtKeeh#G\xc03\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\t_rotation\x94h!]\x94(K_KrKoKtKaKtKiKoKneh#G\xc04 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02_r\x94h!]\x94(K_Kreh#G\xc04 
\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\xe2\x80\x9d:\x94h!]\x94(K\xe2K\x80K\x9dK:eh#G\xc04\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\\\":\x94h!]\x94(K\\K\"K:eh#G\xc04\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02':\x94h!]\x94(K'K:eh#G\xc05@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":\"\x94h!]\x94(K\"K:K\"eh#G\xc06\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07<|end|>\x94h!Nh#G\xc06\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06\xe2\x80\x9d\xef\xbc\x9a\x94h!]\x94(K\xe2K\x80K\x9dK\xefK\xbcK\x9aeh#G\xc07\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02`:\x94h!]\x94(K`K:eh#G\xc07\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":[\x94h!]\x94(K\"K:K[eh#G\xc08\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 \":\x94h!]\x94(K K\"K:eh#G\xc08 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1fje\x01\x00\x00h!]\x94K4ah#G\xbfdI\x15\x1e\x7f\x84\xe1h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1fje\x01\x00\x00h!]\x94K4ah#G\xbfdI\x15\x1e\x7f\x84\xe1uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fjs\x01\x00\x00h!]\x94K3ah#G\xc0\x18\x02\x89\x11\x8c\x19~uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fjY\x01\x00\x00h!]\x94K5ah#G\xc0,\x81D\xaaS\xfc\x01uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fjl\x01\x00\x00h!]\x94K ah#G\xc05\x10\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x012\x94h!]\x94K2ah#G\xc070\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fj\x81\x01\x00\x00h!]\x94K-ah#G\xc08\xd0\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\n\n\x94h!]\x94(K\nK\neh#G\xc09\x80\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fh_h!]\x94K\nah#G\xc09\xc0\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02 \x94h!]\x94(K K eh#G\xc09\xf0\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fj\x88\x01\x00\x00h!]\x94Kfah#G\xc0:0\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01}\x94h!]\x94K}ah#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1fj\xf3\x04\x00\x00h!]\x94K}ah#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02 }\x94h!]\x94(K 
K}eh#G\xc01\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\"\x94h!]\x94(K,K\"eh#G\xc05`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02}\n\x94h!]\x94(K}K\neh#G\xc07\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03}\n\n\x94h!]\x94(K}K\nK\neh#G\xc08\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fj\xea\x01\x00\x00h!]\x94K.ah#G\xc0:\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03}\r\n\x94h!]\x94(K}K\rK\neh#G\xc0; \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05}\r\n\r\n\x94h!]\x94(K}K\rK\nK\rK\neh#G\xc0=\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04}\n\n\n\x94h!]\x94(K}K\nK\nK\neh#G\xc0=\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07}\n\n\n\n\n\n\x94h!]\x94(K}K\nK\nK\nK\nK\nK\neh#G\xc0>\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nube\x8c\x07refusal\x94Nuh-}\x94h/\x8f\x94(h\x17j<\x05\x00\x00\x90h1Nubub." diff --git a/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py index e9f7fa32..308be71c 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py @@ -19,6 +19,12 @@ from kiln_ai.utils.config import Config +@dataclass +class AdapterConfig: + allow_saving: bool = True + top_logprobs: int | None = None + + @dataclass class AdapterInfo: adapter_name: str @@ -52,6 +58,7 @@ def __init__( model_provider_name: str, prompt_builder: BasePromptBuilder | None = None, tags: list[str] | None = None, + config: AdapterConfig | None = None, ): self.prompt_builder = prompt_builder or SimplePromptBuilder(kiln_task) self.kiln_task = kiln_task @@ -61,6 +68,7 @@ def __init__( self.model_name = model_name self.model_provider_name = model_provider_name self._model_provider: KilnModelProvider | None = None + self.base_adapter_config = config or AdapterConfig() def model_provider(self) -> KilnModelProvider: """ @@ -94,8 +102,15 @@ async def invoke( self, input: Dict | str, input_source: DataSource | None = None, - allow_saving: bool = True, ) -> TaskRun: + run_output, _ = await self.invoke_returning_run_output(input, input_source) + return run_output + + async def invoke_returning_run_output( + self, + input: Dict | str, + input_source: DataSource | None = None, + ) -> Tuple[TaskRun, RunOutput]: # validate input if self.input_schema is not None: if not isinstance(input, dict): @@ -130,7 +145,7 @@ async def invoke( # Save the run if configured to do so, and we have a path to save to if ( - allow_saving + self.base_adapter_config.allow_saving and Config.shared().autosave_runs and self.kiln_task.path is not None ): @@ -139,7 +154,7 @@ async def invoke( # Clear the ID to indicate it's not persisted run.id = None - return run + return run, run_output def has_structured_output(self) -> bool: return self.output_schema is not None diff --git a/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py b/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py index 3aaa4513..0ebf5dc0 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py +++ 
b/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py @@ -20,6 +20,7 @@ ) from kiln_ai.adapters.model_adapters.base_adapter import ( COT_FINAL_ANSWER_PROMPT, + AdapterConfig, AdapterInfo, BaseAdapter, BasePromptBuilder, @@ -47,6 +48,7 @@ def __init__( provider: str | None = None, prompt_builder: BasePromptBuilder | None = None, tags: list[str] | None = None, + base_adapter_config: AdapterConfig | None = None, ): if custom_model is not None: self._model = custom_model @@ -84,6 +86,7 @@ def __init__( model_provider_name=provider, prompt_builder=prompt_builder, tags=tags, + config=base_adapter_config, ) async def model(self) -> LangChainModelType: @@ -129,6 +132,11 @@ async def model(self) -> LangChainModelType: return self._model async def _run(self, input: Dict | str) -> RunOutput: + if self.base_adapter_config.top_logprobs is not None: + raise ValueError( + "Kiln's Langchain adapter does not support logprobs/top_logprobs. Select a model from an OpenAI compatible provider (openai, openrouter, etc) instead." + ) + provider = self.model_provider() model = await self.model() chain = model diff --git a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py index f66526aa..3a3fd204 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py @@ -12,6 +12,7 @@ from kiln_ai.adapters.ml_model_list import StructuredOutputMode from kiln_ai.adapters.model_adapters.base_adapter import ( COT_FINAL_ANSWER_PROMPT, + AdapterConfig, AdapterInfo, BaseAdapter, BasePromptBuilder, @@ -31,6 +32,7 @@ def __init__( kiln_task: datamodel.Task, prompt_builder: BasePromptBuilder | None = None, tags: list[str] | None = None, + base_adapter_config: AdapterConfig | None = None, ): self.config = config self.client = AsyncOpenAI( @@ -45,6 +47,7 @@ def __init__( model_provider_name=config.provider_name, prompt_builder=prompt_builder, tags=tags, + config=base_adapter_config, ) async def _run(self, input: Dict | str) -> RunOutput: @@ -115,6 +118,8 @@ async def _run(self, input: Dict | str) -> RunOutput: model=provider.provider_options["model"], messages=messages, extra_body=extra_body, + logprobs=self.base_adapter_config.top_logprobs is not None, + top_logprobs=self.base_adapter_config.top_logprobs, **response_format_options, ) @@ -133,6 +138,11 @@ async def _run(self, input: Dict | str) -> RunOutput: ) message = response.choices[0].message + logprobs = response.choices[0].logprobs + + # Check logprobs worked, if requested + if self.base_adapter_config.top_logprobs is not None and logprobs is None: + raise RuntimeError("Logprobs were required, but no logprobs were returned.") # Save reasoning if it exists (OpenRouter specific format) if require_or_reasoning: @@ -164,16 +174,15 @@ async def _run(self, input: Dict | str) -> RunOutput: if not isinstance(response_content, str): raise RuntimeError(f"response is not a string: {response_content}") + # Parse to dict if we have structured output + output: Dict | str = response_content if self.has_structured_output(): - structured_response = parse_json_string(response_content) - return RunOutput( - output=structured_response, - intermediate_outputs=intermediate_outputs, - ) + output = parse_json_string(response_content) return RunOutput( - output=response_content, + output=output, intermediate_outputs=intermediate_outputs, + output_logprobs=logprobs, ) def adapter_info(self) -> AdapterInfo: diff --git 
a/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py b/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py index 64a36121..420e276c 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py @@ -45,7 +45,11 @@ def test_task(tmp_path): @pytest.fixture def adapter(test_task): - return MockAdapter(test_task, model_name="phi_3_5", model_provider_name="ollama") + return MockAdapter( + test_task, + model_name="phi_3_5", + model_provider_name="ollama", + ) def test_save_run_isolation(test_task, adapter): @@ -187,7 +191,8 @@ async def test_autosave_true_with_disabled(test_task, adapter): input_data = "Test input" - run = await adapter.invoke(input_data, allow_saving=False) + adapter.base_adapter_config.allow_saving = False + run = await adapter.invoke(input_data) # Check that no runs were saved assert len(test_task.runs()) == 0 diff --git a/libs/core/kiln_ai/adapters/run_output.py b/libs/core/kiln_ai/adapters/run_output.py index 7c34cae6..e407ac15 100644 --- a/libs/core/kiln_ai/adapters/run_output.py +++ b/libs/core/kiln_ai/adapters/run_output.py @@ -1,8 +1,11 @@ from dataclasses import dataclass from typing import Dict +from openai.types.chat.chat_completion import ChoiceLogprobs + @dataclass class RunOutput: output: Dict | str intermediate_outputs: Dict[str, str] | None + output_logprobs: ChoiceLogprobs | None = None From 41c0e45e518facedd00dd93e3094e97fadf32f6b Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 09:42:23 -0500 Subject: [PATCH 008/102] Add LLM as Judge evaluator --- libs/core/kiln_ai/adapters/eval/g_eval.py | 63 ++++++++++++++++--- libs/core/kiln_ai/adapters/eval/registry.py | 2 + .../core/kiln_ai/adapters/eval/test_g_eval.py | 37 ++++++++++- .../kiln_ai/adapters/test_prompt_builders.py | 2 +- libs/core/kiln_ai/datamodel/eval.py | 14 +++-- .../core/kiln_ai/datamodel/test_eval_model.py | 21 +++---- 6 files changed, 109 insertions(+), 30 deletions(-) diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py index 24256de0..a52cd90c 100644 --- a/libs/core/kiln_ai/adapters/eval/g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -29,6 +29,8 @@ class GEvalTask(Task, parent_of={}): """ Kiln task for executing a G-Eval. Can be run on any Kiln adapter. + + Note G-Eval implements both G-Eval and LLM as Judge as they are very similar. """ def __init__(self, eval_config: EvalConfig, target_task: Task): @@ -47,9 +49,9 @@ def __init__(self, eval_config: EvalConfig, target_task: Task): # Build the COT eval instructions cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n" - steps = eval_config.properties["g_eval_steps"] + steps = eval_config.properties["eval_steps"] if not steps or not isinstance(steps, list): - raise ValueError("g_eval_steps must be a list") + raise ValueError("eval_steps must be a list") for i, step in enumerate(steps): cot_instructions += f"{i + 1}) {step}\n" @@ -69,9 +71,22 @@ def __init__(self, eval_config: EvalConfig, target_task: Task): class GEval(BaseEval): + """ + A evaluator which implements G-Eval and LLM as Judge. + + G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. 
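# Illustrative sketch (not part of the patch): the weighted-average scoring the
# docstring above describes maps each candidate rating token from top_logprobs
# to a float score, weights it by exp(logprob), and normalizes. The names and the
# abbreviated score map below are assumptions for this sketch; the patched
# evaluator keeps its own TOKEN_TO_SCORE_MAP and rating_token_to_score helpers.
import math

TOKEN_SCORES = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0, "pass": 1.0, "fail": 0.0}

def weighted_rating(top_logprobs: list[tuple[str, float]]) -> float | None:
    total_prob = 0.0
    weighted_sum = 0.0
    for token, logprob in top_logprobs:
        score = TOKEN_SCORES.get(token.strip().strip('"').lower())
        if score is None:
            continue  # skip punctuation and other non-rating tokens
        prob = math.exp(logprob)  # convert logprob to probability
        total_prob += prob
        weighted_sum += score * prob
    return weighted_sum / total_prob if total_prob > 0 else None

# 60% "4" and 40% "5" averages to 4.4, matching the weighted-average test above.
assert abs(weighted_rating([("4", math.log(0.6)), ("5", math.log(0.4))]) - 4.4) < 1e-9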
https://arxiv.org/abs/2303.16634 + + LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation. + """ + def __init__(self, eval_config: EvalConfig): - if not eval_config.config_type == EvalConfigType.g_eval: - raise ValueError("GEval must be initialized with a GEval Config") + if ( + eval_config.config_type != EvalConfigType.g_eval + and eval_config.config_type != EvalConfigType.llm_as_judge + ): + raise ValueError( + "GEval must be initialized with a GEval or LLM as Judge Config" + ) super().__init__(eval_config) @@ -86,6 +101,12 @@ async def run_eval(self, task_run: TaskRun) -> Dict[str, float]: # We always use Simple COT for G-Eval prompt_builder = SimpleChainOfThoughtPromptBuilder(self.geval_task) + # Only fetch logprobs for G-Eval + # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely + top_logprobs = ( + 10 if self.eval_config.config_type == EvalConfigType.g_eval else None + ) + adapter = adapter_for_task( self.geval_task, model_name, @@ -93,8 +114,7 @@ async def run_eval(self, task_run: TaskRun) -> Dict[str, float]: prompt_builder, base_adapter_config=AdapterConfig( allow_saving=False, - # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely - top_logprobs=10, + top_logprobs=top_logprobs, ), ) @@ -113,7 +133,26 @@ async def run_eval(self, task_run: TaskRun) -> Dict[str, float]: # We don't need the run, but invoke_returning_run_output() runs validations for us over _run() _, run_output = await adapter.invoke_returning_run_output(input) - return self.build_g_eval_score(run_output) + if self.eval_config.config_type == EvalConfigType.llm_as_judge: + return self.build_llm_as_judge_score(run_output) + else: + return self.build_g_eval_score(run_output) + + def build_llm_as_judge_score(self, run_output: RunOutput) -> Dict[str, float]: + """ + Build the LLM as Judge score for the given run and run output. 
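# Illustrative sketch (not part of the patch): LLM-as-judge scoring takes the
# judge's structured output directly and maps each discrete value onto the same
# float scale, with no logprobs involved. The helper and map names here are
# assumptions; the patched code reuses score_from_token_string / TOKEN_TO_SCORE_MAP.
RATING_SCORES = {"1": 1.0, "2": 2.0, "3": 3.0, "4": 4.0, "5": 5.0,
                 "pass": 1.0, "fail": 0.0, "critical": -1.0}

def judge_scores(output: dict[str, int | str]) -> dict[str, float]:
    scores: dict[str, float] = {}
    for metric, value in output.items():
        score = RATING_SCORES.get(str(value).strip().lower())
        if score is None:
            raise ValueError(f"No score found for metric: {metric}")
        scores[metric] = score
    return scores

# Mirrors the expectations in the LLM-as-judge test later in this patch:
# {"topic_alignment": 5, "appropriateness": "pass", "overall_rating": 4}
# -> {"topic_alignment": 5.0, "appropriateness": 1.0, "overall_rating": 4.0}
print(judge_scores({"topic_alignment": 5, "appropriateness": "pass", "overall_rating": 4}))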
+ """ + # Convert the output format we asked for (discreet values) to our float scores + scores: Dict[str, float] = {} + if not isinstance(run_output.output, dict): + raise ValueError("LLM as Judge output must be a dictionary") + + for metric, score in run_output.output.items(): + token_score = self.score_from_token_string(f"{score}") + if token_score is None: + raise ValueError(f"No score found for metric: {metric}") + scores[metric] = token_score + return scores def build_g_eval_score(self, run_output: RunOutput) -> Dict[str, float]: """ @@ -273,6 +312,16 @@ def score_from_token_string(self, token: str) -> float | None: if unquoted_token in TOKEN_TO_SCORE_MAP: return TOKEN_TO_SCORE_MAP[unquoted_token] + # handle numeric tokens like "1.0" + try: + float_value = float(token) + if float_value.is_integer(): + str_token = str(int(float_value)) + if str_token in TOKEN_TO_SCORE_MAP: + return TOKEN_TO_SCORE_MAP[str_token] + except ValueError: + pass + return None def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]: diff --git a/libs/core/kiln_ai/adapters/eval/registry.py b/libs/core/kiln_ai/adapters/eval/registry.py index a8b66d96..78ed84aa 100644 --- a/libs/core/kiln_ai/adapters/eval/registry.py +++ b/libs/core/kiln_ai/adapters/eval/registry.py @@ -8,6 +8,8 @@ def eval_adapter_from_type(eval_config_type: EvalConfigType) -> type[BaseEval]: match eval_config_type: case EvalConfigType.g_eval: return GEval + case EvalConfigType.llm_as_judge: + return GEval case _: # type checking will catch missing cases raise_exhaustive_enum_error(eval_config_type) diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval.py b/libs/core/kiln_ai/adapters/eval/test_g_eval.py index 787bb92a..04a1fed7 100644 --- a/libs/core/kiln_ai/adapters/eval/test_g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval.py @@ -68,7 +68,7 @@ def test_eval_config(test_task): prompt=test_task.instruction, ), properties={ - "g_eval_steps": [ + "eval_steps": [ "Is the joke funny?", "Is the content appropriate for all audiences?", "Is the joke culturally sensitive?", @@ -105,9 +105,13 @@ def test_task_run(test_task): return task_run +@pytest.mark.parametrize( + "config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge] +) @pytest.mark.paid -async def test_run_g_eval(test_task, test_eval_config, test_task_run): +async def test_run_g_eval(test_task, test_eval_config, test_task_run, config_type): # Create G-Eval instance + test_eval_config.config_type = config_type g_eval = GEval(test_eval_config) # Run the evaluation @@ -129,9 +133,13 @@ async def test_run_g_eval(test_task, test_eval_config, test_task_run): assert 1.0 <= overall <= 5.0 +@pytest.mark.parametrize( + "config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge] +) @pytest.mark.paid -async def test_run_g_eval_e2e(test_task, test_eval_config, test_task_run): +async def test_run_g_eval_e2e(test_task, test_eval_config, test_task_run, config_type): # Create G-Eval instance + test_eval_config.config_type = config_type g_eval = GEval(test_eval_config) # Run the evaluation @@ -191,6 +199,22 @@ async def test_g_eval_logprobs(test_task, test_eval_config, test_task_run): assert pytest.approx(appropriateness, 1e-12) != 1.0 +async def test_llm_as_judge(test_task, test_eval_config, test_task_run): + # Create G-Eval instance, set to LLM as Judge + run_output = pickle.loads(serialized_run_output) + test_eval_config.config_type = EvalConfigType.llm_as_judge + g_eval = GEval(test_eval_config) + + assert isinstance(run_output, RunOutput) + assert 
run_output.output_logprobs is not None + result = g_eval.build_llm_as_judge_score(run_output) + + # unlike g_eval, llm_as_judge returns the main token converted to our float scores + assert result["overall_rating"] == 4.0 + assert result["topic_alignment"] == 5.0 + assert result["appropriateness"] == 1.0 + + def test_token_case(): # we assume the token is lower case in the logprobs token fuzzy matching code. This will catch if we ever add a token that's not. for token in TOKEN_TO_SCORE_MAP.keys(): @@ -257,11 +281,18 @@ def test_metric_offsets_invalid(test_eval_config): ("PASS", 1.0), ('"FAIL"', 0.0), ('"pAss"', 1.0), + ("1.0", 1.0), + ("2.0", 2.0), + ("3.0", 3.0), + ("4.0", 4.0), + ("5.0", 5.0), + ("5.0000", 5.0), # Invalid tokens ("invalid", None), ("6", None), ("0", None), ("", None), + ("4.9999999", None), ], ) def test_score_from_token_string(test_eval_config, token_string, expected_score): diff --git a/libs/core/kiln_ai/adapters/test_prompt_builders.py b/libs/core/kiln_ai/adapters/test_prompt_builders.py index 2112b958..0d800942 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_builders.py +++ b/libs/core/kiln_ai/adapters/test_prompt_builders.py @@ -714,7 +714,7 @@ def test_eval_prompt_builder(tmp_path, valid_eval_config_datasource): prompt="test_eval_prompt", chain_of_thought_instructions="Think carefully", ), - properties={"g_eval_steps": ["step1", "step2"]}, + properties={"eval_steps": ["step1", "step2"]}, ) eval_config.save_to_file() diff --git a/libs/core/kiln_ai/datamodel/eval.py b/libs/core/kiln_ai/datamodel/eval.py index f9408754..4acb5baf 100644 --- a/libs/core/kiln_ai/datamodel/eval.py +++ b/libs/core/kiln_ai/datamodel/eval.py @@ -25,6 +25,7 @@ class EvalState(str, Enum): class EvalConfigType(str, Enum): g_eval = "g_eval" + llm_as_judge = "llm_as_judge" class EvalConfig(KilnParentedModel): @@ -53,13 +54,14 @@ def parent_eval(self) -> "Eval": @model_validator(mode="after") def validate_properties(self) -> Self: - if self.config_type == EvalConfigType.g_eval: - if "g_eval_steps" not in self.properties or not isinstance( - self.properties["g_eval_steps"], list + if ( + self.config_type == EvalConfigType.g_eval + or self.config_type == EvalConfigType.llm_as_judge + ): + if "eval_steps" not in self.properties or not isinstance( + self.properties["eval_steps"], list ): - raise ValueError( - "g_eval_steps is required and must be a list for g_eval" - ) + raise ValueError("eval_steps is required and must be a list for g_eval") return self else: raise ValueError(f"Invalid eval config type: {self.config_type}") diff --git a/libs/core/kiln_ai/datamodel/test_eval_model.py b/libs/core/kiln_ai/datamodel/test_eval_model.py index 0889dcde..a9f5f9bf 100644 --- a/libs/core/kiln_ai/datamodel/test_eval_model.py +++ b/libs/core/kiln_ai/datamodel/test_eval_model.py @@ -23,17 +23,12 @@ def test_eval_state_values(): assert len(EvalState) == 2 -def test_eval_config_type_values(): - assert EvalConfigType.g_eval == "g_eval" - assert len(EvalConfigType) == 1 - - @pytest.fixture def valid_eval_config_data(): return { "name": "Test Config", "config_type": EvalConfigType.g_eval, - "properties": {"g_eval_steps": ["step1", "step2"]}, + "properties": {"eval_steps": ["step1", "step2"]}, "model": DataSource( type=DataSourceType.synthetic, properties={ @@ -57,7 +52,7 @@ def valid_eval_config(valid_eval_config_data): def test_eval_config_valid(valid_eval_config): assert valid_eval_config.name == "Test Config" assert valid_eval_config.config_type == EvalConfigType.g_eval - assert 
valid_eval_config.properties["g_eval_steps"] == ["step1", "step2"] + assert valid_eval_config.properties["eval_steps"] == ["step1", "step2"] assert valid_eval_config.model.type == DataSourceType.synthetic assert valid_eval_config.model.properties["model_name"] == "gpt-4" assert valid_eval_config.model.properties["model_provider"] == "openai" @@ -73,9 +68,9 @@ def test_eval_config_missing_prompt(valid_eval_config): valid_eval_config.prompt = None -def test_eval_config_missing_g_eval_steps(valid_eval_config): +def test_eval_config_missing_eval_steps(valid_eval_config): with pytest.raises( - ValueError, match="g_eval_steps is required and must be a list for g_eval" + ValueError, match="eval_steps is required and must be a list for g_eval" ): valid_eval_config.properties = {} @@ -86,16 +81,16 @@ class InvalidClass: with pytest.raises(ValueError, match="Properties must be JSON serializable"): valid_eval_config.properties = { - "g_eval_steps": [], + "eval_steps": [], "invalid_key": InvalidClass(), } -def test_eval_config_invalid_g_eval_steps_type(valid_eval_config): +def test_eval_config_invalid_eval_steps_type(valid_eval_config): with pytest.raises( - ValueError, match="g_eval_steps is required and must be a list for g_eval" + ValueError, match="eval_steps is required and must be a list for g_eval" ): - valid_eval_config.properties = {"g_eval_steps": "not a list"} + valid_eval_config.properties = {"eval_steps": "not a list"} def test_eval_config_invalid_config_type(valid_eval_config): From 107f598765480de0ed1af087415d5f9f210df97a Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 10:11:55 -0500 Subject: [PATCH 009/102] Add comment --- libs/core/kiln_ai/adapters/eval/registry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/core/kiln_ai/adapters/eval/registry.py b/libs/core/kiln_ai/adapters/eval/registry.py index 78ed84aa..b4b6722e 100644 --- a/libs/core/kiln_ai/adapters/eval/registry.py +++ b/libs/core/kiln_ai/adapters/eval/registry.py @@ -9,6 +9,7 @@ def eval_adapter_from_type(eval_config_type: EvalConfigType) -> type[BaseEval]: case EvalConfigType.g_eval: return GEval case EvalConfigType.llm_as_judge: + # Also implemented by GEval return GEval case _: # type checking will catch missing cases From e5bd88048dab3bee8a820a4bbf662ef8e73cde73 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 10:17:34 -0500 Subject: [PATCH 010/102] Fix python 3.10 issue, and update cursor rules with 3.10+ --- .cursorrules | 1 + libs/core/kiln_ai/adapters/prompt_builders.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.cursorrules b/.cursorrules index 32d21fb8..458a4bd8 100644 --- a/.cursorrules +++ b/.cursorrules @@ -1,3 +1,4 @@ - Always assume pydantic 2 (not pydantic 1) - Always use pytest for tests + - The project supports Python 3.10 and above diff --git a/libs/core/kiln_ai/adapters/prompt_builders.py b/libs/core/kiln_ai/adapters/prompt_builders.py index 9402d3d6..749311fe 100644 --- a/libs/core/kiln_ai/adapters/prompt_builders.py +++ b/libs/core/kiln_ai/adapters/prompt_builders.py @@ -1,6 +1,6 @@ import json from abc import ABCMeta, abstractmethod -from enum import StrEnum +from enum import Enum from typing import Annotated, Dict from pydantic import AfterValidator @@ -397,7 +397,7 @@ def chain_of_thought_prompt(self) -> str | None: # Generators that can take any task and build a prompt -class PromptGenerators(StrEnum): +class PromptGenerators(str, Enum): SIMPLE = "simple_prompt_builder" MULTI_SHOT = "multi_shot_prompt_builder" FEW_SHOT = 
"few_shot_prompt_builder" From d9254431764bd84c9ec60db0be0775ac0f83a67c Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 15:37:36 -0500 Subject: [PATCH 011/102] Remove TODOs --- libs/core/kiln_ai/adapters/eval/g_eval.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py index a52cd90c..789784f6 100644 --- a/libs/core/kiln_ai/adapters/eval/g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -45,7 +45,6 @@ def __init__(self, eval_config: EvalConfig, target_task: Task): {eval_config.prompt.prompt} """ - # TODO allow over riding of system instruction via config # Build the COT eval instructions cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n" @@ -118,7 +117,6 @@ async def run_eval(self, task_run: TaskRun) -> Dict[str, float]: ), ) - # TODO: does eval see intermediate output? I don't think so, but think about it. input = f"""The model was given the following input for the task: {task_run.input} From 76ee204b64e4ba7d9985e3601d48e46d73032914 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 16:10:55 -0500 Subject: [PATCH 012/102] CR feedback --- libs/core/kiln_ai/adapters/eval/base_eval.py | 2 +- libs/core/kiln_ai/adapters/eval/g_eval.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/libs/core/kiln_ai/adapters/eval/base_eval.py b/libs/core/kiln_ai/adapters/eval/base_eval.py index 50a1031b..f28c0387 100644 --- a/libs/core/kiln_ai/adapters/eval/base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/base_eval.py @@ -154,4 +154,4 @@ def build_score_schema(cls, task: Task, allow_float_scores: bool = False) -> str "properties": properties, "required": list(properties.keys()), } - return json.dumps(schema, indent=2, ensure_ascii=False) + return json.dumps(schema, ensure_ascii=False) diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py index 789784f6..7400b509 100644 --- a/libs/core/kiln_ai/adapters/eval/g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -10,9 +10,6 @@ from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType from openai.types.chat import ChatCompletionTokenLogprob -# better prompts -# https://github.com/microsoft/promptflow/tree/main/examples/flows/evaluation/eval-summarization - # all the tokens we score for, and their float scores. TOKEN_TO_SCORE_MAP: Dict[str, float] = { "1": 1.0, From d4fad9eb4bc0cf9521b8516a225ff7c494c80fec Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 16:12:37 -0500 Subject: [PATCH 013/102] Remove unused import --- libs/core/kiln_ai/adapters/eval/g_eval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py index 7400b509..247feaa0 100644 --- a/libs/core/kiln_ai/adapters/eval/g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -1,4 +1,3 @@ -import json import math from typing import Dict, List, Tuple From 4232eb184c1d79c142ffe0a9d4f6dc4fd9cc9c99 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 18:43:57 -0500 Subject: [PATCH 014/102] Big change: - Use our prompt_ids everywhere! 
- Make a new RunConfig which contains all info about running a model - Use RunConfig everywhere --- app/desktop/studio_server/data_gen_api.py | 6 +- app/desktop/studio_server/prompt_api.py | 16 ++-- app/desktop/studio_server/test_prompt_api.py | 28 +++--- app/desktop/studio_server/test_repair_api.py | 2 +- app/web_ui/src/lib/api_schema.d.ts | 14 ++- .../[generator_id]/+page.svelte | 4 +- .../core/kiln_ai/adapters/adapter_registry.py | 12 +-- libs/core/kiln_ai/adapters/eval/g_eval.py | 7 +- .../adapters/model_adapters/base_adapter.py | 49 +++++------ .../model_adapters/langchain_adapters.py | 30 ++++--- .../model_adapters/openai_model_adapter.py | 28 +++--- .../model_adapters/test_base_adapter.py | 28 +++--- .../model_adapters/test_langchain_adapter.py | 16 ++-- .../test_openai_model_adapter.py | 85 ++++++------------- .../test_saving_adapter_results.py | 41 +++++---- .../model_adapters/test_structured_output.py | 60 +++++++------ libs/core/kiln_ai/adapters/prompt_builders.py | 11 --- .../kiln_ai/adapters/repair/repair_task.py | 2 +- .../adapters/repair/test_repair_task.py | 6 +- .../kiln_ai/adapters/test_adapter_registry.py | 11 +-- .../kiln_ai/adapters/test_prompt_adaptors.py | 31 ++++--- .../kiln_ai/adapters/test_prompt_builders.py | 16 +--- libs/core/kiln_ai/datamodel/run_config.py | 75 ++++++++++++++++ libs/core/kiln_ai/datamodel/task_output.py | 4 +- libs/core/kiln_ai/datamodel/test_basemodel.py | 20 ++--- .../core/kiln_ai/datamodel/test_datasource.py | 5 +- .../kiln_ai/datamodel/test_example_models.py | 14 ++- libs/server/kiln_server/run_api.py | 15 +--- libs/server/kiln_server/test_run_api.py | 2 +- 29 files changed, 325 insertions(+), 313 deletions(-) create mode 100644 libs/core/kiln_ai/datamodel/run_config.py diff --git a/app/desktop/studio_server/data_gen_api.py b/app/desktop/studio_server/data_gen_api.py index 2d93b60b..958cabdd 100644 --- a/app/desktop/studio_server/data_gen_api.py +++ b/app/desktop/studio_server/data_gen_api.py @@ -6,7 +6,7 @@ DataGenSampleTask, DataGenSampleTaskInput, ) -from kiln_ai.adapters.prompt_builders import PromptId, prompt_builder_from_id +from kiln_ai.adapters.prompt_builders import PromptId from kiln_ai.datamodel import DataSource, DataSourceType, TaskRun from kiln_server.run_api import model_provider_from_string from kiln_server.task_api import task_from_id @@ -122,8 +122,6 @@ async def save_sample( ) -> TaskRun: task = task_from_id(project_id, task_id) - prompt_builder = prompt_builder_from_id(sample.prompt_method, task) - tags = ["synthetic"] if session_id: tags.append(f"synthetic_session_{session_id}") @@ -132,7 +130,7 @@ async def save_sample( task, model_name=sample.output_model_name, provider=model_provider_from_string(sample.output_provider), - prompt_builder=prompt_builder, + prompt_id=sample.prompt_method, tags=tags, ) diff --git a/app/desktop/studio_server/prompt_api.py b/app/desktop/studio_server/prompt_api.py index 6a494cdb..913e07cd 100644 --- a/app/desktop/studio_server/prompt_api.py +++ b/app/desktop/studio_server/prompt_api.py @@ -1,30 +1,30 @@ from fastapi import FastAPI, HTTPException -from kiln_ai.adapters.prompt_builders import prompt_builder_from_id +from kiln_ai.adapters.prompt_builders import PromptId, prompt_builder_from_id from kiln_server.task_api import task_from_id from pydantic import BaseModel class PromptApiResponse(BaseModel): prompt: str - prompt_builder_name: str - ui_generator_name: str + prompt_id: str def connect_prompt_api(app: FastAPI): - 
@app.get("/api/projects/{project_id}/task/{task_id}/gen_prompt/{prompt_generator}") + @app.get("/api/projects/{project_id}/task/{task_id}/gen_prompt/{prompt_id}") async def generate_prompt( - project_id: str, task_id: str, prompt_generator: str + project_id: str, + task_id: str, + prompt_id: str, ) -> PromptApiResponse: task = task_from_id(project_id, task_id) try: - prompt_builder = prompt_builder_from_id(prompt_generator, task) + prompt_builder = prompt_builder_from_id(prompt_id, task) prompt = prompt_builder.build_prompt_for_ui() except Exception as e: raise HTTPException(status_code=400, detail=str(e)) return PromptApiResponse( prompt=prompt, - prompt_builder_name=prompt_builder.__class__.prompt_builder_name(), - ui_generator_name=prompt_generator, + prompt_id=prompt_id, ) diff --git a/app/desktop/studio_server/test_prompt_api.py b/app/desktop/studio_server/test_prompt_api.py index f9cfcf6c..dc82b5cf 100644 --- a/app/desktop/studio_server/test_prompt_api.py +++ b/app/desktop/studio_server/test_prompt_api.py @@ -20,10 +20,6 @@ def client(): # Mock prompt builder class class MockPromptBuilder(BasePromptBuilder): - @classmethod - def prompt_builder_name(cls): - return "MockPromptBuilder" - def build_base_prompt(self): return "Mock prompt" @@ -54,19 +50,20 @@ def test_generate_prompt_success( client, mock_task, mock_prompt_builder_from_id, mock_task_from_id ): response = client.get( - "/api/projects/project123/task/task456/gen_prompt/mock_generator" + "/api/projects/project123/task/task456/gen_prompt/simple_prompt_builder" ) assert response.status_code == 200 data = response.json() assert data == { "prompt": "Mock prompt for UI", - "prompt_builder_name": "MockPromptBuilder", - "ui_generator_name": "mock_generator", + "prompt_id": "simple_prompt_builder", } mock_task_from_id.assert_called_once_with("project123", "task456") - mock_prompt_builder_from_id.assert_called_once_with("mock_generator", mock_task) + mock_prompt_builder_from_id.assert_called_once_with( + "simple_prompt_builder", mock_task + ) def test_generate_prompt_exception( @@ -75,12 +72,17 @@ def test_generate_prompt_exception( mock_prompt_builder_from_id.side_effect = ValueError("Invalid prompt generator") response = client.get( - "/api/projects/project123/task/task456/gen_prompt/invalid_generator" + "/api/projects/project123/task/task456/gen_prompt/simple_prompt_builder" ) assert response.status_code == 400 - data = response.json() - assert data == {"detail": "Invalid prompt generator"} + assert "Invalid prompt generator" in response.text - mock_task_from_id.assert_called_once_with("project123", "task456") - mock_prompt_builder_from_id.assert_called_once_with("invalid_generator", mock_task) + +def test_generate_prompt_id_format(client, mock_task, mock_task_from_id): + response = client.get( + "/api/projects/project123/task/task456/gen_prompt/invalid_generator_id" + ) + + assert response.status_code == 400 + assert "Unknown prompt generator: invalid_generator_id" in response.text diff --git a/app/desktop/studio_server/test_repair_api.py b/app/desktop/studio_server/test_repair_api.py index 2d0fc8b6..d39eab16 100644 --- a/app/desktop/studio_server/test_repair_api.py +++ b/app/desktop/studio_server/test_repair_api.py @@ -40,7 +40,7 @@ def data_source(): "model_name": "gpt_4o", "model_provider": "openai", "adapter_name": "langchain_adapter", - "prompt_builder_name": "simple_prompt_builder", + "prompt_id": "simple_prompt_builder", }, ) diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index 
f32f1cb3..f88d2343 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -416,7 +416,7 @@ export interface paths { patch?: never; trace?: never; }; - "/api/projects/{project_id}/task/{task_id}/gen_prompt/{prompt_generator}": { + "/api/projects/{project_id}/task/{task_id}/gen_prompt/{prompt_id}": { parameters: { query?: never; header?: never; @@ -424,7 +424,7 @@ export interface paths { cookie?: never; }; /** Generate Prompt */ - get: operations["generate_prompt_api_projects__project_id__task__task_id__gen_prompt__prompt_generator__get"]; + get: operations["generate_prompt_api_projects__project_id__task__task_id__gen_prompt__prompt_id__get"]; put?: never; post?: never; delete?: never; @@ -1315,10 +1315,8 @@ export interface components { PromptApiResponse: { /** Prompt */ prompt: string; - /** Prompt Builder Name */ - prompt_builder_name: string; - /** Ui Generator Name */ - ui_generator_name: string; + /** Prompt Id */ + prompt_id: string; }; /** PromptCreateRequest */ PromptCreateRequest: { @@ -2705,14 +2703,14 @@ export interface operations { }; }; }; - generate_prompt_api_projects__project_id__task__task_id__gen_prompt__prompt_generator__get: { + generate_prompt_api_projects__project_id__task__task_id__gen_prompt__prompt_id__get: { parameters: { query?: never; header?: never; path: { project_id: string; task_id: string; - prompt_generator: string; + prompt_id: string; }; cookie?: never; }; diff --git a/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/generator_details/[generator_id]/+page.svelte b/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/generator_details/[generator_id]/+page.svelte index ee84ebc6..f15e6d4e 100644 --- a/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/generator_details/[generator_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/generator_details/[generator_id]/+page.svelte @@ -32,13 +32,13 @@ try { prompt_loading = true const { data: prompt_response, error: get_error } = await client.GET( - "/api/projects/{project_id}/task/{task_id}/gen_prompt/{prompt_generator}", + "/api/projects/{project_id}/task/{task_id}/gen_prompt/{prompt_id}", { params: { path: { project_id, task_id, - prompt_generator, + prompt_id: prompt_generator, }, }, }, diff --git a/libs/core/kiln_ai/adapters/adapter_registry.py b/libs/core/kiln_ai/adapters/adapter_registry.py index 508bd4f9..60786b51 100644 --- a/libs/core/kiln_ai/adapters/adapter_registry.py +++ b/libs/core/kiln_ai/adapters/adapter_registry.py @@ -8,7 +8,7 @@ OpenAICompatibleAdapter, OpenAICompatibleConfig, ) -from kiln_ai.adapters.prompt_builders import BasePromptBuilder +from kiln_ai.adapters.prompt_builders import PromptId from kiln_ai.adapters.provider_tools import core_provider, openai_compatible_config from kiln_ai.utils.config import Config from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error @@ -18,7 +18,7 @@ def adapter_for_task( kiln_task: datamodel.Task, model_name: str, provider: ModelProviderName, - prompt_builder: BasePromptBuilder | None = None, + prompt_id: PromptId | None = None, tags: list[str] | None = None, base_adapter_config: AdapterConfig | None = None, ) -> BaseAdapter: @@ -41,7 +41,7 @@ def adapter_for_task( "X-Title": "KilnAI", }, ), - prompt_builder=prompt_builder, + prompt_id=prompt_id, tags=tags, base_adapter_config=base_adapter_config, ) @@ -53,7 +53,7 @@ def adapter_for_task( model_name=model_name, provider_name=provider, ), - prompt_builder=prompt_builder, + prompt_id=prompt_id, 
tags=tags, base_adapter_config=base_adapter_config, ) @@ -62,7 +62,7 @@ def adapter_for_task( return OpenAICompatibleAdapter( kiln_task=kiln_task, config=config, - prompt_builder=prompt_builder, + prompt_id=prompt_id, tags=tags, base_adapter_config=base_adapter_config, ) @@ -92,7 +92,7 @@ def adapter_for_task( kiln_task, model_name=model_name, provider=provider, - prompt_builder=prompt_builder, + prompt_id=prompt_id, tags=tags, base_adapter_config=base_adapter_config, ) diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py index 247feaa0..edbf534a 100644 --- a/libs/core/kiln_ai/adapters/eval/g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -4,7 +4,7 @@ from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.eval.base_eval import BaseEval from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput -from kiln_ai.adapters.prompt_builders import SimpleChainOfThoughtPromptBuilder +from kiln_ai.adapters.prompt_builders import PromptGenerators from kiln_ai.datamodel import Project, Task, TaskRun from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType from openai.types.chat import ChatCompletionTokenLogprob @@ -93,8 +93,6 @@ async def run_eval(self, task_run: TaskRun) -> Dict[str, float]: """ model_name, provider = self.model_and_provider() - # We always use Simple COT for G-Eval - prompt_builder = SimpleChainOfThoughtPromptBuilder(self.geval_task) # Only fetch logprobs for G-Eval # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely @@ -106,7 +104,8 @@ async def run_eval(self, task_run: TaskRun) -> Dict[str, float]: self.geval_task, model_name, provider, - prompt_builder, + # We always use Simple COT for G-Eval + prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT, base_adapter_config=AdapterConfig( allow_saving=False, top_logprobs=top_logprobs, diff --git a/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py index 308be71c..133cc13e 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py @@ -5,33 +5,29 @@ from kiln_ai.adapters.ml_model_list import KilnModelProvider, StructuredOutputMode from kiln_ai.adapters.parsers.parser_registry import model_parser_from_id -from kiln_ai.adapters.prompt_builders import BasePromptBuilder, SimplePromptBuilder from kiln_ai.adapters.provider_tools import kiln_model_provider_from from kiln_ai.adapters.run_output import RunOutput from kiln_ai.datamodel import ( DataSource, DataSourceType, - Task, TaskOutput, TaskRun, ) from kiln_ai.datamodel.json_schema import validate_schema +from kiln_ai.datamodel.run_config import RunConfig from kiln_ai.utils.config import Config @dataclass class AdapterConfig: - allow_saving: bool = True - top_logprobs: int | None = None + """ + An adapter config is config options that do NOT impact the output of the model. + For example: if it's saved, of if we request additional data like logprobs. + """ -@dataclass -class AdapterInfo: - adapter_name: str - model_name: str - model_provider: str - prompt_builder_name: str - prompt_id: str | None = None + allow_saving: bool = True + top_logprobs: int | None = None COT_FINAL_ANSWER_PROMPT = "Considering the above, return a final result." 
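
For context on the split above: AdapterConfig now carries only options that never change the model's answer (whether the run is saved, whether logprobs are requested), while everything that does shape the output (task, model, provider, prompt) travels in the RunConfig this patch introduces. A minimal sketch of the intended call pattern, assuming only the adapter_for_task signature and AdapterConfig fields shown in this series; the task object is assumed to exist already and is illustrative, not part of the patch:

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
from kiln_ai.adapters.prompt_builders import PromptGenerators

# Run-time options that do not affect the model's output: skip autosaving and
# request logprobs (as the G-Eval change above does for scoring).
eval_adapter_config = AdapterConfig(allow_saving=False, top_logprobs=10)

# Output-affecting options (model, provider, prompt) are passed separately and
# end up on the adapter's RunConfig.
adapter = adapter_for_task(
    task,  # an existing kiln_ai.datamodel.Task, assumed to be loaded elsewhere
    model_name="gpt_4o_mini",
    provider=ModelProviderName.openai,
    prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT,
    base_adapter_config=eval_adapter_config,
)
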
@@ -53,21 +49,21 @@ class BaseAdapter(metaclass=ABCMeta): def __init__( self, - kiln_task: Task, - model_name: str, - model_provider_name: str, - prompt_builder: BasePromptBuilder | None = None, + run_config: RunConfig, tags: list[str] | None = None, config: AdapterConfig | None = None, ): - self.prompt_builder = prompt_builder or SimplePromptBuilder(kiln_task) - self.kiln_task = kiln_task + self.run_config = run_config + # TODO: remove these? Use run_config directly? + self.prompt_builder = run_config.prompt_builder() + self.kiln_task = run_config.task + self.model_name = run_config.model_name + self.model_provider_name = run_config.model_provider_name + self._model_provider: KilnModelProvider | None = None + self.output_schema = self.kiln_task.output_json_schema self.input_schema = self.kiln_task.input_json_schema self.default_tags = tags - self.model_name = model_name - self.model_provider_name = model_provider_name - self._model_provider: KilnModelProvider | None = None self.base_adapter_config = config or AdapterConfig() def model_provider(self) -> KilnModelProvider: @@ -160,7 +156,7 @@ def has_structured_output(self) -> bool: return self.output_schema is not None @abstractmethod - def adapter_info(self) -> AdapterInfo: + def adapter_name(self) -> str: pass @abstractmethod @@ -244,12 +240,9 @@ def _properties_for_task_output(self) -> Dict[str, str | int | float]: props = {} # adapter info - adapter_info = self.adapter_info() - props["adapter_name"] = adapter_info.adapter_name - props["model_name"] = adapter_info.model_name - props["model_provider"] = adapter_info.model_provider - props["prompt_builder_name"] = adapter_info.prompt_builder_name - if adapter_info.prompt_id is not None: - props["prompt_id"] = adapter_info.prompt_id + props["adapter_name"] = self.adapter_name() + props["model_name"] = self.run_config.model_name + props["model_provider"] = self.run_config.model_provider_name + props["prompt_id"] = self.run_config.prompt_id return props diff --git a/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py b/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py index 0ebf5dc0..271855ee 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py +++ b/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py @@ -21,9 +21,7 @@ from kiln_ai.adapters.model_adapters.base_adapter import ( COT_FINAL_ANSWER_PROMPT, AdapterConfig, - AdapterInfo, BaseAdapter, - BasePromptBuilder, RunOutput, ) from kiln_ai.adapters.ollama_tools import ( @@ -31,6 +29,10 @@ ollama_base_url, ollama_model_installed, ) +from kiln_ai.adapters.prompt_builders import ( + PromptId, +) +from kiln_ai.datamodel.run_config import RunConfig from kiln_ai.utils.config import Config from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error @@ -46,7 +48,7 @@ def __init__( custom_model: BaseChatModel | None = None, model_name: str | None = None, provider: str | None = None, - prompt_builder: BasePromptBuilder | None = None, + prompt_id: PromptId | None = None, tags: list[str] | None = None, base_adapter_config: AdapterConfig | None = None, ): @@ -80,11 +82,17 @@ def __init__( if model_name is None: raise ValueError("model_name must be provided") - super().__init__( - kiln_task, + run_config = RunConfig( + task=kiln_task, model_name=model_name, model_provider_name=provider, - prompt_builder=prompt_builder, + ) + + if prompt_id is not None: + run_config.prompt_id = prompt_id + + super().__init__( + run_config=run_config, tags=tags, config=base_adapter_config, ) @@ -199,14 
+207,8 @@ async def _run(self, input: Dict | str) -> RunOutput: intermediate_outputs=intermediate_outputs, ) - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - model_name=self.model_name, - model_provider=self.model_provider_name, - adapter_name="kiln_langchain_adapter", - prompt_builder_name=self.prompt_builder.__class__.prompt_builder_name(), - prompt_id=self.prompt_builder.prompt_id(), - ) + def adapter_name(self) -> str: + return "kiln_langchain_adapter" def _munge_response(self, response: Dict) -> Dict: # Mistral Large tool calling format is a bit different. Convert to standard format. diff --git a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py index 3a3fd204..6e63423d 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py @@ -13,15 +13,15 @@ from kiln_ai.adapters.model_adapters.base_adapter import ( COT_FINAL_ANSWER_PROMPT, AdapterConfig, - AdapterInfo, BaseAdapter, - BasePromptBuilder, RunOutput, ) from kiln_ai.adapters.model_adapters.openai_compatible_config import ( OpenAICompatibleConfig, ) from kiln_ai.adapters.parsers.json_parser import parse_json_string +from kiln_ai.adapters.prompt_builders import PromptId +from kiln_ai.datamodel.run_config import RunConfig from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error @@ -30,7 +30,7 @@ def __init__( self, config: OpenAICompatibleConfig, kiln_task: datamodel.Task, - prompt_builder: BasePromptBuilder | None = None, + prompt_id: PromptId | None = None, tags: list[str] | None = None, base_adapter_config: AdapterConfig | None = None, ): @@ -41,11 +41,17 @@ def __init__( default_headers=config.default_headers, ) - super().__init__( - kiln_task, + run_config = RunConfig( + task=kiln_task, model_name=config.model_name, model_provider_name=config.provider_name, - prompt_builder=prompt_builder, + ) + + if prompt_id is not None: + run_config.prompt_id = prompt_id + + super().__init__( + run_config=run_config, tags=tags, config=base_adapter_config, ) @@ -185,14 +191,8 @@ async def _run(self, input: Dict | str) -> RunOutput: output_logprobs=logprobs, ) - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - model_name=self.model_name, - model_provider=self.model_provider_name, - adapter_name="kiln_openai_compatible_adapter", - prompt_builder_name=self.prompt_builder.__class__.prompt_builder_name(), - prompt_id=self.prompt_builder.prompt_id(), - ) + def adapter_name(self) -> str: + return "kiln_openai_compatible_adapter" async def response_format_options(self) -> dict[str, Any]: # Unstructured if task isn't structured diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py index c80c409a..a9d67365 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py @@ -3,8 +3,9 @@ import pytest from kiln_ai.adapters.ml_model_list import KilnModelProvider, StructuredOutputMode -from kiln_ai.adapters.model_adapters.base_adapter import AdapterInfo, BaseAdapter +from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter from kiln_ai.datamodel import Task +from kiln_ai.datamodel.run_config import RunConfig class MockAdapter(BaseAdapter): @@ -13,13 +14,8 @@ class MockAdapter(BaseAdapter): async def _run(self, input): return None - def adapter_info(self) -> 
AdapterInfo: - return AdapterInfo( - adapter_name="test", - model_name=self.model_name, - model_provider=self.model_provider_name, - prompt_builder_name="test", - ) + def adapter_name(self) -> str: + return "test" @pytest.fixture @@ -37,9 +33,11 @@ def base_task(): @pytest.fixture def adapter(base_task): return MockAdapter( - kiln_task=base_task, - model_name="test_model", - model_provider_name="test_provider", + run_config=RunConfig( + task=base_task, + model_name="test_model", + model_provider_name="test_provider", + ), ) @@ -85,7 +83,9 @@ async def test_model_provider_missing_names(base_task): """Test error when model or provider name is missing""" # Test with missing model name adapter = MockAdapter( - kiln_task=base_task, model_name="", model_provider_name="test_provider" + run_config=RunConfig( + task=base_task, model_name="", model_provider_name="test_provider" + ), ) with pytest.raises( ValueError, match="model_name and model_provider_name must be provided" @@ -94,7 +94,9 @@ async def test_model_provider_missing_names(base_task): # Test with missing provider name adapter = MockAdapter( - kiln_task=base_task, model_name="test_model", model_provider_name="" + run_config=RunConfig( + task=base_task, model_name="test_model", model_provider_name="" + ), ) with pytest.raises( ValueError, match="model_name and model_provider_name must be provided" diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_langchain_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/test_langchain_adapter.py index 272f0f88..72519e8c 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_langchain_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_langchain_adapter.py @@ -18,8 +18,8 @@ LangchainAdapter, langchain_model_from_provider, ) -from kiln_ai.adapters.prompt_builders import SimpleChainOfThoughtPromptBuilder from kiln_ai.adapters.test_prompt_adaptors import build_test_task +from kiln_ai.datamodel.run_config import RunConfig @pytest.fixture @@ -56,9 +56,8 @@ def test_langchain_adapter_infer_model_name(tmp_path): lca = LangchainAdapter(kiln_task=task, custom_model=custom) - model_info = lca.adapter_info() - assert model_info.model_name == "custom.langchain:llama-3.1-8b-instant" - assert model_info.model_provider == "custom.langchain:ChatGroq" + assert lca.run_config.model_name == "custom.langchain:llama-3.1-8b-instant" + assert lca.run_config.model_provider_name == "custom.langchain:ChatGroq" def test_langchain_adapter_info(tmp_path): @@ -66,10 +65,9 @@ def test_langchain_adapter_info(tmp_path): lca = LangchainAdapter(kiln_task=task, model_name="llama_3_1_8b", provider="ollama") - model_info = lca.adapter_info() - assert model_info.adapter_name == "kiln_langchain_adapter" - assert model_info.model_name == "llama_3_1_8b" - assert model_info.model_provider == "ollama" + assert lca.adapter_name() == "kiln_langchain_adapter" + assert lca.run_config.model_name == "llama_3_1_8b" + assert lca.run_config.model_provider_name == "ollama" async def test_langchain_adapter_with_cot(tmp_path): @@ -81,7 +79,7 @@ async def test_langchain_adapter_with_cot(tmp_path): kiln_task=task, model_name="llama_3_1_8b", provider="ollama", - prompt_builder=SimpleChainOfThoughtPromptBuilder(task), + prompt_id="simple_chain_of_thought_prompt_builder", ) # Mock the base model and its invoke method diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py index de45caf2..2c2e0fca 100644 --- 
a/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py @@ -5,11 +5,11 @@ from openai import AsyncOpenAI from kiln_ai.adapters.ml_model_list import StructuredOutputMode -from kiln_ai.adapters.model_adapters.base_adapter import AdapterInfo, BasePromptBuilder from kiln_ai.adapters.model_adapters.openai_compatible_config import ( OpenAICompatibleConfig, ) from kiln_ai.adapters.model_adapters.openai_model_adapter import OpenAICompatibleAdapter +from kiln_ai.adapters.prompt_builders import BasePromptBuilder from kiln_ai.datamodel import Project, Task @@ -37,14 +37,6 @@ def mock_task(tmp_path): return task -@pytest.fixture -def mock_prompt_builder(): - builder = Mock(spec=BasePromptBuilder) - type(builder).prompt_builder_name = Mock(return_value="test_prompt_builder") - builder.prompt_id = Mock(return_value="test_prompt_id") - return builder - - @pytest.fixture def config(): return OpenAICompatibleConfig( @@ -56,44 +48,37 @@ def config(): ) -def test_initialization(config, mock_task, mock_prompt_builder): +def test_initialization(config, mock_task): adapter = OpenAICompatibleAdapter( config=config, kiln_task=mock_task, - prompt_builder=mock_prompt_builder, + prompt_id="simple_prompt_builder", tags=["test-tag"], ) assert isinstance(adapter.client, AsyncOpenAI) assert adapter.config == config assert adapter.kiln_task == mock_task - assert adapter.prompt_builder == mock_prompt_builder + assert adapter.run_config.task == mock_task + assert adapter.run_config.prompt_id == "simple_prompt_builder" assert adapter.default_tags == ["test-tag"] - assert adapter.model_name == config.model_name - assert adapter.model_provider_name == config.provider_name + assert adapter.run_config.model_name == config.model_name + assert adapter.run_config.model_provider_name == config.provider_name -def test_adapter_info(config, mock_task, mock_prompt_builder): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +def test_adapter_info(config, mock_task): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) - info = adapter.adapter_info() - assert isinstance(info, AdapterInfo) - assert info.model_name == config.model_name - assert info.model_provider == config.provider_name - assert info.adapter_name == "kiln_openai_compatible_adapter" - assert info.prompt_builder_name == "base_prompt_builder" - assert info.prompt_id == "test_prompt_id" + assert adapter.adapter_name() == "kiln_openai_compatible_adapter" + + assert adapter.run_config.model_name == config.model_name + assert adapter.run_config.model_provider_name == config.provider_name + assert adapter.run_config.prompt_id == "simple_prompt_builder" @pytest.mark.asyncio -async def test_response_format_options_unstructured( - config, mock_task, mock_prompt_builder -): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +async def test_response_format_options_unstructured(config, mock_task): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) # Mock has_structured_output to return False with patch.object(adapter, "has_structured_output", return_value=False): @@ -109,12 +94,8 @@ async def test_response_format_options_unstructured( ], ) @pytest.mark.asyncio -async def test_response_format_options_json_mode( - config, mock_task, mock_prompt_builder, mode -): - adapter = OpenAICompatibleAdapter( - config=config, 
kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +async def test_response_format_options_json_mode(config, mock_task, mode): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) with ( patch.object(adapter, "has_structured_output", return_value=True), @@ -134,12 +115,8 @@ async def test_response_format_options_json_mode( ], ) @pytest.mark.asyncio -async def test_response_format_options_function_calling( - config, mock_task, mock_prompt_builder, mode -): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +async def test_response_format_options_function_calling(config, mock_task, mode): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) with ( patch.object(adapter, "has_structured_output", return_value=True), @@ -153,12 +130,8 @@ async def test_response_format_options_function_calling( @pytest.mark.asyncio -async def test_response_format_options_json_instructions( - config, mock_task, mock_prompt_builder -): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +async def test_response_format_options_json_instructions(config, mock_task): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) with ( patch.object(adapter, "has_structured_output", return_value=True), @@ -172,12 +145,8 @@ async def test_response_format_options_json_instructions( @pytest.mark.asyncio -async def test_response_format_options_json_schema( - config, mock_task, mock_prompt_builder -): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +async def test_response_format_options_json_schema(config, mock_task): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) with ( patch.object(adapter, "has_structured_output", return_value=True), @@ -198,10 +167,8 @@ async def test_response_format_options_json_schema( } -def test_tool_call_params(config, mock_task, mock_prompt_builder): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +def test_tool_call_params(config, mock_task): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) params = adapter.tool_call_params() expected_schema = mock_task.output_schema() diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py b/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py index 420e276c..06d39dfe 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py @@ -3,7 +3,6 @@ import pytest from kiln_ai.adapters.model_adapters.base_adapter import ( - AdapterInfo, BaseAdapter, RunOutput, ) @@ -13,6 +12,7 @@ Project, Task, ) +from kiln_ai.datamodel.run_config import RunConfig from kiln_ai.utils.config import Config @@ -20,14 +20,8 @@ class MockAdapter(BaseAdapter): async def _run(self, input: dict | str) -> dict | str: return RunOutput(output="Test output", intermediate_outputs=None) - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - adapter_name="mock_adapter", - model_name="mock_model", - model_provider="mock_provider", - prompt_builder_name="mock_prompt_builder", - prompt_id="mock_prompt_id", - ) + def adapter_name(self) -> str: + return "mock_adapter" @pytest.fixture @@ -46,9 +40,12 @@ def test_task(tmp_path): @pytest.fixture def adapter(test_task): return MockAdapter( - 
test_task, - model_name="phi_3_5", - model_provider_name="ollama", + run_config=RunConfig( + task=test_task, + model_name="phi_3_5", + model_provider_name="ollama", + prompt_id="simple_chain_of_thought_prompt_builder", + ), ) @@ -98,13 +95,12 @@ def test_save_run_isolation(test_task, adapter): assert reloaded_output.source.type == DataSourceType.synthetic assert reloaded_output.rating is None assert reloaded_output.source.properties["adapter_name"] == "mock_adapter" - assert reloaded_output.source.properties["model_name"] == "mock_model" - assert reloaded_output.source.properties["model_provider"] == "mock_provider" + assert reloaded_output.source.properties["model_name"] == "phi_3_5" + assert reloaded_output.source.properties["model_provider"] == "ollama" assert ( - reloaded_output.source.properties["prompt_builder_name"] - == "mock_prompt_builder" + reloaded_output.source.properties["prompt_id"] + == "simple_chain_of_thought_prompt_builder" ) - assert reloaded_output.source.properties["prompt_id"] == "mock_prompt_id" # Run again, with same input and different output. Should create a new TaskRun. different_run_output = RunOutput( output="Different output", intermediate_outputs=None @@ -122,7 +118,7 @@ def test_save_run_isolation(test_task, adapter): properties={ "model_name": "mock_model", "model_provider": "mock_provider", - "prompt_builder_name": "mock_prompt_builder", + "prompt_id": "mock_prompt_builder", "adapter_name": "mock_adapter", }, ), @@ -225,6 +221,9 @@ async def test_autosave_true(test_task, adapter): assert output.output == "Test output" assert output.source.type == DataSourceType.synthetic assert output.source.properties["adapter_name"] == "mock_adapter" - assert output.source.properties["model_name"] == "mock_model" - assert output.source.properties["model_provider"] == "mock_provider" - assert output.source.properties["prompt_builder_name"] == "mock_prompt_builder" + assert output.source.properties["model_name"] == "phi_3_5" + assert output.source.properties["model_provider"] == "ollama" + assert ( + output.source.properties["prompt_id"] + == "simple_chain_of_thought_prompt_builder" + ) diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_structured_output.py b/libs/core/kiln_ai/adapters/model_adapters/test_structured_output.py index db6bf7c6..84e1a253 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_structured_output.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_structured_output.py @@ -12,16 +12,17 @@ built_in_models, ) from kiln_ai.adapters.model_adapters.base_adapter import ( - AdapterInfo, BaseAdapter, RunOutput, ) from kiln_ai.adapters.ollama_tools import ollama_online from kiln_ai.adapters.prompt_builders import ( BasePromptBuilder, + PromptId, SimpleChainOfThoughtPromptBuilder, ) from kiln_ai.adapters.test_prompt_adaptors import get_all_models_and_providers +from kiln_ai.datamodel.run_config import RunConfig from kiln_ai.datamodel.test_json_schema import json_joke_schema, json_triangle_schema @@ -39,9 +40,9 @@ async def test_structured_output_gpt_4o_mini(tmp_path): await run_structured_output_test(tmp_path, "gpt_4o_mini", "openai") -@pytest.mark.parametrize("model_name", ["llama_3_1_8b"]) +@pytest.mark.parametrize("model_name", ["llama_3_1_8b", "gemma_2_2b"]) @pytest.mark.ollama -async def test_structured_output_ollama_llama(tmp_path, model_name): +async def test_structured_output_ollama(tmp_path, model_name): if not await ollama_online(): pytest.skip("Ollama API not running. 
Expect it running on localhost:11434") await run_structured_output_test(tmp_path, model_name, "ollama") @@ -49,19 +50,21 @@ async def test_structured_output_ollama_llama(tmp_path, model_name): class MockAdapter(BaseAdapter): def __init__(self, kiln_task: datamodel.Task, response: Dict | str | None): - super().__init__(kiln_task, model_name="phi_3_5", model_provider_name="ollama") + super().__init__( + run_config=RunConfig( + task=kiln_task, + model_name="phi_3_5", + model_provider_name="ollama", + prompt_id="simple_chain_of_thought_prompt_builder", + ), + ) self.response = response async def _run(self, input: str) -> RunOutput: return RunOutput(output=self.response, intermediate_outputs=None) - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - adapter_name="mock_adapter", - model_name="mock_model", - model_provider="mock_provider", - prompt_builder_name="mock_prompt_builder", - ) + def adapter_name(self) -> str: + return "mock_adapter" async def test_mock_unstructred_response(tmp_path): @@ -204,15 +207,21 @@ async def run_structured_input_task( task: datamodel.Task, model_name: str, provider: str, - pb: BasePromptBuilder | None = None, + prompt_id: PromptId | None = None, ): a = adapter_for_task( - task, model_name=model_name, provider=provider, prompt_builder=pb + task, + model_name=model_name, + provider=provider, + prompt_id=prompt_id, ) with pytest.raises(ValueError): # not structured input in dictionary await a.invoke("a=1, b=2, c=3") - with pytest.raises(jsonschema.exceptions.ValidationError): + with pytest.raises( + ValueError, + match="This task requires a specific output schema. While the model produced JSON, that JSON didn't meet the schema.", + ): # invalid structured input await a.invoke({"a": 1, "b": 2, "d": 3}) @@ -229,13 +238,14 @@ async def run_structured_input_task( assert "[[equilateral]]" in response else: assert response["is_equilateral"] is True - adapter_info = a.adapter_info() + expected_pb_name = "simple_prompt_builder" - if pb is not None: - expected_pb_name = pb.__class__.prompt_builder_name() - assert adapter_info.prompt_builder_name == expected_pb_name - assert adapter_info.model_name == model_name - assert adapter_info.model_provider == provider + if prompt_id is not None: + expected_pb_name = prompt_id + assert a.run_config.prompt_id == expected_pb_name + + assert a.run_config.model_name == model_name + assert a.run_config.model_provider_name == provider @pytest.mark.paid @@ -257,8 +267,9 @@ async def test_all_built_in_models_structured_input( @pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers()) async def test_structured_input_cot_prompt_builder(tmp_path, model_name, provider_name): task = build_structured_input_test_task(tmp_path) - pb = SimpleChainOfThoughtPromptBuilder(task) - await run_structured_input_task(task, model_name, provider_name, pb) + await run_structured_input_task( + task, model_name, provider_name, "simple_chain_of_thought_prompt_builder" + ) @pytest.mark.paid @@ -302,5 +313,6 @@ async def test_structured_output_cot_prompt_builder( """ task.output_json_schema = json.dumps(triangle_schema) task.save_to_file() - pb = SimpleChainOfThoughtPromptBuilder(task) - await run_structured_input_task(task, model_name, provider_name, pb) + await run_structured_input_task( + task, model_name, provider_name, "simple_chain_of_thought_prompt_builder" + ) diff --git a/libs/core/kiln_ai/adapters/prompt_builders.py b/libs/core/kiln_ai/adapters/prompt_builders.py index 749311fe..82be0626 100644 --- 
a/libs/core/kiln_ai/adapters/prompt_builders.py +++ b/libs/core/kiln_ai/adapters/prompt_builders.py @@ -57,17 +57,6 @@ def build_base_prompt(self) -> str: """ pass - @classmethod - def prompt_builder_name(cls) -> str: - """Returns the name of the prompt builder, to be used for persisting into the datastore. - - Default implementation gets the name of the prompt builder in snake case. If you change the class name, you should override this so prior saved data is compatible. - - Returns: - str: The prompt builder name in snake_case format. - """ - return snake_case(cls.__name__) - def build_user_message(self, input: Dict | str) -> str: """Build a user message from the input. diff --git a/libs/core/kiln_ai/adapters/repair/repair_task.py b/libs/core/kiln_ai/adapters/repair/repair_task.py index e140b812..6163a62b 100644 --- a/libs/core/kiln_ai/adapters/repair/repair_task.py +++ b/libs/core/kiln_ai/adapters/repair/repair_task.py @@ -49,7 +49,7 @@ def _original_prompt(cls, run: TaskRun, task: Task) -> str: if run.output.source is None or run.output.source.properties is None: raise ValueError("No source properties found") - # Get the prompt builder - stored in 2 fields, mutually exclusive + # Get the prompt builder id. Need the second check because we used to store this in a prompt_builder_name field, so loading legacy runs will need this. prompt_id = run.output.source.properties.get( "prompt_id" ) or run.output.source.properties.get("prompt_builder_name", None) diff --git a/libs/core/kiln_ai/adapters/repair/test_repair_task.py b/libs/core/kiln_ai/adapters/repair/test_repair_task.py index 9c63d974..2d7d261f 100644 --- a/libs/core/kiln_ai/adapters/repair/test_repair_task.py +++ b/libs/core/kiln_ai/adapters/repair/test_repair_task.py @@ -95,7 +95,7 @@ def sample_task_run(sample_task): "model_name": "gpt_4o", "model_provider": "openai", "adapter_name": "langchain_adapter", - "prompt_builder_name": "simple_prompt_builder", + "prompt_id": "simple_prompt_builder", }, ), ), @@ -201,7 +201,7 @@ async def test_live_run(sample_task, sample_task_run, sample_repair_data): "adapter_name": "kiln_langchain_adapter", "model_name": "llama_3_1_8b", "model_provider": "groq", - "prompt_builder_name": "simple_prompt_builder", + "prompt_id": "simple_prompt_builder", } @@ -238,7 +238,7 @@ async def test_mocked_repair_task_run(sample_task, sample_task_run, sample_repai "adapter_name": "kiln_langchain_adapter", "model_name": "llama_3_1_8b", "model_provider": "ollama", - "prompt_builder_name": "simple_prompt_builder", + "prompt_id": "simple_prompt_builder", } assert run.input_source.type == DataSourceType.human assert "created_by" in run.input_source.properties diff --git a/libs/core/kiln_ai/adapters/test_adapter_registry.py b/libs/core/kiln_ai/adapters/test_adapter_registry.py index 6a70d11b..d803f2c2 100644 --- a/libs/core/kiln_ai/adapters/test_adapter_registry.py +++ b/libs/core/kiln_ai/adapters/test_adapter_registry.py @@ -89,19 +89,14 @@ def test_langchain_adapter_creation(mock_config, basic_task, provider): # TODO should run for all cases def test_custom_prompt_builder(mock_config, basic_task): - class TestPromptBuilder(BasePromptBuilder): - def build_base_prompt(self, kiln_task) -> str: - return "test-prompt" - - prompt_builder = TestPromptBuilder(basic_task) adapter = adapter_for_task( kiln_task=basic_task, model_name="gpt-4", provider=ModelProviderName.openai, - prompt_builder=prompt_builder, + prompt_id="simple_chain_of_thought_prompt_builder", ) - assert adapter.prompt_builder == prompt_builder + assert 
adapter.run_config.prompt_id == "simple_chain_of_thought_prompt_builder" # TODO should run for all cases @@ -129,6 +124,7 @@ def test_openai_compatible_adapter(mock_compatible_config, mock_config, basic_ta mock_compatible_config.return_value.model_name = "test-model" mock_compatible_config.return_value.api_key = "test-key" mock_compatible_config.return_value.base_url = "https://test.com/v1" + mock_compatible_config.return_value.provider_name = "CustomProvider99" adapter = adapter_for_task( kiln_task=basic_task, @@ -141,6 +137,7 @@ def test_openai_compatible_adapter(mock_compatible_config, mock_config, basic_ta assert adapter.config.model_name == "test-model" assert adapter.config.api_key == "test-key" assert adapter.config.base_url == "https://test.com/v1" + assert adapter.config.provider_name == "CustomProvider99" def test_custom_openai_compatible_provider(mock_config, basic_task): diff --git a/libs/core/kiln_ai/adapters/test_prompt_adaptors.py b/libs/core/kiln_ai/adapters/test_prompt_adaptors.py index e7b97f90..bd4188ed 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_adaptors.py +++ b/libs/core/kiln_ai/adapters/test_prompt_adaptors.py @@ -11,6 +11,7 @@ from kiln_ai.adapters.ollama_tools import ollama_online from kiln_ai.adapters.prompt_builders import ( BasePromptBuilder, + PromptId, SimpleChainOfThoughtPromptBuilder, ) @@ -132,7 +133,7 @@ async def test_mock_returning_run(tmp_path): "adapter_name": "kiln_langchain_adapter", "model_name": "custom.langchain:unknown_model", "model_provider": "ollama", - "prompt_builder_name": "simple_prompt_builder", + "prompt_id": "simple_prompt_builder", } @@ -149,8 +150,9 @@ async def test_all_models_providers_plaintext(tmp_path, model_name, provider_nam @pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers()) async def test_cot_prompt_builder(tmp_path, model_name, provider_name): task = build_test_task(tmp_path) - pb = SimpleChainOfThoughtPromptBuilder(task) - await run_simple_task(task, model_name, provider_name, pb) + await run_simple_task( + task, model_name, provider_name, "simple_chain_of_thought_prompt_builder" + ) def build_test_task(tmp_path: Path): @@ -186,20 +188,20 @@ async def run_simple_test( tmp_path: Path, model_name: str, provider: str | None = None, - prompt_builder: BasePromptBuilder | None = None, + prompt_id: PromptId | None = None, ): task = build_test_task(tmp_path) - return await run_simple_task(task, model_name, provider, prompt_builder) + return await run_simple_task(task, model_name, provider, prompt_id) async def run_simple_task( task: datamodel.Task, model_name: str, provider: str, - prompt_builder: BasePromptBuilder | None = None, + prompt_id: PromptId | None = None, ) -> datamodel.TaskRun: adapter = adapter_for_task( - task, model_name=model_name, provider=provider, prompt_builder=prompt_builder + task, model_name=model_name, provider=provider, prompt_id=prompt_id ) run = await adapter.invoke( @@ -212,13 +214,14 @@ async def run_simple_task( ) assert "64" in run.output.output source_props = run.output.source.properties - assert source_props["adapter_name"] == "kiln_langchain_adapter" + assert source_props["adapter_name"] in [ + "kiln_langchain_adapter", + "kiln_openai_compatible_adapter", + ] assert source_props["model_name"] == model_name assert source_props["model_provider"] == provider - expected_prompt_builder_name = ( - prompt_builder.__class__.prompt_builder_name() - if prompt_builder - else "simple_prompt_builder" - ) - assert source_props["prompt_builder_name"] == 
expected_prompt_builder_name + if prompt_id is None: + assert source_props["prompt_id"] == "simple_prompt_builder" + else: + assert source_props["prompt_id"] == prompt_id return run diff --git a/libs/core/kiln_ai/adapters/test_prompt_builders.py b/libs/core/kiln_ai/adapters/test_prompt_builders.py index 0d800942..695ae980 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_builders.py +++ b/libs/core/kiln_ai/adapters/test_prompt_builders.py @@ -3,7 +3,7 @@ import pytest from pydantic import BaseModel, ValidationError -from kiln_ai.adapters.model_adapters.base_adapter import AdapterInfo, BaseAdapter +from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter from kiln_ai.adapters.model_adapters.test_structured_output import ( build_structured_output_test_task, ) @@ -62,12 +62,8 @@ class MockAdapter(BaseAdapter): def _run(self, input: str) -> str: return "mock response" - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - adapter_name="mock_adapter", - model_name="mock_model", - model_provider="mock_provider", - ) + def adapter_name(self) -> str: + return "mock_adapter" def test_simple_prompt_builder_structured_output(tmp_path): @@ -319,12 +315,6 @@ def check_example_outputs(task: Task, count: int): assert f"## Example {count}" in prompt -def test_prompt_builder_name(): - assert SimplePromptBuilder.prompt_builder_name() == "simple_prompt_builder" - assert MultiShotPromptBuilder.prompt_builder_name() == "multi_shot_prompt_builder" - assert RepairsPromptBuilder.prompt_builder_name() == "repairs_prompt_builder" - - def test_prompt_builder_from_id(task_with_examples): task = task_with_examples assert isinstance( diff --git a/libs/core/kiln_ai/datamodel/run_config.py b/libs/core/kiln_ai/datamodel/run_config.py new file mode 100644 index 00000000..da25907f --- /dev/null +++ b/libs/core/kiln_ai/datamodel/run_config.py @@ -0,0 +1,75 @@ +from typing import TYPE_CHECKING, Union + +from pydantic import BaseModel, Field, model_validator +from typing_extensions import Self + +from kiln_ai.adapters.prompt_builders import ( + BasePromptBuilder, + PromptGenerators, + PromptId, + prompt_builder_from_id, +) +from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel +from kiln_ai.datamodel.task import Task + +if TYPE_CHECKING: + from kiln_ai.datamodel.task import Task + + +class RunConfig(BaseModel): + """ + A configuration for running a task. + + This includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). + + For example: task, model, provider, prompt (ID, builder, etc), etc. + """ + + task: "Task" = Field(description="The task to run.") + model_name: str = Field(description="The model to use for this run config.") + model_provider_name: str = Field( + description="The provider to use for this run config." + ) + prompt_id: PromptId = Field( + description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.", + default=PromptGenerators.SIMPLE, + ) + + def prompt_builder(self) -> BasePromptBuilder: + return prompt_builder_from_id(self.prompt_id, self.task) + + +class TaskRunConfig(RunConfig, KilnParentedModel): + """ + A run config, parented to a Kiln Task. + + A run config includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). 
+ + Used for saving and sharing run configs in a Kiln Project. + """ + + name: str = NAME_FIELD + description: str | None = Field( + default=None, description="The description of the task run config." + ) + run_config: RunConfig = Field( + description="The run config to use for this task run." + ) + + # Workaround to return typed parent without importing Task + def parent_task(self) -> Union["Task", None]: + if self.parent is None or self.parent.__class__.__name__ != "Task": + return None + return self.parent # type: ignore + + @model_validator(mode="after") + def validate_task(self) -> Self: + # Check that the task in the run config matches the parent task + parent_task = self.parent_task() + if parent_task is None: + raise ValueError("Run config must be parented to a task") + if self.run_config.task is None: + raise ValueError("Run config must have a task") + if self.run_config.task.id != parent_task.id: + raise ValueError("Run config task must match parent task") + return self diff --git a/libs/core/kiln_ai/datamodel/task_output.py b/libs/core/kiln_ai/datamodel/task_output.py index ae0de84d..96463432 100644 --- a/libs/core/kiln_ai/datamodel/task_output.py +++ b/libs/core/kiln_ai/datamodel/task_output.py @@ -205,13 +205,13 @@ class DataSource(BaseModel): not_allowed_for=[DataSourceType.human], ), DataSourceProperty( + # Legacy field -- allow loading from old runs, but we shouldn't be setting it. name="prompt_builder_name", type=str, not_allowed_for=[DataSourceType.human], ), DataSourceProperty( - # Optional: an ID within the scope of the prompt_builder_name. - # Used for prompt builders with IDs (like saved prompts, fine-tune prompts) + # The PromptId of the prompt. Can be a saved prompt, fine-tune, generator name, etc. See PromptId type for more details. 
name="prompt_id", type=str, not_allowed_for=[DataSourceType.human], diff --git a/libs/core/kiln_ai/datamodel/test_basemodel.py b/libs/core/kiln_ai/datamodel/test_basemodel.py index 460b9dea..2dc848d1 100644 --- a/libs/core/kiln_ai/datamodel/test_basemodel.py +++ b/libs/core/kiln_ai/datamodel/test_basemodel.py @@ -6,7 +6,7 @@ import pytest -from kiln_ai.adapters.model_adapters.base_adapter import AdapterInfo, BaseAdapter +from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter from kiln_ai.adapters.run_output import RunOutput from kiln_ai.datamodel import Task, TaskRun from kiln_ai.datamodel.basemodel import ( @@ -15,6 +15,7 @@ string_to_valid_name, ) from kiln_ai.datamodel.model_cache import ModelCache +from kiln_ai.datamodel.run_config import RunConfig @pytest.fixture @@ -484,13 +485,8 @@ class MockAdapter(BaseAdapter): async def _run(self, input): return RunOutput(output="test output", intermediate_outputs=None) - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - adapter_name="test", - model_name=self.model_name, - model_provider=self.model_provider_name, - prompt_builder_name="test", - ) + def adapter_name(self) -> str: + return "test" @pytest.fixture @@ -501,9 +497,11 @@ def base_task(): @pytest.fixture def adapter(base_task): return MockAdapter( - kiln_task=base_task, - model_name="test_model", - model_provider_name="test_provider", + run_config=RunConfig( + task=base_task, + model_name="test_model", + model_provider_name="test_provider", + ), ) diff --git a/libs/core/kiln_ai/datamodel/test_datasource.py b/libs/core/kiln_ai/datamodel/test_datasource.py index f10ef140..934a96a4 100644 --- a/libs/core/kiln_ai/datamodel/test_datasource.py +++ b/libs/core/kiln_ai/datamodel/test_datasource.py @@ -18,14 +18,14 @@ def test_valid_synthetic_data_source(): properties={ "model_name": "GPT-4", "model_provider": "OpenAI", - "prompt_builder_name": "completion", + "prompt_id": "simple_prompt_builder", "adapter_name": "langchain", }, ) assert data_source.type == DataSourceType.synthetic assert data_source.properties["model_name"] == "GPT-4" assert data_source.properties["model_provider"] == "OpenAI" - assert data_source.properties["prompt_builder_name"] == "completion" + assert data_source.properties["prompt_id"] == "simple_prompt_builder" assert data_source.properties["adapter_name"] == "langchain" @@ -85,6 +85,7 @@ def test_prompt_type_optional_for_synthetic(): }, ) assert "prompt_builder_name" not in data_source.properties + assert "prompt_id" not in data_source.properties def test_private_data_source_properties_not_serialized(): diff --git a/libs/core/kiln_ai/datamodel/test_example_models.py b/libs/core/kiln_ai/datamodel/test_example_models.py index 423fa208..a0dc5e10 100644 --- a/libs/core/kiln_ai/datamodel/test_example_models.py +++ b/libs/core/kiln_ai/datamodel/test_example_models.py @@ -155,7 +155,7 @@ def test_structured_output_workflow(tmp_path): "adapter_name": "TestAdapter", "model_name": "GPT-4", "model_provider": "OpenAI", - "prompt_builder_name": "TestPromptBuilder", + "prompt_id": "simple_prompt_builder", }, ), parent=task, @@ -470,7 +470,7 @@ def test_valid_synthetic_task_output(): "adapter_name": "TestAdapter", "model_name": "GPT-4", "model_provider": "OpenAI", - "prompt_builder_name": "TestPromptBuilder", + "prompt_id": "simple_prompt_builder", }, ), ) @@ -478,7 +478,7 @@ def test_valid_synthetic_task_output(): assert output.source.properties["adapter_name"] == "TestAdapter" assert output.source.properties["model_name"] == "GPT-4" assert 
output.source.properties["model_provider"] == "OpenAI" - assert output.source.properties["prompt_builder_name"] == "TestPromptBuilder" + assert output.source.properties["prompt_id"] == "simple_prompt_builder" def test_invalid_synthetic_task_output_missing_keys(): @@ -507,23 +507,21 @@ def test_invalid_synthetic_task_output_empty_values(): "adapter_name": "TestAdapter", "model_name": "", "model_provider": "OpenAI", - "prompt_builder_name": "TestPromptBuilder", + "prompt_id": "simple_prompt_builder", }, ), ) def test_invalid_synthetic_task_output_non_string_values(): - with pytest.raises( - ValidationError, match="'prompt_builder_name' must be of type str" - ): + with pytest.raises(ValidationError, match="'prompt_id' must be of type str"): DataSource( type=DataSourceType.synthetic, properties={ "adapter_name": "TestAdapter", "model_name": "GPT-4", "model_provider": "OpenAI", - "prompt_builder_name": 123, + "prompt_id": 123, }, ) diff --git a/libs/server/kiln_server/run_api.py b/libs/server/kiln_server/run_api.py index 7c02ae19..e0ae2826 100644 --- a/libs/server/kiln_server/run_api.py +++ b/libs/server/kiln_server/run_api.py @@ -5,7 +5,7 @@ from fastapi import FastAPI, HTTPException from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.ml_model_list import ModelProviderName -from kiln_ai.adapters.prompt_builders import prompt_builder_from_id +from kiln_ai.adapters.prompt_builders import PromptId from kiln_ai.datamodel import Task, TaskOutputRating, TaskOutputRatingType, TaskRun from kiln_ai.datamodel.basemodel import ID_TYPE from pydantic import BaseModel, ConfigDict @@ -38,7 +38,7 @@ class RunTaskRequest(BaseModel): provider: str plaintext_input: str | None = None structured_input: Dict[str, Any] | None = None - ui_prompt_method: str | None = None + ui_prompt_method: PromptId | None = None tags: list[str] | None = None # Allows use of the model_name field (usually pydantic will reserve model_*) @@ -188,20 +188,11 @@ async def run_task( ) -> TaskRun: task = task_from_id(project_id, task_id) - prompt_builder = prompt_builder_from_id( - request.ui_prompt_method or "simple_prompt_builder", - task, - ) - if prompt_builder is None: - raise HTTPException( - status_code=400, - detail=f"Unknown prompt method: {request.ui_prompt_method}", - ) adapter = adapter_for_task( task, model_name=request.model_name, provider=model_provider_from_string(request.provider), - prompt_builder=prompt_builder, + prompt_id=request.ui_prompt_method or "simple_prompt_builder", tags=request.tags, ) diff --git a/libs/server/kiln_server/test_run_api.py b/libs/server/kiln_server/test_run_api.py index 477b288e..e64ee3c4 100644 --- a/libs/server/kiln_server/test_run_api.py +++ b/libs/server/kiln_server/test_run_api.py @@ -84,7 +84,7 @@ def task_run_setup(tmp_path): "model_name": "gpt_4o", "model_provider": "ollama", "adapter_name": "kiln_langchain_adapter", - "prompt_builder_name": "simple_prompt_builder", + "prompt_id": "simple_prompt_builder", }, ), ), From 446fafe8a6e59553288ede1b7116ad8827d5c10c Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 18:47:03 -0500 Subject: [PATCH 015/102] Remove console.log --- .../src/lib/utils/json_schema_editor/json_schema_templates.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/app/web_ui/src/lib/utils/json_schema_editor/json_schema_templates.ts b/app/web_ui/src/lib/utils/json_schema_editor/json_schema_templates.ts index 028d5b86..4068ba4c 100644 --- a/app/web_ui/src/lib/utils/json_schema_editor/json_schema_templates.ts +++ 
b/app/web_ui/src/lib/utils/json_schema_editor/json_schema_templates.ts @@ -80,7 +80,6 @@ export function schema_from_model( required.push(key) } } - console.log(properties) return { type: "object", properties: properties, From 6e72cf52c6f2116aa4b9f6426d4560d28411b4c4 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 18:49:00 -0500 Subject: [PATCH 016/102] Better API typing: check for valid PromptId using pydantic types --- app/desktop/studio_server/prompt_api.py | 4 ++-- app/desktop/studio_server/test_prompt_api.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/app/desktop/studio_server/prompt_api.py b/app/desktop/studio_server/prompt_api.py index 913e07cd..4f63431a 100644 --- a/app/desktop/studio_server/prompt_api.py +++ b/app/desktop/studio_server/prompt_api.py @@ -6,7 +6,7 @@ class PromptApiResponse(BaseModel): prompt: str - prompt_id: str + prompt_id: PromptId def connect_prompt_api(app: FastAPI): @@ -14,7 +14,7 @@ def connect_prompt_api(app: FastAPI): async def generate_prompt( project_id: str, task_id: str, - prompt_id: str, + prompt_id: PromptId, ) -> PromptApiResponse: task = task_from_id(project_id, task_id) diff --git a/app/desktop/studio_server/test_prompt_api.py b/app/desktop/studio_server/test_prompt_api.py index dc82b5cf..0b1ccf67 100644 --- a/app/desktop/studio_server/test_prompt_api.py +++ b/app/desktop/studio_server/test_prompt_api.py @@ -84,5 +84,5 @@ def test_generate_prompt_id_format(client, mock_task, mock_task_from_id): "/api/projects/project123/task/task456/gen_prompt/invalid_generator_id" ) - assert response.status_code == 400 - assert "Unknown prompt generator: invalid_generator_id" in response.text + assert response.status_code == 422 + assert "Value error, Invalid prompt ID: invalid_generator_id" in response.text From 9d73952cd4b3265ebb784b0f57d356afd36fe815 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 18:51:28 -0500 Subject: [PATCH 017/102] Update prompt_builder_from_id to take a typed string for extra typechecking --- libs/core/kiln_ai/adapters/prompt_builders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/core/kiln_ai/adapters/prompt_builders.py b/libs/core/kiln_ai/adapters/prompt_builders.py index 82be0626..3bdffed4 100644 --- a/libs/core/kiln_ai/adapters/prompt_builders.py +++ b/libs/core/kiln_ai/adapters/prompt_builders.py @@ -450,11 +450,11 @@ def _check_prompt_id(id: str) -> str: # Our UI has some names that are not the same as the class names, which also hint parameters. -def prompt_builder_from_id(prompt_id: str, task: Task) -> BasePromptBuilder: +def prompt_builder_from_id(prompt_id: PromptId, task: Task) -> BasePromptBuilder: """Convert a name used in the UI to the corresponding prompt builder class. Args: - prompt_id (str): The prompt ID. + prompt_id (PromptId): The prompt ID. Returns: type[BasePromptBuilder]: The corresponding prompt builder class. 
From b261e4eff54506768e55b583578a3070e63949d9 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 19:03:11 -0500 Subject: [PATCH 018/102] Remove duplicate data storage inside the adapter --- .../adapters/model_adapters/base_adapter.py | 24 +++++++++---------- .../model_adapters/langchain_adapters.py | 8 +++---- .../model_adapters/openai_model_adapter.py | 4 ++-- .../test_openai_model_adapter.py | 1 - .../kiln_ai/adapters/test_adapter_registry.py | 2 +- 5 files changed, 19 insertions(+), 20 deletions(-) diff --git a/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py index 133cc13e..62a73e01 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py @@ -10,6 +10,7 @@ from kiln_ai.datamodel import ( DataSource, DataSourceType, + Task, TaskOutput, TaskRun, ) @@ -54,32 +55,31 @@ def __init__( config: AdapterConfig | None = None, ): self.run_config = run_config - # TODO: remove these? Use run_config directly? self.prompt_builder = run_config.prompt_builder() - self.kiln_task = run_config.task - self.model_name = run_config.model_name - self.model_provider_name = run_config.model_provider_name self._model_provider: KilnModelProvider | None = None - self.output_schema = self.kiln_task.output_json_schema - self.input_schema = self.kiln_task.input_json_schema + self.output_schema = self.task().output_json_schema + self.input_schema = self.task().input_json_schema self.default_tags = tags self.base_adapter_config = config or AdapterConfig() + def task(self) -> Task: + return self.run_config.task + def model_provider(self) -> KilnModelProvider: """ Lazy load the model provider for this adapter. """ if self._model_provider is not None: return self._model_provider - if not self.model_name or not self.model_provider_name: + if not self.run_config.model_name or not self.run_config.model_provider_name: raise ValueError("model_name and model_provider_name must be provided") self._model_provider = kiln_model_provider_from( - self.model_name, self.model_provider_name + self.run_config.model_name, self.run_config.model_provider_name ) if not self._model_provider: raise ValueError( - f"model_provider_name {self.model_provider_name} not found for model {self.model_name}" + f"model_provider_name {self.run_config.model_provider_name} not found for model {self.run_config.model_name}" ) return self._model_provider @@ -89,7 +89,7 @@ async def invoke_returning_raw( input_source: DataSource | None = None, ) -> Dict | str: result = await self.invoke(input, input_source) - if self.kiln_task.output_json_schema is None: + if self.task().output_json_schema is None: return result.output.output else: return json.loads(result.output.output) @@ -143,7 +143,7 @@ async def invoke_returning_run_output( if ( self.base_adapter_config.allow_saving and Config.shared().autosave_runs - and self.kiln_task.path is not None + and self.task().path is not None ): run.save_to_file() else: @@ -219,7 +219,7 @@ def generate_run( ) new_task_run = TaskRun( - parent=self.kiln_task, + parent=self.task(), input=input_str, input_source=input_source, output=TaskOutput( diff --git a/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py b/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py index 271855ee..d276cb67 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py +++ b/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py @@ -122,15 +122,15 @@ 
async def model(self) -> LangChainModelType: f"model {self._model} does not support structured output, cannot use output_json_schema" ) # Langchain expects title/description to be at top level, on top of json schema - output_schema = self.kiln_task.output_schema() + output_schema = self.task().output_schema() if output_schema is None: raise ValueError( - f"output_json_schema is not valid json: {self.kiln_task.output_json_schema}" + f"output_json_schema is not valid json: {self.task().output_json_schema}" ) output_schema["title"] = "task_response" output_schema["description"] = "A response from the task" with_structured_output_options = self.get_structured_output_options( - self.model_name, self.model_provider_name + self.run_config.model_name, self.run_config.model_provider_name ) self._model = self._model.with_structured_output( output_schema, @@ -256,7 +256,7 @@ def get_structured_output_options( async def langchain_model_from(self) -> BaseChatModel: provider = self.model_provider() - return await langchain_model_from_provider(provider, self.model_name) + return await langchain_model_from_provider(provider, self.run_config.model_name) async def langchain_model_from_provider( diff --git a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py index 6e63423d..cabbd29e 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py @@ -204,7 +204,7 @@ async def response_format_options(self) -> dict[str, Any]: case StructuredOutputMode.json_mode: return {"response_format": {"type": "json_object"}} case StructuredOutputMode.json_schema: - output_schema = self.kiln_task.output_schema() + output_schema = self.task().output_schema() return { "response_format": { "type": "json_schema", @@ -230,7 +230,7 @@ async def response_format_options(self) -> dict[str, Any]: def tool_call_params(self) -> dict[str, Any]: # Add additional_properties: false to the schema (OpenAI requires this for some models) - output_schema = self.kiln_task.output_schema() + output_schema = self.task().output_schema() if not isinstance(output_schema, dict): raise ValueError( "Invalid output schema for this task. Can not use tool calls." 
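The adapter changes in this patch all follow one idea: stop copying model_name, model_provider_name and the task onto the adapter, and read them through run_config (or the task() accessor) so there is a single source of truth. A rough sketch of that shape, using hypothetical names rather than the real adapter classes:

from dataclasses import dataclass


@dataclass
class DemoRunConfig:
    task_name: str
    model_name: str
    model_provider_name: str


class DemoAdapter:
    def __init__(self, run_config: DemoRunConfig):
        # Keep only the config object; no duplicated attributes to drift out of sync
        self.run_config = run_config

    def task_name(self) -> str:
        return self.run_config.task_name

    def describe(self) -> str:
        rc = self.run_config
        return f"{self.task_name()} via {rc.model_name} ({rc.model_provider_name})"


adapter = DemoAdapter(DemoRunConfig("summarize", "gpt-4", "openai"))
print(adapter.describe())  # summarize via gpt-4 (openai)
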
diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py index 2c2e0fca..b481f807 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py @@ -58,7 +58,6 @@ def test_initialization(config, mock_task): assert isinstance(adapter.client, AsyncOpenAI) assert adapter.config == config - assert adapter.kiln_task == mock_task assert adapter.run_config.task == mock_task assert adapter.run_config.prompt_id == "simple_prompt_builder" assert adapter.default_tags == ["test-tag"] diff --git a/libs/core/kiln_ai/adapters/test_adapter_registry.py b/libs/core/kiln_ai/adapters/test_adapter_registry.py index d803f2c2..38308e76 100644 --- a/libs/core/kiln_ai/adapters/test_adapter_registry.py +++ b/libs/core/kiln_ai/adapters/test_adapter_registry.py @@ -84,7 +84,7 @@ def test_langchain_adapter_creation(mock_config, basic_task, provider): ) assert isinstance(adapter, LangchainAdapter) - assert adapter.model_name == "test-model" + assert adapter.run_config.model_name == "test-model" # TODO should run for all cases From e0ab86cd26428bef36b1be4dc96c1b322c2ce4cf Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 22:11:55 -0500 Subject: [PATCH 019/102] Fix ID parsing --- libs/core/kiln_ai/adapters/prompt_builders.py | 4 ++-- libs/core/kiln_ai/adapters/test_prompt_builders.py | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/libs/core/kiln_ai/adapters/prompt_builders.py b/libs/core/kiln_ai/adapters/prompt_builders.py index 3bdffed4..9b53e13a 100644 --- a/libs/core/kiln_ai/adapters/prompt_builders.py +++ b/libs/core/kiln_ai/adapters/prompt_builders.py @@ -422,9 +422,9 @@ def _check_prompt_id(id: str) -> str: if id.startswith("id::"): # check it has 4 parts divided by :: -- 'id::project_id::task_id::prompt_id' parts = id.split("::") - if len(parts) != 4: + if len(parts) != 2 or len(parts[1]) == 0: raise ValueError( - f"Invalid saved prompt ID: {id}. Expected format: 'id::[project_id]::[task_id]::[prompt_id]'." + f"Invalid saved prompt ID: {id}. Expected format: 'id::[prompt_id]'." 
) return id diff --git a/libs/core/kiln_ai/adapters/test_prompt_builders.py b/libs/core/kiln_ai/adapters/test_prompt_builders.py index 695ae980..5e8a0c2a 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_builders.py +++ b/libs/core/kiln_ai/adapters/test_prompt_builders.py @@ -603,7 +603,7 @@ def test_valid_prompt_generator_names(): def test_valid_saved_prompt_id(): """Test that valid saved prompt IDs are accepted""" - valid_id = "id::project_123::task_456::prompt_789" + valid_id = "id::prompt_789" model = TestModel(prompt_id=valid_id) assert model.prompt_id == valid_id @@ -619,11 +619,8 @@ def test_valid_fine_tune_prompt_id(): "invalid_id", [ pytest.param("id::project_123::task_456", id="missing_prompt_id"), - pytest.param( - "id::project_123::task_456::prompt_789::extra", id="too_many_parts" - ), + pytest.param("id::task_456::prompt_789", id="too_many_parts"), pytest.param("id::", id="empty_parts"), - pytest.param("id::project_123", id="too_few_parts"), ], ) def test_invalid_saved_prompt_id_format(invalid_id): From 781c66f66025c94c17a2c5468d9c619a081d8e80 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 16 Feb 2025 23:15:37 -0500 Subject: [PATCH 020/102] Fix prompt name UI --- .../[task_id]/[run_id]/run/+page.svelte | 41 +++++++++++-------- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte b/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte index 49f015d9..41b87ee3 100644 --- a/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte +++ b/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte @@ -30,25 +30,31 @@ let model_props: Record = {} $: { - // Attempt to lookup a nice name for the prompt - let prompt_name = $current_task_prompts?.prompts.find( - (prompt) => prompt.id === run?.output?.source?.properties?.prompt_id, - )?.name - let prompt_generator_name = $current_task_prompts?.generators.find( - (generator) => - generator.id === run?.output?.source?.properties?.prompt_builder_name, - )?.name + // Prompt ID previously was stored in the prompt_builder_name field + let prompt_id = ( + run?.output?.source?.properties?.prompt_id || + run?.output?.source?.properties?.prompt_builder_name || + "" + ).toString() + let prompt_name: string | undefined = undefined + // Attempt to lookup a nice name for the prompt. 
First from named prompts, then from generators // Special case for fine-tuned prompts - if ( - run?.output?.source?.properties?.prompt_builder_name === - "fine_tune_prompt_builder" - ) { - prompt_generator_name = "Fine-Tune Prompt" - prompt_name = undefined - } else if (!prompt_generator_name && !prompt_name) { - prompt_generator_name = - "" + run?.output?.source?.properties?.prompt_builder_name + if (prompt_id && prompt_id.startsWith("fine_tune_prompt::")) { + prompt_name = "Fine-Tune Prompt" + } + if (!prompt_name) { + prompt_name = $current_task_prompts?.prompts.find( + (prompt) => "id::" + prompt.id === prompt_id, + )?.name + } + if (!prompt_name) { + prompt_name = $current_task_prompts?.generators.find( + (generator) => generator.id === prompt_id, + )?.name + } + if (!prompt_name) { + prompt_name = prompt_id } let topic_path: string | undefined = undefined @@ -74,7 +80,6 @@ $model_info, ), "Model Provider": run?.output?.source?.properties?.model_provider, - "Prompt Generator": prompt_generator_name, Prompt: prompt_name, "Created By": run?.input_source?.properties?.created_by, "Created At": formatDate(run?.created_at), From 24c83486a423a68673ea1f141977204e098eec7a Mon Sep 17 00:00:00 2001 From: scosman Date: Mon, 17 Feb 2025 19:53:58 -0500 Subject: [PATCH 021/102] Refactoring: - PromptID into the datamodel. - RunConfig into a task --- app/desktop/studio_server/data_gen_api.py | 3 +- app/desktop/studio_server/prompt_api.py | 3 +- .../core/kiln_ai/adapters/adapter_registry.py | 2 +- .../adapters/model_adapters/base_adapter.py | 7 +- .../model_adapters/langchain_adapters.py | 6 +- .../model_adapters/openai_model_adapter.py | 4 +- .../model_adapters/test_base_adapter.py | 2 +- .../model_adapters/test_langchain_adapter.py | 2 +- .../test_saving_adapter_results.py | 2 +- .../model_adapters/test_structured_output.py | 10 +- libs/core/kiln_ai/adapters/prompt_builders.py | 67 +--------- .../kiln_ai/adapters/test_prompt_adaptors.py | 2 +- .../kiln_ai/adapters/test_prompt_builders.py | 111 +---------------- libs/core/kiln_ai/datamodel/__init__.py | 8 ++ libs/core/kiln_ai/datamodel/prompt_id.py | 69 +++++++++++ libs/core/kiln_ai/datamodel/run_config.py | 42 +------ libs/core/kiln_ai/datamodel/task.py | 23 ++++ libs/core/kiln_ai/datamodel/test_basemodel.py | 2 +- libs/core/kiln_ai/datamodel/test_prompt_id.py | 116 ++++++++++++++++++ libs/server/kiln_server/run_api.py | 9 +- libs/server/kiln_server/test_prompt_api.py | 3 +- 21 files changed, 253 insertions(+), 240 deletions(-) create mode 100644 libs/core/kiln_ai/datamodel/prompt_id.py create mode 100644 libs/core/kiln_ai/datamodel/test_prompt_id.py diff --git a/app/desktop/studio_server/data_gen_api.py b/app/desktop/studio_server/data_gen_api.py index 958cabdd..c6fb66f6 100644 --- a/app/desktop/studio_server/data_gen_api.py +++ b/app/desktop/studio_server/data_gen_api.py @@ -6,8 +6,7 @@ DataGenSampleTask, DataGenSampleTaskInput, ) -from kiln_ai.adapters.prompt_builders import PromptId -from kiln_ai.datamodel import DataSource, DataSourceType, TaskRun +from kiln_ai.datamodel import DataSource, DataSourceType, PromptId, TaskRun from kiln_server.run_api import model_provider_from_string from kiln_server.task_api import task_from_id from pydantic import BaseModel, ConfigDict, Field diff --git a/app/desktop/studio_server/prompt_api.py b/app/desktop/studio_server/prompt_api.py index 4f63431a..4e992983 100644 --- a/app/desktop/studio_server/prompt_api.py +++ b/app/desktop/studio_server/prompt_api.py @@ -1,5 +1,6 @@ from fastapi import FastAPI, 
HTTPException -from kiln_ai.adapters.prompt_builders import PromptId, prompt_builder_from_id +from kiln_ai.adapters.prompt_builders import prompt_builder_from_id +from kiln_ai.datamodel import PromptId from kiln_server.task_api import task_from_id from pydantic import BaseModel diff --git a/libs/core/kiln_ai/adapters/adapter_registry.py b/libs/core/kiln_ai/adapters/adapter_registry.py index 60786b51..a8a04ca6 100644 --- a/libs/core/kiln_ai/adapters/adapter_registry.py +++ b/libs/core/kiln_ai/adapters/adapter_registry.py @@ -8,8 +8,8 @@ OpenAICompatibleAdapter, OpenAICompatibleConfig, ) -from kiln_ai.adapters.prompt_builders import PromptId from kiln_ai.adapters.provider_tools import core_provider, openai_compatible_config +from kiln_ai.datamodel import PromptId from kiln_ai.utils.config import Config from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error diff --git a/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py index 62a73e01..313662c1 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py @@ -5,6 +5,7 @@ from kiln_ai.adapters.ml_model_list import KilnModelProvider, StructuredOutputMode from kiln_ai.adapters.parsers.parser_registry import model_parser_from_id +from kiln_ai.adapters.prompt_builders import prompt_builder_from_id from kiln_ai.adapters.provider_tools import kiln_model_provider_from from kiln_ai.adapters.run_output import RunOutput from kiln_ai.datamodel import ( @@ -15,7 +16,7 @@ TaskRun, ) from kiln_ai.datamodel.json_schema import validate_schema -from kiln_ai.datamodel.run_config import RunConfig +from kiln_ai.datamodel.task import RunConfig from kiln_ai.utils.config import Config @@ -55,7 +56,9 @@ def __init__( config: AdapterConfig | None = None, ): self.run_config = run_config - self.prompt_builder = run_config.prompt_builder() + self.prompt_builder = prompt_builder_from_id( + run_config.prompt_id, run_config.task + ) self._model_provider: KilnModelProvider | None = None self.output_schema = self.task().output_json_schema diff --git a/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py b/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py index d276cb67..e9896c69 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py +++ b/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py @@ -29,10 +29,8 @@ ollama_base_url, ollama_model_installed, ) -from kiln_ai.adapters.prompt_builders import ( - PromptId, -) -from kiln_ai.datamodel.run_config import RunConfig +from kiln_ai.datamodel import PromptId +from kiln_ai.datamodel.task import RunConfig from kiln_ai.utils.config import Config from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error diff --git a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py index cabbd29e..d5edcba5 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py @@ -20,8 +20,8 @@ OpenAICompatibleConfig, ) from kiln_ai.adapters.parsers.json_parser import parse_json_string -from kiln_ai.adapters.prompt_builders import PromptId -from kiln_ai.datamodel.run_config import RunConfig +from kiln_ai.datamodel import PromptId +from kiln_ai.datamodel.task import RunConfig from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error diff --git 
a/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py index a9d67365..3628fc72 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py @@ -5,7 +5,7 @@ from kiln_ai.adapters.ml_model_list import KilnModelProvider, StructuredOutputMode from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter from kiln_ai.datamodel import Task -from kiln_ai.datamodel.run_config import RunConfig +from kiln_ai.datamodel.task import RunConfig class MockAdapter(BaseAdapter): diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_langchain_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/test_langchain_adapter.py index 72519e8c..e62a87d4 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_langchain_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_langchain_adapter.py @@ -19,7 +19,7 @@ langchain_model_from_provider, ) from kiln_ai.adapters.test_prompt_adaptors import build_test_task -from kiln_ai.datamodel.run_config import RunConfig +from kiln_ai.datamodel.task import RunConfig @pytest.fixture diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py b/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py index 06d39dfe..0c904507 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py @@ -12,7 +12,7 @@ Project, Task, ) -from kiln_ai.datamodel.run_config import RunConfig +from kiln_ai.datamodel.task import RunConfig from kiln_ai.utils.config import Config diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_structured_output.py b/libs/core/kiln_ai/adapters/model_adapters/test_structured_output.py index 84e1a253..2cc2bcbb 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_structured_output.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_structured_output.py @@ -2,8 +2,6 @@ from pathlib import Path from typing import Dict -import jsonschema -import jsonschema.exceptions import pytest import kiln_ai.datamodel as datamodel @@ -16,13 +14,9 @@ RunOutput, ) from kiln_ai.adapters.ollama_tools import ollama_online -from kiln_ai.adapters.prompt_builders import ( - BasePromptBuilder, - PromptId, - SimpleChainOfThoughtPromptBuilder, -) from kiln_ai.adapters.test_prompt_adaptors import get_all_models_and_providers -from kiln_ai.datamodel.run_config import RunConfig +from kiln_ai.datamodel import PromptId +from kiln_ai.datamodel.task import RunConfig from kiln_ai.datamodel.test_json_schema import json_joke_schema, json_triangle_schema diff --git a/libs/core/kiln_ai/adapters/prompt_builders.py b/libs/core/kiln_ai/adapters/prompt_builders.py index 9b53e13a..68f58c94 100644 --- a/libs/core/kiln_ai/adapters/prompt_builders.py +++ b/libs/core/kiln_ai/adapters/prompt_builders.py @@ -5,9 +5,8 @@ from pydantic import AfterValidator -from kiln_ai.datamodel import BasePrompt, Task, TaskRun +from kiln_ai.datamodel import PromptGenerators, PromptId, Task, TaskRun from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error -from kiln_ai.utils.formatting import snake_case class BasePromptBuilder(metaclass=ABCMeta): @@ -385,70 +384,6 @@ def chain_of_thought_prompt(self) -> str | None: return self.fine_tune_model.thinking_instructions -# Generators that can take any task and build a prompt -class PromptGenerators(str, Enum): - SIMPLE = 
"simple_prompt_builder" - MULTI_SHOT = "multi_shot_prompt_builder" - FEW_SHOT = "few_shot_prompt_builder" - REPAIRS = "repairs_prompt_builder" - SIMPLE_CHAIN_OF_THOUGHT = "simple_chain_of_thought_prompt_builder" - FEW_SHOT_CHAIN_OF_THOUGHT = "few_shot_chain_of_thought_prompt_builder" - MULTI_SHOT_CHAIN_OF_THOUGHT = "multi_shot_chain_of_thought_prompt_builder" - - -prompt_generator_values = [pg.value for pg in PromptGenerators] - - -# Our prompt ID can be one of: -# - A saved prompt ID -# - A fine-tune prompt ID -# - A prompt generator name -PromptId = Annotated[ - str, - AfterValidator(lambda v: _check_prompt_id(v)), -] -""" -A pydantic type that validates strings containing a valid prompt ID. -""" - - -def _check_prompt_id(id: str) -> str: - """ - Check that the prompt ID is valid. - """ - if id in prompt_generator_values: - return id - - if id.startswith("id::"): - # check it has 4 parts divided by :: -- 'id::project_id::task_id::prompt_id' - parts = id.split("::") - if len(parts) != 2 or len(parts[1]) == 0: - raise ValueError( - f"Invalid saved prompt ID: {id}. Expected format: 'id::[prompt_id]'." - ) - return id - - if id.startswith("eval_prompt::"): - # check it had a eval_id after the :: -- 'project_id::task_id::eval_id::eval_config_id' - parts = id.split("::") - if len(parts) != 5: - raise ValueError( - f"Invalid eval prompt ID: {id}. Expected format: 'eval_prompt::[project_id]::[task_id]::[eval_id]'." - ) - return id - - if id.startswith("fine_tune_prompt::"): - # check it had a fine_tune_id after the :: -- 'fine_tune_prompt::fine_tune_id' - fine_tune_id = id[18:] - if len(fine_tune_id) == 0: - raise ValueError( - f"Invalid fine-tune prompt ID: {id}. Expected format: 'fine_tune_prompt::[fine_tune_id]'." - ) - return id - - raise ValueError(f"Invalid prompt ID: {id}") - - # Our UI has some names that are not the same as the class names, which also hint parameters. def prompt_builder_from_id(prompt_id: PromptId, task: Task) -> BasePromptBuilder: """Convert a name used in the UI to the corresponding prompt builder class. 
diff --git a/libs/core/kiln_ai/adapters/test_prompt_adaptors.py b/libs/core/kiln_ai/adapters/test_prompt_adaptors.py index bd4188ed..c5f53324 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_adaptors.py +++ b/libs/core/kiln_ai/adapters/test_prompt_adaptors.py @@ -11,9 +11,9 @@ from kiln_ai.adapters.ollama_tools import ollama_online from kiln_ai.adapters.prompt_builders import ( BasePromptBuilder, - PromptId, SimpleChainOfThoughtPromptBuilder, ) +from kiln_ai.datamodel import PromptId def get_all_models_and_providers(): diff --git a/libs/core/kiln_ai/adapters/test_prompt_builders.py b/libs/core/kiln_ai/adapters/test_prompt_builders.py index 5e8a0c2a..5af63bdf 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_builders.py +++ b/libs/core/kiln_ai/adapters/test_prompt_builders.py @@ -14,8 +14,6 @@ FineTunePromptBuilder, MultiShotChainOfThoughtPromptBuilder, MultiShotPromptBuilder, - PromptGenerators, - PromptId, RepairsPromptBuilder, SavedPromptBuilder, SimpleChainOfThoughtPromptBuilder, @@ -31,6 +29,8 @@ FinetuneDataStrategy, Project, Prompt, + PromptGenerators, + PromptId, Task, TaskOutput, TaskOutputRating, @@ -589,86 +589,6 @@ def test_build_prompt_with_json_instructions(tmp_path): assert requirement.instruction in prompt_with_json -# Test model to validate the PromptId type -class TestModel(BaseModel): - prompt_id: PromptId - - -def test_valid_prompt_generator_names(): - """Test that valid prompt generator names are accepted""" - for generator in PromptGenerators: - model = TestModel(prompt_id=generator.value) - assert model.prompt_id == generator.value - - -def test_valid_saved_prompt_id(): - """Test that valid saved prompt IDs are accepted""" - valid_id = "id::prompt_789" - model = TestModel(prompt_id=valid_id) - assert model.prompt_id == valid_id - - -def test_valid_fine_tune_prompt_id(): - """Test that valid fine-tune prompt IDs are accepted""" - valid_id = "fine_tune_prompt::ft_123456" - model = TestModel(prompt_id=valid_id) - assert model.prompt_id == valid_id - - -@pytest.mark.parametrize( - "invalid_id", - [ - pytest.param("id::project_123::task_456", id="missing_prompt_id"), - pytest.param("id::task_456::prompt_789", id="too_many_parts"), - pytest.param("id::", id="empty_parts"), - ], -) -def test_invalid_saved_prompt_id_format(invalid_id): - """Test that invalid saved prompt ID formats are rejected""" - with pytest.raises(ValidationError, match="Invalid saved prompt ID"): - TestModel(prompt_id=invalid_id) - - -@pytest.mark.parametrize( - "invalid_id,expected_error", - [ - ("fine_tune_prompt::", "Invalid fine-tune prompt ID: fine_tune_prompt::"), - ("fine_tune_prompt", "Invalid prompt ID: fine_tune_prompt"), - ], -) -def test_invalid_fine_tune_prompt_id_format(invalid_id, expected_error): - """Test that invalid fine-tune prompt ID formats are rejected""" - with pytest.raises(ValidationError, match=expected_error): - TestModel(prompt_id=invalid_id) - - -def test_completely_invalid_formats(): - """Test that completely invalid formats are rejected""" - invalid_ids = [ - "", # Empty string - "invalid_format", # Random string - "id:wrong_format", # Almost correct but wrong separator - "fine_tune:wrong_format", # Almost correct but wrong prefix - ":::", # Just separators - ] - - for invalid_id in invalid_ids: - with pytest.raises(ValidationError, match="Invalid prompt ID"): - TestModel(prompt_id=invalid_id) - - -def test_prompt_generator_case_sensitivity(): - """Test that prompt generator names are case sensitive""" - # Take first generator and modify its case - first_generator = 
next(iter(PromptGenerators)).value - wrong_case = first_generator.upper() - if wrong_case == first_generator: - wrong_case = first_generator.lower() - - with pytest.raises(ValidationError): - TestModel(prompt_id=wrong_case) - - @pytest.fixture def valid_eval_config_datasource(): return DataSource( @@ -757,30 +677,3 @@ def test_eval_prompt_builder_validation_errors(tmp_path): ) with pytest.raises(ValueError, match="Eval config ID not found"): EvalPromptBuilder(task=task, eval_config_prompt_id=nonexistent_config) - - -@pytest.mark.parametrize( - "valid_id", - [ - "eval_prompt::project_123::task_456::eval_789::config_012", # Valid eval prompt ID - ], -) -def test_valid_eval_prompt_id(valid_id): - """Test that valid eval prompt IDs are accepted""" - model = TestModel(prompt_id=valid_id) - assert model.prompt_id == valid_id - - -@pytest.mark.parametrize( - "invalid_id,expected_error", - [ - ("eval_prompt::", "Invalid eval prompt ID"), - ("eval_prompt::p1::t1", "Invalid eval prompt ID"), - ("eval_prompt::p1::t1::e1", "Invalid eval prompt ID"), - ("eval_prompt::p1::t1::e1::c1::extra", "Invalid eval prompt ID"), - ], -) -def test_invalid_eval_prompt_id_format(invalid_id, expected_error): - """Test that invalid eval prompt ID formats are rejected""" - with pytest.raises(ValidationError, match=expected_error): - TestModel(prompt_id=invalid_id) diff --git a/libs/core/kiln_ai/datamodel/__init__.py b/libs/core/kiln_ai/datamodel/__init__.py index 09a33e51..f53f76ea 100644 --- a/libs/core/kiln_ai/datamodel/__init__.py +++ b/libs/core/kiln_ai/datamodel/__init__.py @@ -28,6 +28,11 @@ ) from kiln_ai.datamodel.project import Project from kiln_ai.datamodel.prompt import BasePrompt, Prompt +from kiln_ai.datamodel.prompt_id import ( + PromptGenerators, + PromptId, + prompt_generator_values, +) from kiln_ai.datamodel.task import Task, TaskRequirement from kiln_ai.datamodel.task_output import ( DataSource, @@ -66,4 +71,7 @@ "TaskOutputRating", "StructuredOutputMode", "FinetuneDataStrategy", + "PromptId", + "PromptGenerators", + "prompt_generator_values", ] diff --git a/libs/core/kiln_ai/datamodel/prompt_id.py b/libs/core/kiln_ai/datamodel/prompt_id.py new file mode 100644 index 00000000..4285aa00 --- /dev/null +++ b/libs/core/kiln_ai/datamodel/prompt_id.py @@ -0,0 +1,69 @@ +from enum import Enum +from typing import Annotated + +from pydantic import AfterValidator + + +# Generators that can take any task and build a prompt +class PromptGenerators(str, Enum): + SIMPLE = "simple_prompt_builder" + MULTI_SHOT = "multi_shot_prompt_builder" + FEW_SHOT = "few_shot_prompt_builder" + REPAIRS = "repairs_prompt_builder" + SIMPLE_CHAIN_OF_THOUGHT = "simple_chain_of_thought_prompt_builder" + FEW_SHOT_CHAIN_OF_THOUGHT = "few_shot_chain_of_thought_prompt_builder" + MULTI_SHOT_CHAIN_OF_THOUGHT = "multi_shot_chain_of_thought_prompt_builder" + + +prompt_generator_values = [pg.value for pg in PromptGenerators] + + +PromptId = Annotated[ + str, + AfterValidator(lambda v: _check_prompt_id(v)), +] +""" +A pydantic type that validates strings containing a valid prompt ID. + +Prompt IDs can be one of: +- A saved prompt ID +- A fine-tune prompt ID +- A prompt generator name +""" + + +def _check_prompt_id(id: str) -> str: + """ + Check that the prompt ID is valid. 
+ """ + if id in prompt_generator_values: + return id + + if id.startswith("id::"): + # check it has 4 parts divided by :: -- 'id::project_id::task_id::prompt_id' + parts = id.split("::") + if len(parts) != 2 or len(parts[1]) == 0: + raise ValueError( + f"Invalid saved prompt ID: {id}. Expected format: 'id::[prompt_id]'." + ) + return id + + if id.startswith("eval_prompt::"): + # check it had a eval_id after the :: -- 'project_id::task_id::eval_id::eval_config_id' + parts = id.split("::") + if len(parts) != 5: + raise ValueError( + f"Invalid eval prompt ID: {id}. Expected format: 'eval_prompt::[project_id]::[task_id]::[eval_id]'." + ) + return id + + if id.startswith("fine_tune_prompt::"): + # check it had a fine_tune_id after the :: -- 'fine_tune_prompt::fine_tune_id' + fine_tune_id = id[18:] + if len(fine_tune_id) == 0: + raise ValueError( + f"Invalid fine-tune prompt ID: {id}. Expected format: 'fine_tune_prompt::[fine_tune_id]'." + ) + return id + + raise ValueError(f"Invalid prompt ID: {id}") diff --git a/libs/core/kiln_ai/datamodel/run_config.py b/libs/core/kiln_ai/datamodel/run_config.py index da25907f..8007550e 100644 --- a/libs/core/kiln_ai/datamodel/run_config.py +++ b/libs/core/kiln_ai/datamodel/run_config.py @@ -1,58 +1,28 @@ from typing import TYPE_CHECKING, Union -from pydantic import BaseModel, Field, model_validator +from pydantic import Field, model_validator from typing_extensions import Self -from kiln_ai.adapters.prompt_builders import ( - BasePromptBuilder, - PromptGenerators, - PromptId, - prompt_builder_from_id, -) from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel -from kiln_ai.datamodel.task import Task if TYPE_CHECKING: - from kiln_ai.datamodel.task import Task + from kiln_ai.datamodel.task import RunConfig, Task -class RunConfig(BaseModel): +class TaskRunConfig(KilnParentedModel): """ - A configuration for running a task. + A Kiln model for persisting a run config in a Kiln Project, nested under a task. - This includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). - - For example: task, model, provider, prompt (ID, builder, etc), etc. - """ - - task: "Task" = Field(description="The task to run.") - model_name: str = Field(description="The model to use for this run config.") - model_provider_name: str = Field( - description="The provider to use for this run config." - ) - prompt_id: PromptId = Field( - description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.", - default=PromptGenerators.SIMPLE, - ) - - def prompt_builder(self) -> BasePromptBuilder: - return prompt_builder_from_id(self.prompt_id, self.task) - - -class TaskRunConfig(RunConfig, KilnParentedModel): - """ - A run config, parented to a Kiln Task. + Typically used to save a method of running a task for evaluation. A run config includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). - - Used for saving and sharing run configs in a Kiln Project. """ name: str = NAME_FIELD description: str | None = Field( default=None, description="The description of the task run config." ) - run_config: RunConfig = Field( + run_config: "RunConfig" = Field( description="The run config to use for this task run." 
) diff --git a/libs/core/kiln_ai/datamodel/task.py b/libs/core/kiln_ai/datamodel/task.py index 6af3dc4f..9e71f277 100644 --- a/libs/core/kiln_ai/datamodel/task.py +++ b/libs/core/kiln_ai/datamodel/task.py @@ -16,10 +16,12 @@ from kiln_ai.datamodel.eval import Eval from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str from kiln_ai.datamodel.prompt import Prompt +from kiln_ai.datamodel.prompt_id import PromptGenerators, PromptId from kiln_ai.datamodel.task_run import TaskRun if TYPE_CHECKING: from kiln_ai.datamodel.project import Project + from kiln_ai.datamodel.task import RunConfig class TaskRequirement(BaseModel): @@ -38,6 +40,26 @@ class TaskRequirement(BaseModel): type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star) +class RunConfig(BaseModel): + """ + A configuration for running a task. + + This includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). + + For example: task, model, provider, prompt, etc. + """ + + task: "Task" = Field(description="The task to run.") + model_name: str = Field(description="The model to use for this run config.") + model_provider_name: str = Field( + description="The provider to use for this run config." + ) + prompt_id: PromptId = Field( + description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.", + default=PromptGenerators.SIMPLE, + ) + + class Task( KilnParentedModel, KilnParentModel, @@ -47,6 +69,7 @@ class Task( "finetunes": Finetune, "prompts": Prompt, "evals": Eval, + # "run_configs": "RunConfig, }, ): """ diff --git a/libs/core/kiln_ai/datamodel/test_basemodel.py b/libs/core/kiln_ai/datamodel/test_basemodel.py index 2dc848d1..d93de053 100644 --- a/libs/core/kiln_ai/datamodel/test_basemodel.py +++ b/libs/core/kiln_ai/datamodel/test_basemodel.py @@ -15,7 +15,7 @@ string_to_valid_name, ) from kiln_ai.datamodel.model_cache import ModelCache -from kiln_ai.datamodel.run_config import RunConfig +from kiln_ai.datamodel.task import RunConfig @pytest.fixture diff --git a/libs/core/kiln_ai/datamodel/test_prompt_id.py b/libs/core/kiln_ai/datamodel/test_prompt_id.py new file mode 100644 index 00000000..4592e0c9 --- /dev/null +++ b/libs/core/kiln_ai/datamodel/test_prompt_id.py @@ -0,0 +1,116 @@ +import pytest +from pydantic import BaseModel, ValidationError + +from kiln_ai.datamodel import ( + DataSource, + DataSourceType, + PromptGenerators, + PromptId, +) + + +# Test model to validate the PromptId type +class ModelTester(BaseModel): + prompt_id: PromptId + + +def test_valid_prompt_generator_names(): + """Test that valid prompt generator names are accepted""" + for generator in PromptGenerators: + model = ModelTester(prompt_id=generator.value) + assert model.prompt_id == generator.value + + +def test_valid_saved_prompt_id(): + """Test that valid saved prompt IDs are accepted""" + valid_id = "id::prompt_789" + model = ModelTester(prompt_id=valid_id) + assert model.prompt_id == valid_id + + +def test_valid_fine_tune_prompt_id(): + """Test that valid fine-tune prompt IDs are accepted""" + valid_id = "fine_tune_prompt::ft_123456" + model = ModelTester(prompt_id=valid_id) + assert model.prompt_id == valid_id + + +@pytest.mark.parametrize( + "invalid_id", + [ + pytest.param("id::project_123::task_456", id="missing_prompt_id"), + pytest.param("id::task_456::prompt_789", id="too_many_parts"), + pytest.param("id::", 
id="empty_parts"), + ], +) +def test_invalid_saved_prompt_id_format(invalid_id): + """Test that invalid saved prompt ID formats are rejected""" + with pytest.raises(ValidationError, match="Invalid saved prompt ID"): + ModelTester(prompt_id=invalid_id) + + +@pytest.mark.parametrize( + "invalid_id,expected_error", + [ + ("fine_tune_prompt::", "Invalid fine-tune prompt ID: fine_tune_prompt::"), + ("fine_tune_prompt", "Invalid prompt ID: fine_tune_prompt"), + ], +) +def test_invalid_fine_tune_prompt_id_format(invalid_id, expected_error): + """Test that invalid fine-tune prompt ID formats are rejected""" + with pytest.raises(ValidationError, match=expected_error): + ModelTester(prompt_id=invalid_id) + + +def test_completely_invalid_formats(): + """Test that completely invalid formats are rejected""" + invalid_ids = [ + "", # Empty string + "invalid_format", # Random string + "id:wrong_format", # Almost correct but wrong separator + "fine_tune:wrong_format", # Almost correct but wrong prefix + ":::", # Just separators + ] + + for invalid_id in invalid_ids: + with pytest.raises(ValidationError, match="Invalid prompt ID"): + ModelTester(prompt_id=invalid_id) + + +def test_prompt_generator_case_sensitivity(): + """Test that prompt generator names are case sensitive""" + # Take first generator and modify its case + first_generator = next(iter(PromptGenerators)).value + wrong_case = first_generator.upper() + if wrong_case == first_generator: + wrong_case = first_generator.lower() + + with pytest.raises(ValidationError): + ModelTester(prompt_id=wrong_case) + + +@pytest.mark.parametrize( + "valid_id", + [ + "eval_prompt::project_123::task_456::eval_789::config_012", # Valid eval prompt ID + ], +) +def test_valid_eval_prompt_id(valid_id): + """Test that valid eval prompt IDs are accepted""" + model = ModelTester(prompt_id=valid_id) + assert model.prompt_id == valid_id + + +@pytest.mark.parametrize( + "invalid_id,expected_error", + [ + ("eval_prompt::", "Invalid eval prompt ID"), + ("eval_prompt::p1::t1", "Invalid eval prompt ID"), + ("eval_prompt::p1::t1::e1", "Invalid eval prompt ID"), + ("eval_prompt::p1::t1::e1::c1::extra", "Invalid eval prompt ID"), + ], +) +def test_invalid_eval_prompt_id_format(invalid_id, expected_error): + """Test that invalid eval prompt ID formats are rejected""" + with pytest.raises(ValidationError, match=expected_error): + ModelTester(prompt_id=invalid_id) diff --git a/libs/server/kiln_server/run_api.py b/libs/server/kiln_server/run_api.py index e0ae2826..13b25990 100644 --- a/libs/server/kiln_server/run_api.py +++ b/libs/server/kiln_server/run_api.py @@ -5,8 +5,13 @@ from fastapi import FastAPI, HTTPException from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.ml_model_list import ModelProviderName -from kiln_ai.adapters.prompt_builders import PromptId -from kiln_ai.datamodel import Task, TaskOutputRating, TaskOutputRatingType, TaskRun +from kiln_ai.datamodel import ( + PromptId, + Task, + TaskOutputRating, + TaskOutputRatingType, + TaskRun, +) from kiln_ai.datamodel.basemodel import ID_TYPE from pydantic import BaseModel, ConfigDict diff --git a/libs/server/kiln_server/test_prompt_api.py b/libs/server/kiln_server/test_prompt_api.py index a855af92..69a06dc0 100644 --- a/libs/server/kiln_server/test_prompt_api.py +++ b/libs/server/kiln_server/test_prompt_api.py @@ -3,8 +3,7 @@ import pytest from fastapi import FastAPI from fastapi.testclient import TestClient -from kiln_ai.adapters.prompt_builders import PromptGenerators -from 
kiln_ai.datamodel import Project, Prompt, Task +from kiln_ai.datamodel import Project, Prompt, PromptGenerators, Task from kiln_server.custom_errors import connect_custom_errors from kiln_server.prompt_api import _prompt_generators, connect_prompt_api From 33e60ba5e5d1d408f0a3203b89c48f0684dc28c4 Mon Sep 17 00:00:00 2001 From: scosman Date: Mon, 17 Feb 2025 20:24:35 -0500 Subject: [PATCH 022/102] Add tests, and refactor the run_config files --- libs/core/kiln_ai/datamodel/run_config.py | 45 ------- libs/core/kiln_ai/datamodel/task.py | 42 ++++++- libs/core/kiln_ai/datamodel/test_prompt_id.py | 2 - libs/core/kiln_ai/datamodel/test_task.py | 115 ++++++++++++++++++ 4 files changed, 154 insertions(+), 50 deletions(-) delete mode 100644 libs/core/kiln_ai/datamodel/run_config.py create mode 100644 libs/core/kiln_ai/datamodel/test_task.py diff --git a/libs/core/kiln_ai/datamodel/run_config.py b/libs/core/kiln_ai/datamodel/run_config.py deleted file mode 100644 index 8007550e..00000000 --- a/libs/core/kiln_ai/datamodel/run_config.py +++ /dev/null @@ -1,45 +0,0 @@ -from typing import TYPE_CHECKING, Union - -from pydantic import Field, model_validator -from typing_extensions import Self - -from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel - -if TYPE_CHECKING: - from kiln_ai.datamodel.task import RunConfig, Task - - -class TaskRunConfig(KilnParentedModel): - """ - A Kiln model for persisting a run config in a Kiln Project, nested under a task. - - Typically used to save a method of running a task for evaluation. - - A run config includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). - """ - - name: str = NAME_FIELD - description: str | None = Field( - default=None, description="The description of the task run config." - ) - run_config: "RunConfig" = Field( - description="The run config to use for this task run." - ) - - # Workaround to return typed parent without importing Task - def parent_task(self) -> Union["Task", None]: - if self.parent is None or self.parent.__class__.__name__ != "Task": - return None - return self.parent # type: ignore - - @model_validator(mode="after") - def validate_task(self) -> Self: - # Check that the task in the run config matches the parent task - parent_task = self.parent_task() - if parent_task is None: - raise ValueError("Run config must be parented to a task") - if self.run_config.task is None: - raise ValueError("Run config must have a task") - if self.run_config.task.id != parent_task.id: - raise ValueError("Run config task must match parent task") - return self diff --git a/libs/core/kiln_ai/datamodel/task.py b/libs/core/kiln_ai/datamodel/task.py index 9e71f277..1a44802f 100644 --- a/libs/core/kiln_ai/datamodel/task.py +++ b/libs/core/kiln_ai/datamodel/task.py @@ -1,6 +1,7 @@ from typing import TYPE_CHECKING, Dict, List, Union -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, model_validator +from typing_extensions import Self from kiln_ai.datamodel import Finetune from kiln_ai.datamodel.basemodel import ( @@ -21,7 +22,6 @@ if TYPE_CHECKING: from kiln_ai.datamodel.project import Project - from kiln_ai.datamodel.task import RunConfig class TaskRequirement(BaseModel): @@ -60,6 +60,42 @@ class RunConfig(BaseModel): ) +class TaskRunConfig(KilnParentedModel): + """ + A Kiln model for persisting a run config in a Kiln Project, nested under a task. 
+ + Typically used to save a method of running a task for evaluation. + + A run config includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). + """ + + name: str = NAME_FIELD + description: str | None = Field( + default=None, description="The description of the task run config." + ) + run_config: "RunConfig" = Field( + description="The run config to use for this task run." + ) + + # Workaround to return typed parent without importing Task + def parent_task(self) -> Union["Task", None]: + if self.parent is None or self.parent.__class__.__name__ != "Task": + return None + return self.parent # type: ignore + + @model_validator(mode="after") + def validate_task(self) -> Self: + # Check that the task in the run config matches the parent task + parent_task = self.parent_task() + if parent_task is None: + raise ValueError("Run config must be parented to a task") + if self.run_config.task is None: + raise ValueError("Run config must have a task") + if self.run_config.task.id != parent_task.id: + raise ValueError("Run config task must match parent task") + return self + + class Task( KilnParentedModel, KilnParentModel, @@ -69,7 +105,7 @@ class Task( "finetunes": Finetune, "prompts": Prompt, "evals": Eval, - # "run_configs": "RunConfig, + "run_configs": TaskRunConfig, }, ): """ diff --git a/libs/core/kiln_ai/datamodel/test_prompt_id.py b/libs/core/kiln_ai/datamodel/test_prompt_id.py index 4592e0c9..23cd1d3a 100644 --- a/libs/core/kiln_ai/datamodel/test_prompt_id.py +++ b/libs/core/kiln_ai/datamodel/test_prompt_id.py @@ -2,8 +2,6 @@ from pydantic import BaseModel, ValidationError from kiln_ai.datamodel import ( - DataSource, - DataSourceType, PromptGenerators, PromptId, ) diff --git a/libs/core/kiln_ai/datamodel/test_task.py b/libs/core/kiln_ai/datamodel/test_task.py new file mode 100644 index 00000000..c123fa8e --- /dev/null +++ b/libs/core/kiln_ai/datamodel/test_task.py @@ -0,0 +1,115 @@ +import pytest +from pydantic import ValidationError + +from kiln_ai.datamodel.prompt_id import PromptGenerators +from kiln_ai.datamodel.task import RunConfig, Task, TaskRunConfig + + +def test_runconfig_valid_creation(): + task = Task(id="task1", name="Test Task", instruction="Do something") + + config = RunConfig(task=task, model_name="gpt-4", model_provider_name="openai") + + assert config.task == task + assert config.model_name == "gpt-4" + assert config.model_provider_name == "openai" + assert config.prompt_id == PromptGenerators.SIMPLE # Check default value + + +def test_runconfig_missing_required_fields(): + with pytest.raises(ValidationError) as exc_info: + RunConfig() + + errors = exc_info.value.errors() + assert len(errors) == 3 # task, model_name, and model_provider_name are required + assert any(error["loc"][0] == "task" for error in errors) + assert any(error["loc"][0] == "model_name" for error in errors) + assert any(error["loc"][0] == "model_provider_name" for error in errors) + + +def test_runconfig_custom_prompt_id(): + task = Task(id="task1", name="Test Task", instruction="Do something") + + config = RunConfig( + task=task, + model_name="gpt-4", + model_provider_name="openai", + prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT, + ) + + assert config.prompt_id == PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT + + +@pytest.fixture +def sample_task(): + return Task(name="Test Task", instruction="Test instruction") + + +@pytest.fixture +def 
sample_run_config(sample_task): + return RunConfig(task=sample_task, model_name="gpt-4", model_provider_name="openai") + + +def test_task_run_config_valid_creation(sample_task, sample_run_config): + config = TaskRunConfig( + name="Test Config", + description="Test description", + run_config=sample_run_config, + parent=sample_task, + ) + + assert config.name == "Test Config" + assert config.description == "Test description" + assert config.run_config == sample_run_config + assert config.parent_task() == sample_task + + +def test_task_run_config_minimal_creation(sample_task, sample_run_config): + # Test creation with only required fields + config = TaskRunConfig( + name="Test Config", run_config=sample_run_config, parent=sample_task + ) + + assert config.name == "Test Config" + assert config.description is None + assert config.run_config == sample_run_config + + +def test_task_run_config_missing_required_fields(sample_task): + # Test missing name + with pytest.raises(ValidationError) as exc_info: + TaskRunConfig( + run_config=RunConfig( + task=sample_task, model_name="gpt-4", model_provider_name="openai" + ), + parent=sample_task, + ) + assert "Field required" in str(exc_info.value) + + # Test missing run_config + with pytest.raises(ValidationError) as exc_info: + TaskRunConfig(name="Test Config", parent=sample_task) + assert "Field required" in str(exc_info.value) + + +def test_task_run_config_task_mismatch(sample_task, sample_run_config): + # Create a different task + different_task = Task(name="Different Task", instruction="Different instruction") + + # Test run_config task different from parent task + with pytest.raises(ValueError, match="Run config task must match parent task"): + TaskRunConfig( + name="Test Config", run_config=sample_run_config, parent=different_task + ) + + +def test_task_run_config_missing_task_in_run_config(sample_task): + with pytest.raises( + ValidationError, match="Input should be a valid dictionary or instance of Task" + ): + # Create a run config without a task + RunConfig( + model_name="gpt-4", + model_provider_name="openai", + task=None, # type: ignore + ) From cc8daa85e62b35b97e999863208aa65cf427187f Mon Sep 17 00:00:00 2001 From: scosman Date: Tue, 18 Feb 2025 13:41:37 -0500 Subject: [PATCH 023/102] New eval_run data structure --- libs/core/kiln_ai/adapters/eval/base_eval.py | 4 +- libs/core/kiln_ai/datamodel/eval.py | 34 +++++++- .../core/kiln_ai/datamodel/test_eval_model.py | 83 ++++++++++++++++++- 3 files changed, 116 insertions(+), 5 deletions(-) diff --git a/libs/core/kiln_ai/adapters/eval/base_eval.py b/libs/core/kiln_ai/adapters/eval/base_eval.py index f28c0387..428b7e65 100644 --- a/libs/core/kiln_ai/adapters/eval/base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/base_eval.py @@ -5,7 +5,7 @@ from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.ml_model_list import ModelProviderName from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig -from kiln_ai.datamodel.eval import EvalConfig +from kiln_ai.datamodel.eval import EvalConfig, EvalScores from kiln_ai.datamodel.json_schema import string_to_json_key, validate_schema from kiln_ai.datamodel.task import Task, TaskOutputRatingType, TaskRun from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error @@ -40,7 +40,7 @@ def model_and_provider(self) -> tuple[str, ModelProviderName]: return model_name, ModelProviderName(provider) - async def run(self, input: Dict | str) -> Dict[str, float]: + async def run(self, input: Dict | str) -> EvalScores: 
run_adapter = adapter_for_task( self.target_task, # TODO: take these from evalRun diff --git a/libs/core/kiln_ai/datamodel/eval.py b/libs/core/kiln_ai/datamodel/eval.py index 4acb5baf..41534942 100644 --- a/libs/core/kiln_ai/datamodel/eval.py +++ b/libs/core/kiln_ai/datamodel/eval.py @@ -1,6 +1,6 @@ import json from enum import Enum -from typing import TYPE_CHECKING, Any, Union +from typing import TYPE_CHECKING, Any, Dict, Union from pydantic import Field, model_validator from typing_extensions import Self @@ -17,6 +17,8 @@ if TYPE_CHECKING: from kiln_ai.datamodel.task import Task +EvalScores = Dict[str, float] + class EvalState(str, Enum): enabled = "enabled" @@ -28,7 +30,35 @@ class EvalConfigType(str, Enum): llm_as_judge = "llm_as_judge" -class EvalConfig(KilnParentedModel): +class EvalRun(KilnParentedModel): + """ + The results of running an eval on a single dataset item, with a specific TaskRunConfig and EvalConfig. + """ + + dataset_id: ID_TYPE = Field( + description="The ID of the dataset item that was used for this run (we only use it's input). Must belong to the same Task as this eval." + ) + task_run_config_id: ID_TYPE = Field( + description="The ID of the TaskRunConfig that was run. Must belong to the same Task as this eval." + ) + # This may duplicate the dataset_id.input, but we're denormalizing intentionally. + input: str = Field( + description="The input to the task. JSON formatted for structured input, plaintext for unstructured input." + ) + output: str = Field( + description="The output of the task. JSON formatted for structured output, plaintext for unstructured output." + ) + scores: EvalScores = Field( + description="The scores of the evaluator (specifically the EvalConfig this object is a child of)." + ) + + def parent_eval_config(self) -> "EvalConfig": + if self.parent is None or self.parent.__class__.__name__ != "EvalConfig": + raise ValueError("parent must be an EvalConfig") + return self.parent # type: ignore + + +class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}): """ A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc. 
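EvalScores is just a metric-name-to-float dict, so summarizing a batch of EvalRun results is straightforward. A small illustrative helper; mean_scores is not part of the patch, only a sketch of how the shape might be consumed:

from collections import defaultdict

EvalScores = dict[str, float]


def mean_scores(all_scores: list[EvalScores]) -> EvalScores:
    # Average each metric across runs, skipping metrics a run did not report
    totals: dict[str, float] = defaultdict(float)
    counts: dict[str, int] = defaultdict(int)
    for scores in all_scores:
        for metric, value in scores.items():
            totals[metric] += value
            counts[metric] += 1
    return {metric: totals[metric] / counts[metric] for metric in totals}


print(mean_scores([{"accuracy": 0.95, "f1": 0.88}, {"accuracy": 0.85, "f1": 0.92}]))
# roughly {'accuracy': 0.9, 'f1': 0.9} (floating point)
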
diff --git a/libs/core/kiln_ai/datamodel/test_eval_model.py b/libs/core/kiln_ai/datamodel/test_eval_model.py index a9f5f9bf..eedab6a8 100644 --- a/libs/core/kiln_ai/datamodel/test_eval_model.py +++ b/libs/core/kiln_ai/datamodel/test_eval_model.py @@ -1,4 +1,5 @@ import pytest +from pydantic import ValidationError from kiln_ai.datamodel import BasePrompt from kiln_ai.datamodel.basemodel import KilnParentModel @@ -6,6 +7,7 @@ Eval, EvalConfig, EvalConfigType, + EvalRun, EvalState, ) from kiln_ai.datamodel.task import Task @@ -152,7 +154,7 @@ class DummyParent(KilnParentModel, parent_of={}): Eval(name="Test Eval", parent=DummyParent()) -def test_eval_with_configs(mock_task, valid_eval_config_data, tmp_path): +def test_eval_with_persisted_children(mock_task, valid_eval_config_data, tmp_path): task_path = tmp_path / "task.kiln" mock_task.path = task_path mock_task.save_to_file() @@ -164,6 +166,16 @@ def test_eval_with_configs(mock_task, valid_eval_config_data, tmp_path): config = EvalConfig(parent=eval, **valid_eval_config_data) config.save_to_file() + run = EvalRun( + parent=config, + dataset_id="dataset123", + task_run_config_id="config456", + input='{"key": "value"}', + output='{"result": "success"}', + scores={"accuracy": 0.95, "f1": 0.88}, + ) + run.save_to_file() + # Test configs can be retrieved from disk evals = mock_task.evals() assert len(evals) == 1 @@ -175,3 +187,72 @@ def test_eval_with_configs(mock_task, valid_eval_config_data, tmp_path): # and back up assert configs[0].parent_eval().parent_task().path == task_path + + # Test runs can be retrieved from disk + runs = configs[0].runs() + assert len(runs) == 1 + assert runs[0].dataset_id == "dataset123" + assert runs[0].task_run_config_id == "config456" + assert runs[0].input == '{"key": "value"}' + assert runs[0].output == '{"result": "success"}' + assert runs[0].scores == {"accuracy": 0.95, "f1": 0.88} + + # and back up + assert runs[0].parent_eval_config().parent_eval().parent_task().path == task_path + + +def test_eval_run_valid_creation(): + """Test creating an EvalRun with valid data""" + eval_run = EvalRun( + dataset_id="dataset123", + task_run_config_id="config456", + input='{"key": "value"}', # JSON formatted input + output='{"result": "success"}', # JSON formatted output + scores={"accuracy": 0.95, "f1": 0.88}, + ) + + assert eval_run.dataset_id == "dataset123" + assert eval_run.task_run_config_id == "config456" + assert eval_run.input == '{"key": "value"}' + assert eval_run.output == '{"result": "success"}' + assert eval_run.scores == {"accuracy": 0.95, "f1": 0.88} + + +def test_eval_run_plaintext(): + """Test creating an EvalRun with plaintext input/output""" + eval_run = EvalRun( + dataset_id="dataset123", + task_run_config_id="config456", + input="What is the capital of France?", + output="The capital of France is Paris.", + scores={"accuracy": 1.0}, + ) + + assert eval_run.input == "What is the capital of France?" + assert eval_run.output == "The capital of France is Paris." 
+ + +def test_eval_run_missing_required_fields(): + """Test that omitting required fields raises ValidationError""" + with pytest.raises(ValidationError) as exc_info: + EvalRun( + dataset_id="dataset123", + # missing task_run_config_id + input="test", + output="test", + scores={"score": 1.0}, + ) + + assert "task_run_config_id" in str(exc_info.value) + + +def test_eval_run_invalid_scores(): + """Test that scores must be a dict of floats""" + with pytest.raises(ValidationError): + EvalRun( + dataset_id="dataset123", + task_run_config_id="config456", + input="test", + output="test", + scores={"score": "not a float"}, # invalid score type + ) From 654239123b90707e56ff61090088e71df81373d4 Mon Sep 17 00:00:00 2001 From: scosman Date: Tue, 18 Feb 2025 15:43:31 -0500 Subject: [PATCH 024/102] Better pydantic typing for dataset filters, similar to promptIDs. Add a tag-based dataset filter. --- app/desktop/studio_server/finetune_api.py | 8 +- .../studio_server/test_finetune_api.py | 66 +++++----- app/web_ui/src/lib/api_schema.d.ts | 16 ++- .../[task_id]/create_finetune/+page.svelte | 3 +- .../core/kiln_ai/datamodel/dataset_filters.py | 114 ++++++++++++++++++ libs/core/kiln_ai/datamodel/dataset_split.py | 68 ++--------- .../kiln_ai/datamodel/test_dataset_filters.py | 71 +++++++++++ .../kiln_ai/datamodel/test_dataset_split.py | 34 +++++- 8 files changed, 268 insertions(+), 112 deletions(-) create mode 100644 libs/core/kiln_ai/datamodel/dataset_filters.py create mode 100644 libs/core/kiln_ai/datamodel/test_dataset_filters.py diff --git a/app/desktop/studio_server/finetune_api.py b/app/desktop/studio_server/finetune_api.py index 82744ed8..ad2e4b46 100644 --- a/app/desktop/studio_server/finetune_api.py +++ b/app/desktop/studio_server/finetune_api.py @@ -24,9 +24,11 @@ FineTuneStatusType, Task, ) +from kiln_ai.datamodel.dataset_filters import ( + DatasetFilterId, +) from kiln_ai.datamodel.dataset_split import ( AllSplitDefinition, - DatasetFilterType, Train60Test20Val20SplitDefinition, Train80Test10Val10SplitDefinition, Train80Test20SplitDefinition, @@ -73,7 +75,7 @@ class CreateDatasetSplitRequest(BaseModel): """Request to create a dataset split""" dataset_split_type: DatasetSplitType - filter_type: DatasetFilterType + filter_id: DatasetFilterId name: str | None = None description: str | None = None @@ -206,7 +208,7 @@ async def create_dataset_split( name, task, split_definitions, - filter_type=request.filter_type, + filter_id=request.filter_id, description=request.description, ) dataset_split.save_to_file() diff --git a/app/desktop/studio_server/test_finetune_api.py b/app/desktop/studio_server/test_finetune_api.py index 087e73a9..b86eeecf 100644 --- a/app/desktop/studio_server/test_finetune_api.py +++ b/app/desktop/studio_server/test_finetune_api.py @@ -15,21 +15,18 @@ Project, Task, ) +from kiln_ai.datamodel.dataset_filters import DatasetFilterId from kiln_ai.datamodel.dataset_split import ( - AllDatasetFilter, AllSplitDefinition, - HighRatingDatasetFilter, - ThinkingModelDatasetFilter, - ThinkingModelHighRatedFilter, Train60Test20Val20SplitDefinition, Train80Test10Val10SplitDefinition, Train80Test20SplitDefinition, ) +from pydantic import BaseModel from app.desktop.studio_server.finetune_api import ( CreateDatasetSplitRequest, CreateFinetuneRequest, - DatasetFilterType, DatasetSplitType, connect_fine_tune_api, thinking_instructions_from_request, @@ -281,9 +278,28 @@ def test_dataset_split_type_enum(): assert DatasetSplitType.ALL.value == "all" -def test_dataset_filter_type_enum(): - assert 
DatasetFilterType.ALL.value == "all" - assert DatasetFilterType.HIGH_RATING.value == "high_rating" +class ModelTester(BaseModel): + dataset_id: DatasetFilterId + + +# Check these stings from UI exist +@pytest.mark.parametrize( + "id,expect_error", + [ + ("all", False), + ("high_rating", False), + ("thinking_model", False), + ("thinking_model_high_rated", False), + ("invalid", True), + ], +) +def test_dataset_filter_ids(id, expect_error): + if expect_error: + with pytest.raises(ValueError): + ModelTester(dataset_id=id) + else: + model = ModelTester(dataset_id=id) + assert model.dataset_id == id def test_api_split_types_mapping(): @@ -303,22 +319,6 @@ def test_api_split_types_mapping(): assert split_type in api_split_types -def test_api_filter_types_mapping(): - from kiln_ai.datamodel.dataset_split import dataset_filters - - assert dataset_filters[DatasetFilterType.ALL] == AllDatasetFilter - assert dataset_filters[DatasetFilterType.HIGH_RATING] == HighRatingDatasetFilter - assert ( - dataset_filters[DatasetFilterType.THINKING_MODEL] == ThinkingModelDatasetFilter - ) - assert ( - dataset_filters[DatasetFilterType.THINKING_MODEL_HIGH_RATED] - == ThinkingModelHighRatedFilter - ) - for filter_type in DatasetFilterType: - assert filter_type in dataset_filters - - @pytest.fixture def mock_dataset_split(): split = DatasetSplit( @@ -342,7 +342,7 @@ def test_create_dataset_split( with mock_from_task as from_task_mock, mock_save as save_mock: request_data = { "dataset_split_type": "train_test", - "filter_type": "high_rating", + "filter_id": "high_rating", "name": "Test Split", "description": "Test description", } @@ -360,7 +360,7 @@ def test_create_dataset_split( mock_task_from_id_disk_backed.assert_called_once_with("project1", "task1") from_task_mock.assert_called_once() args, kwargs = from_task_mock.call_args - assert kwargs["filter_type"] == DatasetFilterType.HIGH_RATING + assert kwargs["filter_id"] == "high_rating" save_mock.assert_called_once() @@ -374,7 +374,7 @@ def test_create_dataset_split_auto_name( mock_save = unittest.mock.patch.object(DatasetSplit, "save_to_file") with mock_from_task as from_task_mock, mock_save as save_mock: - request_data = {"dataset_split_type": "train_test", "filter_type": "all"} + request_data = {"dataset_split_type": "train_test", "filter_id": "all"} response = client.post( "/api/projects/project1/tasks/task1/dataset_splits", json=request_data @@ -395,33 +395,31 @@ def test_create_dataset_split_request_validation(): # Test valid request request = CreateDatasetSplitRequest( dataset_split_type=DatasetSplitType.TRAIN_TEST, - filter_type=DatasetFilterType.ALL, + filter_id="all", name="Test Split", description="Test description", ) assert request.dataset_split_type == DatasetSplitType.TRAIN_TEST - assert request.filter_type == DatasetFilterType.ALL + assert request.filter_id == "all" assert request.name == "Test Split" assert request.description == "Test description" # Test optional fields request = CreateDatasetSplitRequest( dataset_split_type=DatasetSplitType.TRAIN_TEST, - filter_type=DatasetFilterType.ALL, + filter_id="all", ) assert request.name is None assert request.description is None # Test invalid dataset split type with pytest.raises(ValueError): - CreateDatasetSplitRequest( - dataset_split_type="invalid_type", filter_type=DatasetFilterType.ALL - ) + CreateDatasetSplitRequest(dataset_split_type="invalid_type", filter_id="all") # Test invalid filter type with pytest.raises(ValueError): CreateDatasetSplitRequest( - dataset_split_type=DatasetSplitType.TRAIN_TEST, 
filter_type="invalid_type" + dataset_split_type=DatasetSplitType.TRAIN_TEST, filter_id="invalid_type" ) diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index f88d2343..cd6ce7eb 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -685,7 +685,8 @@ export interface components { */ CreateDatasetSplitRequest: { dataset_split_type: components["schemas"]["DatasetSplitType"]; - filter_type: components["schemas"]["DatasetFilterType"]; + /** Filter Id */ + filter_id: string; /** Name */ name?: string | null; /** Description */ @@ -852,12 +853,6 @@ export interface components { * @enum {string} */ DataSourceType: "human" | "synthetic"; - /** - * DatasetFilterType - * @description Dataset filter names. - * @enum {string} - */ - DatasetFilterType: "all" | "high_rating" | "thinking_model" | "thinking_model_high_rated"; /** * DatasetSplit * @description A collection of task runs, with optional splits (train, test, validation). @@ -905,8 +900,11 @@ export interface components { split_contents: { [key: string]: string[]; }; - /** @description The filter used to build the dataset. */ - filter?: components["schemas"]["DatasetFilterType"] | null; + /** + * Filter + * @description The filter used to build the dataset. + */ + filter?: string | null; /** Model Type */ readonly model_type: string; }; diff --git a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte index 83064af0..1724638c 100644 --- a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte +++ b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte @@ -298,8 +298,7 @@ body: { // @ts-expect-error types are validated by the server dataset_split_type: new_dataset_split, - // @ts-expect-error types are validated by the server - filter_type: new_dataset_filter, + filter_id: new_dataset_filter, }, }, ) diff --git a/libs/core/kiln_ai/datamodel/dataset_filters.py b/libs/core/kiln_ai/datamodel/dataset_filters.py new file mode 100644 index 00000000..bbc69e9f --- /dev/null +++ b/libs/core/kiln_ai/datamodel/dataset_filters.py @@ -0,0 +1,114 @@ +from enum import Enum +from typing import Annotated, Protocol + +from pydantic import AfterValidator + +from kiln_ai.datamodel.task_run import TaskRun + + +class DatasetFilter(Protocol): + """A protocol defining the interface for dataset filters. + + This allows both stateless function-based filters and stateful class-based filters + to be used interchangeably, as long as they implement the __call__ method. + """ + + def __call__(self, task_run: TaskRun) -> bool: + """Return True if the task run should be included in the dataset.""" + ... 
+ + +def AllDatasetFilter(_: TaskRun) -> bool: + return True + + +def HighRatingDatasetFilter(task_run: TaskRun) -> bool: + if task_run.output is None: + return False + if task_run.repaired_output is not None: + # Repairs always considered high quality + return True + if task_run.output.rating is None: + return False + return task_run.output.rating.is_high_quality() + + +def ThinkingModelDatasetFilter(task_run: TaskRun) -> bool: + """ + A filter that returns True if the task has intermediate outputs we can training a 'thinking' model on (reasoning or chain of thought) + """ + return task_run.has_thinking_training_data() + + +def ThinkingModelHighRatedFilter(task_run: TaskRun) -> bool: + """ + A filter that returns True if the task has thinking data and the output is high quality + """ + return ThinkingModelDatasetFilter(task_run) and HighRatingDatasetFilter(task_run) + + +class TagFilter: + """ + A filter that returns True if the task has a tag matching the given tag. + """ + + def __init__(self, tag: str): + self.tag = tag + + def __call__(self, task_run: TaskRun) -> bool: + return self.tag in task_run.tags + + +class StaticDatasetFilters(str, Enum): + """Dataset filter names.""" + + ALL = "all" + HIGH_RATING = "high_rating" + THINKING_MODEL = "thinking_model" + THINKING_MODEL_HIGH_RATED = "thinking_model_high_rated" + + +static_dataset_filters = { + StaticDatasetFilters.ALL: AllDatasetFilter, + StaticDatasetFilters.HIGH_RATING: HighRatingDatasetFilter, + StaticDatasetFilters.THINKING_MODEL: ThinkingModelDatasetFilter, + StaticDatasetFilters.THINKING_MODEL_HIGH_RATED: ThinkingModelHighRatedFilter, +} + +DatasetFilterId = Annotated[ + str, + AfterValidator(lambda v: _check_dataset_filter_id(v)), +] +""" +A pydantic type that validates strings containing a valid dataset filter ID. + +Dataset filter IDs can be one of: +- A built-in dataset filter name +- A tag:: filter, where is a string +""" + + +def _check_dataset_filter_id(id: str) -> str: + """ + Check that the dataset filter ID is valid. + """ + if id in static_dataset_filters: + return id + + if id.startswith("tag::") and len(id) > 5: + return id + + raise ValueError(f"Invalid dataset filter ID: {id}") + + +def dataset_filter_from_id(id: DatasetFilterId) -> DatasetFilter: + """ + Get a dataset filter from an ID. + """ + if id.startswith("tag::") and len(id) > 5: + return TagFilter(id[5:]) + + if id in static_dataset_filters: + return static_dataset_filters[id] + + raise ValueError(f"Invalid dataset filter ID: {id}") diff --git a/libs/core/kiln_ai/datamodel/dataset_split.py b/libs/core/kiln_ai/datamodel/dataset_split.py index bb1c3833..00c88341 100644 --- a/libs/core/kiln_ai/datamodel/dataset_split.py +++ b/libs/core/kiln_ai/datamodel/dataset_split.py @@ -4,69 +4,21 @@ import math import random -from enum import Enum -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING from pydantic import BaseModel, Field, model_validator from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel -from kiln_ai.datamodel.task_run import TaskRun +from kiln_ai.datamodel.dataset_filters import ( + DatasetFilter, + DatasetFilterId, + dataset_filter_from_id, +) if TYPE_CHECKING: from kiln_ai.datamodel.task import Task -# A type alias that takes a TaskRun and returns a boolean indicating whether the task run should be included in the split. -# Several filters are defined below like AllDatasetFilter, HighRatingDatasetFilter, etc. 
-DatasetFilter = Callable[[TaskRun], bool] - - -def AllDatasetFilter(_: TaskRun) -> bool: - return True - - -def HighRatingDatasetFilter(task_run: TaskRun) -> bool: - if task_run.output is None: - return False - if task_run.repaired_output is not None: - # Repairs always considered high quality - return True - if task_run.output.rating is None: - return False - return task_run.output.rating.is_high_quality() - - -def ThinkingModelDatasetFilter(task_run: TaskRun) -> bool: - """ - A filter that returns True if the task has intermediate outputs we can training a 'thinking' model on (reasoning or chain of thought) - """ - return task_run.has_thinking_training_data() - - -def ThinkingModelHighRatedFilter(task_run: TaskRun) -> bool: - """ - A filter that returns True if the task has thinking data and the output is high quality - """ - return ThinkingModelDatasetFilter(task_run) and HighRatingDatasetFilter(task_run) - - -class DatasetFilterType(str, Enum): - """Dataset filter names.""" - - ALL = "all" - HIGH_RATING = "high_rating" - THINKING_MODEL = "thinking_model" - THINKING_MODEL_HIGH_RATED = "thinking_model_high_rated" - - -dataset_filters = { - DatasetFilterType.ALL: AllDatasetFilter, - DatasetFilterType.HIGH_RATING: HighRatingDatasetFilter, - DatasetFilterType.THINKING_MODEL: ThinkingModelDatasetFilter, - DatasetFilterType.THINKING_MODEL_HIGH_RATED: ThinkingModelHighRatedFilter, -} - - class DatasetSplitDefinition(BaseModel): """ A definition of a split in a dataset. @@ -126,7 +78,7 @@ class DatasetSplit(KilnParentedModel): split_contents: dict[str, list[str]] = Field( description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.", ) - filter: DatasetFilterType | None = Field( + filter: DatasetFilterId | None = Field( default=None, description="The filter used to build the dataset.", ) @@ -144,13 +96,13 @@ def from_task( name: str, task: "Task", splits: list[DatasetSplitDefinition], - filter_type: DatasetFilterType = DatasetFilterType.ALL, + filter_id: DatasetFilterId = "all", description: str | None = None, ): """ Build a dataset split from a task. 
""" - filter = dataset_filters[filter_type] + filter = dataset_filter_from_id(filter_id) split_contents = cls.build_split_contents(task, splits, filter) return cls( parent=task, @@ -158,7 +110,7 @@ def from_task( description=description, splits=splits, split_contents=split_contents, - filter=filter_type, + filter=filter_id, ) @classmethod diff --git a/libs/core/kiln_ai/datamodel/test_dataset_filters.py b/libs/core/kiln_ai/datamodel/test_dataset_filters.py new file mode 100644 index 00000000..43130f92 --- /dev/null +++ b/libs/core/kiln_ai/datamodel/test_dataset_filters.py @@ -0,0 +1,71 @@ +import pytest +from pydantic import BaseModel + +from kiln_ai.datamodel.dataset_filters import ( + AllDatasetFilter, + DatasetFilterId, + HighRatingDatasetFilter, + StaticDatasetFilters, + TagFilter, + ThinkingModelDatasetFilter, + ThinkingModelHighRatedFilter, + dataset_filter_from_id, +) + +# Note: Many more filter tests in test_dataset_split.py + + +def test_all_dataset_filter_from_id(): + assert dataset_filter_from_id("all") == AllDatasetFilter + + +def test_high_rating_dataset_filter_from_id(): + assert dataset_filter_from_id("high_rating") == HighRatingDatasetFilter + + +def test_thinking_model_dataset_filter_from_id(): + assert dataset_filter_from_id("thinking_model") == ThinkingModelDatasetFilter + + +def test_thinking_model_high_rated_dataset_filter_from_id(): + assert ( + dataset_filter_from_id("thinking_model_high_rated") + == ThinkingModelHighRatedFilter + ) + + +def test_all_static_dataset_filters(): + for filter_id in StaticDatasetFilters: + assert dataset_filter_from_id(filter_id) is not None + + +class ModelTester(BaseModel): + dsid: DatasetFilterId + + +@pytest.mark.parametrize( + "tag,expected_error,expected_tag", + [ + ("tag::test", False, "test"), + ("tag::other", False, "other"), + ("tag::", True, None), + ("tag", True, None), + ("", True, None), + ], +) +def test_tag_filter(tag, expected_error, expected_tag): + # Check our model validators + if expected_error: + with pytest.raises(ValueError): + ModelTester(dsid=tag) + else: + ModelTester(dsid=tag) + + # Check the constructor + if expected_tag is None: + with pytest.raises(ValueError, match="Invalid dataset filter ID:"): + dataset_filter_from_id(tag) + else: + filter = dataset_filter_from_id(tag) + assert isinstance(filter, TagFilter) + assert filter.tag == expected_tag diff --git a/libs/core/kiln_ai/datamodel/test_dataset_split.py b/libs/core/kiln_ai/datamodel/test_dataset_split.py index b00d5a8e..c3b92caa 100644 --- a/libs/core/kiln_ai/datamodel/test_dataset_split.py +++ b/libs/core/kiln_ai/datamodel/test_dataset_split.py @@ -14,14 +14,16 @@ TaskRun, ) from kiln_ai.datamodel.dataset_split import ( - AllDatasetFilter, AllSplitDefinition, - DatasetFilterType, + Train60Test20Val20SplitDefinition, + Train80Test20SplitDefinition, +) +from kiln_ai.datamodel.test_dataset_filters import ( + AllDatasetFilter, HighRatingDatasetFilter, + TagFilter, ThinkingModelDatasetFilter, ThinkingModelHighRatedFilter, - Train60Test20Val20SplitDefinition, - Train80Test20SplitDefinition, ) @@ -44,6 +46,7 @@ def sample_task_runs(sample_task): task_runs = [] for i in range(10): rating = 5 if i < 6 else 1 # 6 high, 4 low ratings + tags = ["tag1"] if i < 6 else [] task_run = TaskRun( parent=sample_task, input=f"input_{i}", @@ -61,6 +64,7 @@ def sample_task_runs(sample_task): value=rating, type=TaskOutputRatingType.five_star ), ), + tags=tags, ) task_run.save_to_file() task_runs.append(task_run) @@ -201,10 +205,10 @@ def 
test_dataset_split_with_high_rating_filter(sample_task, sample_task_runs): "Split Name", sample_task, Train80Test20SplitDefinition, - filter_type=DatasetFilterType.HIGH_RATING, + filter_id="high_rating", ) - assert dataset.filter == DatasetFilterType.HIGH_RATING + assert dataset.filter == "high_rating" # Check that only high-rated task runs are included all_ids = [] @@ -331,3 +335,21 @@ def test_thinking_model_dataset_filter_high_rated( ) assert ThinkingModelHighRatedFilter(task_run) is expected_result + + +def test_tag_dataset_filter(sample_task_runs): + num_tagged = 0 + num_untagged = 0 + filter = TagFilter("tag1") + for task_run in sample_task_runs: + if "tag1" in task_run.tags: + num_tagged += 1 + assert "tag1" in task_run.tags + assert filter(task_run) is True + else: + num_untagged += 1 + assert "tag1" not in task_run.tags + assert filter(task_run) is False + + assert num_tagged == 6 + assert num_untagged == 4 From de6dff7a2348717e36ce204642fe2fa5c76d9279 Mon Sep 17 00:00:00 2001 From: scosman Date: Tue, 18 Feb 2025 16:09:37 -0500 Subject: [PATCH 025/102] Add datasets to evals: 1 for evaluating the eval configs, which needs ratings 1 for running the eval --- .../core/kiln_ai/adapters/eval/test_g_eval.py | 7 ++++- .../kiln_ai/adapters/test_prompt_builders.py | 4 +++ libs/core/kiln_ai/datamodel/eval.py | 7 +++++ .../core/kiln_ai/datamodel/test_eval_model.py | 28 ++++++++++++++++--- 4 files changed, 41 insertions(+), 5 deletions(-) diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval.py b/libs/core/kiln_ai/adapters/eval/test_g_eval.py index 04a1fed7..36c6dd02 100644 --- a/libs/core/kiln_ai/adapters/eval/test_g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval.py @@ -47,7 +47,12 @@ def test_task(tmp_path): @pytest.fixture def test_eval_config(test_task): - eval = Eval(name="Joke Quality Eval", parent=test_task) + eval = Eval( + name="Joke Quality Eval", + parent=test_task, + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + ) eval.save_to_file() config = EvalConfig( diff --git a/libs/core/kiln_ai/adapters/test_prompt_builders.py b/libs/core/kiln_ai/adapters/test_prompt_builders.py index 5af63bdf..231c7330 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_builders.py +++ b/libs/core/kiln_ai/adapters/test_prompt_builders.py @@ -608,6 +608,8 @@ def test_eval_prompt_builder(tmp_path, valid_eval_config_datasource): eval = Eval( name="test_eval", parent=task, + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", ) eval.save_to_file() @@ -669,6 +671,8 @@ def test_eval_prompt_builder_validation_errors(tmp_path): eval = Eval( name="test_eval", parent=task, + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", ) eval.save_to_file() diff --git a/libs/core/kiln_ai/datamodel/eval.py b/libs/core/kiln_ai/datamodel/eval.py index 41534942..d882c845 100644 --- a/libs/core/kiln_ai/datamodel/eval.py +++ b/libs/core/kiln_ai/datamodel/eval.py @@ -11,6 +11,7 @@ KilnParentedModel, KilnParentModel, ) +from kiln_ai.datamodel.dataset_filters import DatasetFilterId from kiln_ai.datamodel.prompt import BasePrompt from kiln_ai.datamodel.task_output import DataSource, DataSourceType @@ -125,6 +126,12 @@ class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig} default=None, description="The id of the current config to use for this eval. 
This can be changed over time to run the same eval with different configs.", ) + eval_set_filter_id: DatasetFilterId = Field( + description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id." + ) + eval_configs_filter_id: DatasetFilterId = Field( + description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id." + ) # Workaround to return typed parent without importing Task def parent_task(self) -> Union["Task", None]: diff --git a/libs/core/kiln_ai/datamodel/test_eval_model.py b/libs/core/kiln_ai/datamodel/test_eval_model.py index eedab6a8..30ba6845 100644 --- a/libs/core/kiln_ai/datamodel/test_eval_model.py +++ b/libs/core/kiln_ai/datamodel/test_eval_model.py @@ -114,6 +114,8 @@ def test_eval_basic_properties(): description="Test Description", state=EvalState.enabled, current_config_id="config123", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", ) assert eval.name == "Test Eval" @@ -123,7 +125,11 @@ def test_eval_basic_properties(): def test_eval_default_values(): - eval = Eval(name="Test Eval") + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + ) assert eval.description is None assert eval.state == EvalState.enabled @@ -131,7 +137,12 @@ def test_eval_default_values(): def test_eval_parent_task_relationship(mock_task, valid_eval_config_data): - eval = Eval(name="Test Eval", parent=mock_task) + eval = Eval( + name="Test Eval", + parent=mock_task, + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + ) config = EvalConfig(parent=eval, **valid_eval_config_data) assert eval.parent_task() == mock_task @@ -141,7 +152,11 @@ def test_eval_parent_task_relationship(mock_task, valid_eval_config_data): def test_eval_parent_task_none(): - eval = Eval(name="Test Eval") + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + ) assert eval.parent_task() is None @@ -159,7 +174,12 @@ def test_eval_with_persisted_children(mock_task, valid_eval_config_data, tmp_pat mock_task.path = task_path mock_task.save_to_file() - eval = Eval(name="Test Eval", parent=mock_task) + eval = Eval( + name="Test Eval", + parent=mock_task, + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + ) eval.save_to_file() # Add config using the parent relationship From cfa6955acf03f2f3145837b9728b2e266ed2478b Mon Sep 17 00:00:00 2001 From: scosman Date: Tue, 18 Feb 2025 20:35:44 -0500 Subject: [PATCH 026/102] Add a fancy async evaluation runner. 
Not complete but checkpoint with tests --- libs/core/kiln_ai/adapters/eval/base_eval.py | 15 +- .../core/kiln_ai/adapters/eval/eval_runner.py | 148 ++++++++++++++++++ libs/core/kiln_ai/adapters/eval/g_eval.py | 17 +- .../kiln_ai/adapters/eval/test_eval_runner.py | 105 +++++++++++++ .../core/kiln_ai/adapters/eval/test_g_eval.py | 59 ++++--- libs/core/kiln_ai/datamodel/task.py | 36 ++++- libs/core/kiln_ai/datamodel/test_task.py | 33 ++-- 7 files changed, 351 insertions(+), 62 deletions(-) create mode 100644 libs/core/kiln_ai/adapters/eval/eval_runner.py create mode 100644 libs/core/kiln_ai/adapters/eval/test_eval_runner.py diff --git a/libs/core/kiln_ai/adapters/eval/base_eval.py b/libs/core/kiln_ai/adapters/eval/base_eval.py index 428b7e65..bfdcd2a4 100644 --- a/libs/core/kiln_ai/adapters/eval/base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/base_eval.py @@ -1,18 +1,17 @@ import json from abc import abstractmethod -from typing import Dict from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.ml_model_list import ModelProviderName from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig from kiln_ai.datamodel.eval import EvalConfig, EvalScores from kiln_ai.datamodel.json_schema import string_to_json_key, validate_schema -from kiln_ai.datamodel.task import Task, TaskOutputRatingType, TaskRun +from kiln_ai.datamodel.task import RunConfig, Task, TaskOutputRatingType, TaskRun from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error class BaseEval: - def __init__(self, eval_config: EvalConfig): + def __init__(self, eval_config: EvalConfig, run_config: RunConfig): self.eval_config = eval_config eval = eval_config.parent_eval() if not eval: @@ -23,6 +22,7 @@ def __init__(self, eval_config: EvalConfig): raise ValueError("Eval must have a parent task") self.target_task = task self.score_schema = BaseEval.build_score_schema(task, allow_float_scores=True) + self.run_config = run_config def model_and_provider(self) -> tuple[str, ModelProviderName]: model_name = self.eval_config.model.properties.get("model_name") @@ -40,12 +40,11 @@ def model_and_provider(self) -> tuple[str, ModelProviderName]: return model_name, ModelProviderName(provider) - async def run(self, input: Dict | str) -> EvalScores: + async def run(self, input: str) -> EvalScores: run_adapter = adapter_for_task( self.target_task, - # TODO: take these from evalRun - "llama_3_1_8b", - ModelProviderName.groq, + self.run_config.model_name, + ModelProviderName(self.run_config.model_provider_name), base_adapter_config=AdapterConfig(allow_saving=False), ) @@ -59,7 +58,7 @@ async def run(self, input: Dict | str) -> EvalScores: @abstractmethod # Runs the eval on the given task run and returns a dictionary of scores which should conform to the score schema - async def run_eval(self, task_run: TaskRun) -> Dict[str, float]: + async def run_eval(self, task_run: TaskRun) -> EvalScores: pass @classmethod diff --git a/libs/core/kiln_ai/adapters/eval/eval_runner.py b/libs/core/kiln_ai/adapters/eval/eval_runner.py new file mode 100644 index 00000000..02e8e520 --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/eval_runner.py @@ -0,0 +1,148 @@ +import asyncio +from dataclasses import dataclass +from typing import AsyncGenerator, List + +from kiln_ai.adapters.eval.base_eval import BaseEval +from kiln_ai.adapters.eval.registry import eval_adapter_from_type +from kiln_ai.datamodel.eval import EvalConfig +from kiln_ai.datamodel.task import TaskRunConfig +from kiln_ai.datamodel.task_run import TaskRun + + 
+@dataclass +class EvalJob: + item: TaskRun + task_run_config: TaskRunConfig + + +@dataclass +class EvalProgress: + complete: int | None = None + total: int | None = None + errors: int | None = None + + +class EvalRunner: + """ + Runs an eval. + + Specifically, runs a specific eval config on a list of task runs. + """ + + def __init__( + self, + eval_config: EvalConfig, + run_configs: List[TaskRunConfig], + ): + # confirm these are compatible + target_eval = eval_config.parent_eval() + if target_eval is None: + raise ValueError("Eval config requires a parent eval") + target_task = target_eval.parent_task() + if target_task is None: + raise ValueError("Eval config requires a (grand)parent task") + if len(run_configs) == 0: + raise ValueError("Eval config requires at least one run config") + + # confirm the run configs are for the target task + for run_config in run_configs: + parent_task = run_config.parent_task() + if parent_task is None: + raise ValueError("Each run config requires a parent task") + if parent_task.id != target_task.id: + raise ValueError( + "Run config is not for the same task as the eval config" + ) + + self.eval_config = eval_config + self.run_configs = run_configs + self.task = target_task + self.eval = target_eval + + def collect_tasks(self) -> List[EvalJob]: + return [] + + # return [ + # EvalJob(item=task_run, run_config=run_config) + # for task_run in self.task.runs() + # for run_config in self.run_configs + # ] + + async def run(self, concurrency: int = 25) -> AsyncGenerator[EvalProgress, None]: + """ + Runs the eval with parallel workers and yields progress updates. + """ + jobs = self.collect_tasks() + + complete = 0 + errors = 0 + total = len(jobs) + + # Send initial status + yield EvalProgress(complete=complete, total=total, errors=errors) + + worker_queue: asyncio.Queue[EvalJob] = asyncio.Queue() + for job in jobs: + worker_queue.put_nowait(job) + + # simple status queue to return progress. 
True=success, False=error + status_queue: asyncio.Queue[bool] = asyncio.Queue() + + workers = [] + for i in range(concurrency): + task = asyncio.create_task(self.run_worker(worker_queue, status_queue)) + workers.append(task) + + # Send status updates until workers are done, and they are all sent + while not status_queue.empty() or not all(worker.done() for worker in workers): + try: + # Use timeout to prevent hanging if all workers complete + # between our while condition check and get() + success = await asyncio.wait_for(status_queue.get(), timeout=0.1) + if success: + complete += 1 + else: + errors += 1 + + yield EvalProgress(complete=complete, total=total, errors=errors) + except asyncio.TimeoutError: + # Timeout is expected, just continue to recheck worker status + # Don't love this but beats sentinels for reliability + continue + + # These are redundant, but keeping them will catch async errors + await asyncio.gather(*workers) + await worker_queue.join() + + async def run_worker( + self, worker_queue: asyncio.Queue[EvalJob], status_queue: asyncio.Queue[bool] + ): + while True: + try: + job = worker_queue.get_nowait() + except asyncio.QueueEmpty: + # worker can end when the queue is empty + break + try: + success = await self.run_job(job) + await status_queue.put(success) + finally: + # Always mark the dequeued task as done, even on exceptions + worker_queue.task_done() + + async def run_job(self, job: EvalJob) -> bool: + try: + # Create the evaluator for this eval config/run config pair + evaluator = eval_adapter_from_type(self.eval_config.config_type)( + self.eval_config, job.task_run_config.run_config() + ) + if not isinstance(evaluator, BaseEval): + raise ValueError("Not able to create evaluator from eval config") + + result = await evaluator.run(job.item.input) + print(f"Result: {result}") + + return True + except Exception as e: + print(f"Error running job: {e}") + return False diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py index edbf534a..000cb150 100644 --- a/libs/core/kiln_ai/adapters/eval/g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -6,7 +6,8 @@ from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput from kiln_ai.adapters.prompt_builders import PromptGenerators from kiln_ai.datamodel import Project, Task, TaskRun -from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType +from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores +from kiln_ai.datamodel.task import RunConfig from openai.types.chat import ChatCompletionTokenLogprob # all the tokens we score for, and their float scores. @@ -74,7 +75,7 @@ class GEval(BaseEval): LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation. 
""" - def __init__(self, eval_config: EvalConfig): + def __init__(self, eval_config: EvalConfig, run_config: RunConfig): if ( eval_config.config_type != EvalConfigType.g_eval and eval_config.config_type != EvalConfigType.llm_as_judge @@ -83,11 +84,11 @@ def __init__(self, eval_config: EvalConfig): "GEval must be initialized with a GEval or LLM as Judge Config" ) - super().__init__(eval_config) + super().__init__(eval_config, run_config) self.geval_task = GEvalTask(eval_config, self.target_task) - async def run_eval(self, task_run: TaskRun) -> Dict[str, float]: + async def run_eval(self, task_run: TaskRun) -> EvalScores: """ Run this G-Eval on the given task run. """ @@ -131,12 +132,12 @@ async def run_eval(self, task_run: TaskRun) -> Dict[str, float]: else: return self.build_g_eval_score(run_output) - def build_llm_as_judge_score(self, run_output: RunOutput) -> Dict[str, float]: + def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores: """ Build the LLM as Judge score for the given run and run output. """ # Convert the output format we asked for (discreet values) to our float scores - scores: Dict[str, float] = {} + scores: EvalScores = {} if not isinstance(run_output.output, dict): raise ValueError("LLM as Judge output must be a dictionary") @@ -147,7 +148,7 @@ def build_llm_as_judge_score(self, run_output: RunOutput) -> Dict[str, float]: scores[metric] = token_score return scores - def build_g_eval_score(self, run_output: RunOutput) -> Dict[str, float]: + def build_g_eval_score(self, run_output: RunOutput) -> EvalScores: """ Build the G-Eval score for the given run and run output. @@ -174,7 +175,7 @@ def build_g_eval_score(self, run_output: RunOutput) -> Dict[str, float]: metrics: List[str] = list(outputs.keys()) metric_offsets = self.metric_offsets(raw_output, metrics) - final_scores: Dict[str, float] = {} + final_scores: EvalScores = {} for metric in metrics: score = self.g_eval_single_metric( run_output, metric, metric_offsets, raw_output diff --git a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py new file mode 100644 index 00000000..f0f07af1 --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py @@ -0,0 +1,105 @@ +from unittest.mock import AsyncMock + +import pytest +from kiln_ai.adapters.eval.eval_runner import EvalRunner +from kiln_ai.datamodel import BasePrompt, DataSource, DataSourceType, Task +from kiln_ai.datamodel.eval import Eval, EvalConfig +from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig + + +def test_asdf(): + assert 1 == 1 + + +@pytest.fixture +def mock_task(): + return Task( + name="test", + description="test", + instruction="do the thing", + ) + + +@pytest.fixture +def mock_eval(mock_task): + return Eval( + id="test", + name="test", + description="test", + eval_set_filter_id="all", + eval_configs_filter_id="all", + parent=mock_task, + ) + + +@pytest.fixture +def data_source(): + return DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt-4", + "model_provider": "openai", + "adapter_name": "langchain_adapter", + }, + ) + + +@pytest.fixture +def mock_eval_runner(mock_eval, data_source, mock_task): + return EvalRunner( + eval_config=EvalConfig( + name="test", + model=data_source, + parent=mock_eval, + prompt=BasePrompt( + name="test", + prompt="test", + ), + properties={ + "eval_steps": ["step1", "step2", "step3"], + }, + ), + run_configs=[ + TaskRunConfig( + name="test", + description="test", + 
run_config_properties=RunConfigProperties( + model_name="gpt-4", + model_provider_name="openai", + prompt_id="simple_prompt_builder", + ), + parent=mock_task, + ) + ], + ) + + +# Test with and without concurrency +@pytest.mark.parametrize("concurrency", [1, 25]) +@pytest.mark.asyncio +async def test_async_eval_runner_status_updates(mock_eval_runner, concurrency): + # Real async testing! + + job_count = 50 + # Job objects are not the right type, but since we're mocking run_job, it doesn't matter + jobs = [{} for _ in range(job_count)] + + # Mock collect_tasks to return our fake jobs + mock_eval_runner.collect_tasks = lambda: jobs + + # Mock run_job to return True immediately + mock_eval_runner.run_job = AsyncMock(return_value=True) + + # Expect the status updates in order, and 1 for each job + expected_compelted_count = 0 + async for progress in mock_eval_runner.run(concurrency=concurrency): + assert progress.complete == expected_compelted_count + expected_compelted_count += 1 + assert progress.errors == 0 + assert progress.total == job_count + + # Verify last status update was complete + assert expected_compelted_count == job_count + 1 + + # Verify run_job was called for each job + assert mock_eval_runner.run_job.call_count == job_count diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval.py b/libs/core/kiln_ai/adapters/eval/test_g_eval.py index 36c6dd02..9806479e 100644 --- a/libs/core/kiln_ai/adapters/eval/test_g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval.py @@ -17,6 +17,7 @@ TaskRun, ) from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType +from kiln_ai.datamodel.task import RunConfig @pytest.fixture @@ -86,6 +87,16 @@ def test_eval_config(test_task): return config +@pytest.fixture +def test_run_config(test_task): + return RunConfig( + model_name="llama_3_1_8b", + model_provider_name="groq", + prompt_id="simple_prompt_builder", + task=test_task, + ) + + @pytest.fixture def test_task_run(test_task): task_run = TaskRun( @@ -114,10 +125,12 @@ def test_task_run(test_task): "config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge] ) @pytest.mark.paid -async def test_run_g_eval(test_task, test_eval_config, test_task_run, config_type): +async def test_run_g_eval( + test_task, test_eval_config, test_task_run, config_type, test_run_config +): # Create G-Eval instance test_eval_config.config_type = config_type - g_eval = GEval(test_eval_config) + g_eval = GEval(test_eval_config, test_run_config) # Run the evaluation eval_result = await g_eval.run_eval(test_task_run) @@ -142,10 +155,12 @@ async def test_run_g_eval(test_task, test_eval_config, test_task_run, config_typ "config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge] ) @pytest.mark.paid -async def test_run_g_eval_e2e(test_task, test_eval_config, test_task_run, config_type): +async def test_run_g_eval_e2e( + test_task, test_eval_config, test_task_run, config_type, test_run_config +): # Create G-Eval instance test_eval_config.config_type = config_type - g_eval = GEval(test_eval_config) + g_eval = GEval(test_eval_config, test_run_config) # Run the evaluation eval_result = await g_eval.run("chickens") @@ -169,12 +184,14 @@ async def test_run_g_eval_e2e(test_task, test_eval_config, test_task_run, config assert 1.0 <= overall <= 5.0 -async def test_g_eval_logprobs(test_task, test_eval_config, test_task_run): +async def test_g_eval_logprobs( + test_task, test_eval_config, test_task_run, test_run_config +): # Create G-Eval instance run_output = pickle.loads(serialized_run_output) assert 
isinstance(run_output, RunOutput) assert run_output.output_logprobs is not None - g_eval = GEval(test_eval_config) + g_eval = GEval(test_eval_config, test_run_config) result = g_eval.build_g_eval_score(run_output) assert "overall_rating" in result @@ -204,11 +221,13 @@ async def test_g_eval_logprobs(test_task, test_eval_config, test_task_run): assert pytest.approx(appropriateness, 1e-12) != 1.0 -async def test_llm_as_judge(test_task, test_eval_config, test_task_run): +async def test_llm_as_judge( + test_task, test_eval_config, test_task_run, test_run_config +): # Create G-Eval instance, set to LLM as Judge run_output = pickle.loads(serialized_run_output) test_eval_config.config_type = EvalConfigType.llm_as_judge - g_eval = GEval(test_eval_config) + g_eval = GEval(test_eval_config, test_run_config) assert isinstance(run_output, RunOutput) assert run_output.output_logprobs is not None @@ -226,8 +245,10 @@ def test_token_case(): assert token.lower() == token -def test_metric_offsets_and_search_ranges(test_eval_config): - g_eval = GEval(test_eval_config) +def test_metric_offsets_and_search_ranges( + test_eval_config, test_run_config, test_task_run +): + g_eval = GEval(test_eval_config, test_run_config) raw_output = ( '{"topic_alignment": 4, "appropriateness": "pass", "overall_rating": 5}' ) @@ -258,8 +279,8 @@ def test_metric_offsets_and_search_ranges(test_eval_config): assert end == len(raw_output) # end of string -def test_metric_offsets_invalid(test_eval_config): - g_eval = GEval(test_eval_config) +def test_metric_offsets_invalid(test_eval_config, test_run_config): + g_eval = GEval(test_eval_config, test_run_config) raw_output = '{"topic_alignment": 4, "topic_alignment": 5}' metrics = ["topic_alignment"] @@ -300,13 +321,15 @@ def test_metric_offsets_invalid(test_eval_config): ("4.9999999", None), ], ) -def test_score_from_token_string(test_eval_config, token_string, expected_score): - g_eval = GEval(test_eval_config) +def test_score_from_token_string( + test_eval_config, token_string, expected_score, test_run_config +): + g_eval = GEval(test_eval_config, test_run_config) assert g_eval.score_from_token_string(token_string) == expected_score -def test_raw_output_from_logprobs(test_eval_config): - g_eval = GEval(test_eval_config) +def test_raw_output_from_logprobs(test_eval_config, test_run_config): + g_eval = GEval(test_eval_config, test_run_config) # Create a minimal RunOutput with some logprobs class MockLogprob: @@ -333,8 +356,8 @@ def __init__(self): assert raw == '{"score": 5}' -def test_rating_token_to_score(test_eval_config): - g_eval = GEval(test_eval_config) +def test_rating_token_to_score(test_eval_config, test_run_config): + g_eval = GEval(test_eval_config, test_run_config) class MockTopLogprob: def __init__(self, token, logprob): diff --git a/libs/core/kiln_ai/datamodel/task.py b/libs/core/kiln_ai/datamodel/task.py index 1a44802f..39dc228e 100644 --- a/libs/core/kiln_ai/datamodel/task.py +++ b/libs/core/kiln_ai/datamodel/task.py @@ -40,16 +40,13 @@ class TaskRequirement(BaseModel): type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star) -class RunConfig(BaseModel): +class RunConfigProperties(BaseModel): """ A configuration for running a task. - This includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). - - For example: task, model, provider, prompt, etc. 
+ This includes everything needed to run a task, except the input and task ID. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). """ - task: "Task" = Field(description="The task to run.") model_name: str = Field(description="The model to use for this run config.") model_provider_name: str = Field( description="The provider to use for this run config." @@ -60,6 +57,18 @@ class RunConfig(BaseModel): ) +class RunConfig(RunConfigProperties): + """ + A configuration for running a task. + + This includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). + + For example: task, model, provider, prompt, etc. + """ + + task: "Task" = Field(description="The task to run.") + + class TaskRunConfig(KilnParentedModel): """ A Kiln model for persisting a run config in a Kiln Project, nested under a task. @@ -73,8 +82,8 @@ class TaskRunConfig(KilnParentedModel): description: str | None = Field( default=None, description="The description of the task run config." ) - run_config: "RunConfig" = Field( - description="The run config to use for this task run." + run_config_properties: RunConfigProperties = Field( + description="The run config properties to use for this task run." ) # Workaround to return typed parent without importing Task @@ -83,9 +92,22 @@ def parent_task(self) -> Union["Task", None]: return None return self.parent # type: ignore + def run_config(self) -> RunConfig: + parent_task = self.parent_task() + if parent_task is None: + raise ValueError("Run config must be parented to a task") + return RunConfig( + task=parent_task, + model_name=self.run_config_properties.model_name, + model_provider_name=self.run_config_properties.model_provider_name, + prompt_id=self.run_config_properties.prompt_id, + ) + @model_validator(mode="after") def validate_task(self) -> Self: # Check that the task in the run config matches the parent task + return self + # TODO P0 parent_task = self.parent_task() if parent_task is None: raise ValueError("Run config must be parented to a task") diff --git a/libs/core/kiln_ai/datamodel/test_task.py b/libs/core/kiln_ai/datamodel/test_task.py index c123fa8e..333ef733 100644 --- a/libs/core/kiln_ai/datamodel/test_task.py +++ b/libs/core/kiln_ai/datamodel/test_task.py @@ -2,7 +2,7 @@ from pydantic import ValidationError from kiln_ai.datamodel.prompt_id import PromptGenerators -from kiln_ai.datamodel.task import RunConfig, Task, TaskRunConfig +from kiln_ai.datamodel.task import RunConfig, RunConfigProperties, Task, TaskRunConfig def test_runconfig_valid_creation(): @@ -46,40 +46,42 @@ def sample_task(): @pytest.fixture -def sample_run_config(sample_task): - return RunConfig(task=sample_task, model_name="gpt-4", model_provider_name="openai") +def sample_run_config_props(sample_task): + return RunConfigProperties(model_name="gpt-4", model_provider_name="openai") -def test_task_run_config_valid_creation(sample_task, sample_run_config): +def test_task_run_config_valid_creation(sample_task, sample_run_config_props): config = TaskRunConfig( name="Test Config", description="Test description", - run_config=sample_run_config, + run_config_properties=sample_run_config_props, parent=sample_task, ) assert config.name == "Test Config" assert config.description == "Test description" - assert config.run_config == sample_run_config + assert config.run_config_properties == 
sample_run_config_props assert config.parent_task() == sample_task -def test_task_run_config_minimal_creation(sample_task, sample_run_config): +def test_task_run_config_minimal_creation(sample_task, sample_run_config_props): # Test creation with only required fields config = TaskRunConfig( - name="Test Config", run_config=sample_run_config, parent=sample_task + name="Test Config", + run_config_properties=sample_run_config_props, + parent=sample_task, ) assert config.name == "Test Config" assert config.description is None - assert config.run_config == sample_run_config + assert config.run_config_properties == sample_run_config_props def test_task_run_config_missing_required_fields(sample_task): # Test missing name with pytest.raises(ValidationError) as exc_info: TaskRunConfig( - run_config=RunConfig( + run_config_properties=RunConfigProperties( task=sample_task, model_name="gpt-4", model_provider_name="openai" ), parent=sample_task, @@ -92,17 +94,6 @@ def test_task_run_config_missing_required_fields(sample_task): assert "Field required" in str(exc_info.value) -def test_task_run_config_task_mismatch(sample_task, sample_run_config): - # Create a different task - different_task = Task(name="Different Task", instruction="Different instruction") - - # Test run_config task different from parent task - with pytest.raises(ValueError, match="Run config task must match parent task"): - TaskRunConfig( - name="Test Config", run_config=sample_run_config, parent=different_task - ) - - def test_task_run_config_missing_task_in_run_config(sample_task): with pytest.raises( ValidationError, match="Input should be a valid dictionary or instance of Task" From cad36444f546f45696f92ffa698de1c64508a518 Mon Sep 17 00:00:00 2001 From: scosman Date: Wed, 19 Feb 2025 14:50:50 -0500 Subject: [PATCH 027/102] Add real task collection for eval runner. It's progressive: re-running will only run the needed jobs. If interupted or data is added to the dataset, just re-run the job. 
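The progressive behaviour described above boils down to set subtraction over (dataset item, run config) pairs: any pair that already has a saved eval run is skipped, so an interrupted or partially complete run can simply be restarted, and newly added dataset items get picked up on the next pass. A minimal, self-contained sketch of that idea, using illustrative names rather than the actual Kiln datamodel:

    from dataclasses import dataclass
    from itertools import product

    @dataclass(frozen=True)
    class SavedEvalRun:
        dataset_id: str
        task_run_config_id: str

    def pending_jobs(dataset_item_ids, run_config_ids, saved_runs):
        """Return only the (dataset item, run config) pairs with no saved result yet."""
        already_run = {(run.dataset_id, run.task_run_config_id) for run in saved_runs}
        return [
            pair
            for pair in product(dataset_item_ids, run_config_ids)
            if pair not in already_run
        ]

    # An interrupted run left one result behind; re-running yields only the missing pair.
    saved = [SavedEvalRun(dataset_id="item_1", task_run_config_id="cfg_a")]
    print(pending_jobs(["item_1", "item_2"], ["cfg_a"], saved))  # [('item_2', 'cfg_a')]
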
--- libs/core/kiln_ai/adapters/eval/base_eval.py | 5 +- .../core/kiln_ai/adapters/eval/eval_runner.py | 31 ++- .../kiln_ai/adapters/eval/test_eval_runner.py | 231 +++++++++++++++--- libs/core/kiln_ai/datamodel/eval.py | 3 + 4 files changed, 227 insertions(+), 43 deletions(-) diff --git a/libs/core/kiln_ai/adapters/eval/base_eval.py b/libs/core/kiln_ai/adapters/eval/base_eval.py index bfdcd2a4..9a3f843b 100644 --- a/libs/core/kiln_ai/adapters/eval/base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/base_eval.py @@ -40,7 +40,8 @@ def model_and_provider(self) -> tuple[str, ModelProviderName]: return model_name, ModelProviderName(provider) - async def run(self, input: str) -> EvalScores: + # TODO add test, nothing breaks if this returns a tuple + async def run(self, input: str) -> tuple[TaskRun, EvalScores]: run_adapter = adapter_for_task( self.target_task, self.run_config.model_name, @@ -54,7 +55,7 @@ async def run(self, input: str) -> EvalScores: eval_output = await self.run_eval(run_output) validate_schema(eval_output, self.score_schema) - return eval_output + return run_output, eval_output @abstractmethod # Runs the eval on the given task run and returns a dictionary of scores which should conform to the score schema diff --git a/libs/core/kiln_ai/adapters/eval/eval_runner.py b/libs/core/kiln_ai/adapters/eval/eval_runner.py index 02e8e520..73cb888c 100644 --- a/libs/core/kiln_ai/adapters/eval/eval_runner.py +++ b/libs/core/kiln_ai/adapters/eval/eval_runner.py @@ -4,6 +4,7 @@ from kiln_ai.adapters.eval.base_eval import BaseEval from kiln_ai.adapters.eval.registry import eval_adapter_from_type +from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id from kiln_ai.datamodel.eval import EvalConfig from kiln_ai.datamodel.task import TaskRunConfig from kiln_ai.datamodel.task_run import TaskRun @@ -60,13 +61,27 @@ def __init__( self.eval = target_eval def collect_tasks(self) -> List[EvalJob]: - return [] + """ + Collect all jobs for this run, excluding any that have already been run. - # return [ - # EvalJob(item=task_run, run_config=run_config) - # for task_run in self.task.runs() - # for run_config in self.run_configs - # ] + The tasks: + - should be in one of the eval filters: the eval filter (what's being evaluated) or the eval config filter (what's being evaluated to compare eval configs). 
+ - should not have already been run for this eval config + """ + config_filter = dataset_filter_from_id(self.eval.eval_configs_filter_id) + eval_filter = dataset_filter_from_id(self.eval.eval_set_filter_id) + + already_run = { + f"{run.dataset_id}::{run.task_run_config_id}" + for run in self.eval_config.runs(readonly=True) + } + return [ + EvalJob(item=task_run, task_run_config=run_config) + for task_run in self.task.runs(readonly=True) + if config_filter(task_run) or eval_filter(task_run) + for run_config in self.run_configs + if f"{task_run.id}::{run_config.id}" not in already_run + ] async def run(self, concurrency: int = 25) -> AsyncGenerator[EvalProgress, None]: """ @@ -139,8 +154,8 @@ async def run_job(self, job: EvalJob) -> bool: if not isinstance(evaluator, BaseEval): raise ValueError("Not able to create evaluator from eval config") - result = await evaluator.run(job.item.input) - print(f"Result: {result}") + task_run, scores = await evaluator.run(job.item.input) + print(f"Result: {task_run.id} {scores}") return True except Exception as e: diff --git a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py index f0f07af1..39d4e39b 100644 --- a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py +++ b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py @@ -2,27 +2,33 @@ import pytest from kiln_ai.adapters.eval.eval_runner import EvalRunner -from kiln_ai.datamodel import BasePrompt, DataSource, DataSourceType, Task -from kiln_ai.datamodel.eval import Eval, EvalConfig +from kiln_ai.datamodel import ( + BasePrompt, + DataSource, + DataSourceType, + Task, + TaskOutput, + TaskRun, +) +from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalRun from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig -def test_asdf(): - assert 1 == 1 - - @pytest.fixture -def mock_task(): - return Task( +def mock_task(tmp_path): + task = Task( name="test", description="test", instruction="do the thing", + path=tmp_path / "task.kiln", ) + task.save_to_file() + return task @pytest.fixture def mock_eval(mock_task): - return Eval( + eval = Eval( id="test", name="test", description="test", @@ -30,6 +36,8 @@ def mock_eval(mock_task): eval_configs_filter_id="all", parent=mock_task, ) + eval.save_to_file() + return eval @pytest.fixture @@ -45,32 +53,48 @@ def data_source(): @pytest.fixture -def mock_eval_runner(mock_eval, data_source, mock_task): - return EvalRunner( - eval_config=EvalConfig( +def mock_eval_config(mock_eval, data_source): + eval_config = EvalConfig( + name="test", + model=data_source, + parent=mock_eval, + prompt=BasePrompt( name="test", - model=data_source, - parent=mock_eval, - prompt=BasePrompt( - name="test", - prompt="test", - ), - properties={ - "eval_steps": ["step1", "step2", "step3"], - }, + prompt="test", + ), + properties={ + "eval_steps": ["step1", "step2", "step3"], + }, + ) + eval_config.save_to_file() + return eval_config + + +@pytest.fixture +def mock_run_config( + mock_task, +): + rc = TaskRunConfig( + name="test", + description="test", + run_config_properties=RunConfigProperties( + model_name="gpt-4", + model_provider_name="openai", + prompt_id="simple_prompt_builder", ), - run_configs=[ - TaskRunConfig( - name="test", - description="test", - run_config_properties=RunConfigProperties( - model_name="gpt-4", - model_provider_name="openai", - prompt_id="simple_prompt_builder", - ), - parent=mock_task, - ) - ], + parent=mock_task, + ) + rc.save_to_file() + return rc + + +@pytest.fixture +def mock_eval_runner( + 
mock_eval, data_source, mock_task, mock_eval_config, mock_run_config +): + return EvalRunner( + eval_config=mock_eval_config, + run_configs=[mock_run_config], ) @@ -103,3 +127,144 @@ async def test_async_eval_runner_status_updates(mock_eval_runner, concurrency): # Verify run_job was called for each job assert mock_eval_runner.run_job.call_count == job_count + + +def test_collect_tasks_filtering( + mock_eval_runner, mock_task, mock_eval_config, data_source +): + """Test that tasks are properly filtered based on eval filters""" + tags = ["tag1", "tag2", "tag3"] + task_runs = [] + for tag in tags: + # Create some task runs with different tags + task_run = TaskRun( + parent=mock_task, + input="test1", + input_source=data_source, + output=TaskOutput( + output="test1", + ), + tags=[tag], + ) + task_run.save_to_file() + task_runs.append(task_run) + + # Set up filters to only match tag1 + mock_eval_runner.eval.eval_set_filter_id = "tag::tag1" + mock_eval_runner.eval.eval_configs_filter_id = "tag::tag2" + + jobs = mock_eval_runner.collect_tasks() + + # Should only get task_run1 jobs + assert len(jobs) == 2 + ids = [job.item.id for job in jobs] + assert task_runs[0].id in ids + assert task_runs[1].id in ids + assert task_runs[2].id not in ids + + +def test_collect_tasks_excludes_already_run(mock_eval_runner, mock_task, data_source): + """Test that already run tasks are excluded""" + # Create a task run + task_run = TaskRun( + parent=mock_task, + input="test", + input_source=data_source, + tags=["tag1"], + output=TaskOutput( + output="test", + ), + ) + task_run.save_to_file() + + # Prior to any eval runs, we should get the task run + jobs = mock_eval_runner.collect_tasks() + assert len(jobs) == 1 + assert jobs[0].item.id == task_run.id + + # Create an eval run for this task + EvalRun( + parent=mock_eval_runner.eval_config, + dataset_id=task_run.id, + task_run_config_id=mock_eval_runner.run_configs[0].id, + input="test", + output="test", + scores={"score": 1.0}, + ).save_to_file() + + # Set filter to match the task + mock_eval_runner.eval.eval_set_filter_id = "tag::tag1" + mock_eval_runner.eval.eval_configs_filter_id = "tag::nonexistent" + + jobs = mock_eval_runner.collect_tasks() + + # Should get no jobs since the task was already run + assert len(jobs) == 0 + + +def test_collect_tasks_multiple_run_configs( + mock_eval_runner, mock_task, data_source, mock_run_config +): + """Test handling multiple run configs""" + # Create a task run + task_run = TaskRun( + parent=mock_task, + input="test", + input_source=data_source, + tags=["tag1"], + output=TaskOutput( + output="test", + ), + ) + task_run.save_to_file() + + # Add another run config + second_config = TaskRunConfig( + name="test2", + description="test2", + run_config_properties=RunConfigProperties( + model_name="gpt-3.5", + model_provider_name="openai", + prompt_id="simple_prompt_builder", + ), + parent=mock_task, + ) + second_config.save_to_file() + mock_eval_runner.run_configs.append(second_config) + + # Set filter to match the task + mock_eval_runner.eval.eval_set_filter_id = "tag::tag1" + + jobs = mock_eval_runner.collect_tasks() + + # Should get 2 jobs, one for each config + assert len(jobs) == 2 + assert {job.task_run_config.id for job in jobs} == { + second_config.id, + mock_run_config.id, + } + + +def test_collect_tasks_empty_cases(mock_eval_runner, mock_task, data_source): + """Test empty cases - no matching tasks or no tasks at all""" + # Set filter that won't match anything + mock_eval_runner.eval.eval_set_filter_id = "tag::nonexistent" + 
mock_eval_runner.eval.eval_configs_filter_id = "tag::nonexistent" + + jobs = mock_eval_runner.collect_tasks() + assert len(jobs) == 0 + + # Create task run with non-matching tag + task_run = TaskRun( + parent=mock_task, + input="test", + input_source=data_source, + tags=["other_tag"], + output=TaskOutput( + output="test", + ), + ) + task_run.save_to_file() + + jobs = mock_eval_runner.collect_tasks() + assert len(jobs) == 0 diff --git a/libs/core/kiln_ai/datamodel/eval.py b/libs/core/kiln_ai/datamodel/eval.py index d882c845..89edd610 100644 --- a/libs/core/kiln_ai/datamodel/eval.py +++ b/libs/core/kiln_ai/datamodel/eval.py @@ -83,6 +83,9 @@ def parent_eval(self) -> "Eval": raise ValueError("parent must be an Eval") return self.parent # type: ignore + def runs(self, readonly: bool = False) -> list[EvalRun]: + return super().runs(readonly=readonly) # type: ignore + @model_validator(mode="after") def validate_properties(self) -> Self: if ( From 240bc8c5e5f4894722f5334e03682fb7d88d123e Mon Sep 17 00:00:00 2001 From: scosman Date: Wed, 19 Feb 2025 15:29:57 -0500 Subject: [PATCH 028/102] Finalize the eval running, with a run_job method --- .../core/kiln_ai/adapters/eval/eval_runner.py | 18 ++- .../kiln_ai/adapters/eval/test_eval_runner.py | 103 +++++++++++++++++- 2 files changed, 115 insertions(+), 6 deletions(-) diff --git a/libs/core/kiln_ai/adapters/eval/eval_runner.py b/libs/core/kiln_ai/adapters/eval/eval_runner.py index 73cb888c..fd4eceb7 100644 --- a/libs/core/kiln_ai/adapters/eval/eval_runner.py +++ b/libs/core/kiln_ai/adapters/eval/eval_runner.py @@ -5,7 +5,7 @@ from kiln_ai.adapters.eval.base_eval import BaseEval from kiln_ai.adapters.eval.registry import eval_adapter_from_type from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id -from kiln_ai.datamodel.eval import EvalConfig +from kiln_ai.datamodel.eval import EvalConfig, EvalRun from kiln_ai.datamodel.task import TaskRunConfig from kiln_ai.datamodel.task_run import TaskRun @@ -154,10 +154,20 @@ async def run_job(self, job: EvalJob) -> bool: if not isinstance(evaluator, BaseEval): raise ValueError("Not able to create evaluator from eval config") - task_run, scores = await evaluator.run(job.item.input) - print(f"Result: {task_run.id} {scores}") + result_task_run, scores = await evaluator.run(job.item.input) + + # Save the job result + eval_run = EvalRun( + parent=self.eval_config, + task_run_config_id=job.task_run_config.id, + dataset_id=job.item.id, + scores=scores, + input=job.item.input, + output=result_task_run.output.output, + ) + eval_run.save_to_file() return True except Exception as e: - print(f"Error running job: {e}") + print(f"Error running eval job for dataset item {job.item.id}: {e}") return False diff --git a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py index 39d4e39b..1c9d621a 100644 --- a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py +++ b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py @@ -1,7 +1,8 @@ -from unittest.mock import AsyncMock +from unittest.mock import AsyncMock, patch import pytest -from kiln_ai.adapters.eval.eval_runner import EvalRunner +from kiln_ai.adapters.eval.base_eval import BaseEval +from kiln_ai.adapters.eval.eval_runner import EvalJob, EvalRunner from kiln_ai.datamodel import ( BasePrompt, DataSource, @@ -268,3 +269,101 @@ def test_collect_tasks_empty_cases(mock_eval_runner, mock_task, data_source): jobs = mock_eval_runner.collect_tasks() assert len(jobs) == 0 + + +@pytest.mark.asyncio +async def 
test_run_job_success( + mock_eval_runner, mock_task, data_source, mock_run_config +): + # Create a task run to evaluate + task_run = TaskRun( + parent=mock_task, + input="test input", + input_source=data_source, + output=TaskOutput(output="test output"), + ) + task_run.save_to_file() + + # Create eval job + job = EvalJob(item=task_run, task_run_config=mock_run_config) + + # Mock the evaluator + mock_result_run = TaskRun( + input="test input", + input_source=data_source, + output=TaskOutput(output="evaluated output"), + ) + mock_scores = {"accuracy": 0.95} + + class MockEvaluator(BaseEval): + async def run(self, input_text): + return mock_result_run, mock_scores + + with patch( + "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type", + return_value=lambda *args: MockEvaluator(*args), + ): + success = await mock_eval_runner.run_job(job) + + assert success is True + + # Verify eval run was saved + eval_runs = mock_eval_runner.eval_config.runs() + assert len(eval_runs) == 1 + saved_run = eval_runs[0] + assert saved_run.dataset_id == task_run.id + assert saved_run.task_run_config_id == mock_run_config.id + assert saved_run.scores == mock_scores + assert saved_run.input == "test input" + assert saved_run.output == "evaluated output" + + +@pytest.mark.asyncio +async def test_run_job_invalid_evaluator( + mock_eval_runner, mock_task, data_source, mock_run_config +): + task_run = TaskRun( + parent=mock_task, + input="test input", + input_source=data_source, + output=TaskOutput(output="test output"), + ) + task_run.save_to_file() + job = EvalJob(item=task_run, task_run_config=mock_run_config) + + # Return an invalid evaluator type + with patch( + "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type", + return_value=lambda *args: object(), + ): + success = await mock_eval_runner.run_job(job) + + assert success is False + assert len(mock_eval_runner.eval_config.runs()) == 0 + + +@pytest.mark.asyncio +async def test_run_job_evaluator_error( + mock_eval_runner, mock_task, data_source, mock_run_config +): + task_run = TaskRun( + parent=mock_task, + input="test input", + input_source=data_source, + output=TaskOutput(output="test output"), + ) + task_run.save_to_file() + job = EvalJob(item=task_run, task_run_config=mock_run_config) + + class ErrorEvaluator(BaseEval): + async def run(self, input_text): + raise ValueError("Evaluation failed") + + with patch( + "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type", + return_value=lambda *args: ErrorEvaluator(*args), + ): + success = await mock_eval_runner.run_job(job) + + assert success is False + assert len(mock_eval_runner.eval_config.runs()) == 0 From 45f7d8fe3b88d0fd34b0e3531ffab43837b3412e Mon Sep 17 00:00:00 2001 From: scosman Date: Wed, 19 Feb 2025 15:50:33 -0500 Subject: [PATCH 029/102] Add test --- libs/core/kiln_ai/adapters/eval/base_eval.py | 1 - .../kiln_ai/adapters/eval/test_base_eval.py | 82 ++++++++++++++++++- 2 files changed, 81 insertions(+), 2 deletions(-) diff --git a/libs/core/kiln_ai/adapters/eval/base_eval.py b/libs/core/kiln_ai/adapters/eval/base_eval.py index 9a3f843b..576b9add 100644 --- a/libs/core/kiln_ai/adapters/eval/base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/base_eval.py @@ -40,7 +40,6 @@ def model_and_provider(self) -> tuple[str, ModelProviderName]: return model_name, ModelProviderName(provider) - # TODO add test, nothing breaks if this returns a tuple async def run(self, input: str) -> tuple[TaskRun, EvalScores]: run_adapter = adapter_for_task( self.target_task, diff --git 
a/libs/core/kiln_ai/adapters/eval/test_base_eval.py b/libs/core/kiln_ai/adapters/eval/test_base_eval.py index 7772758d..276ce102 100644 --- a/libs/core/kiln_ai/adapters/eval/test_base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/test_base_eval.py @@ -2,7 +2,15 @@ import pytest from kiln_ai.adapters.eval.base_eval import BaseEval -from kiln_ai.datamodel.task import Task, TaskOutputRatingType, TaskRequirement +from kiln_ai.datamodel import BasePrompt, DataSource, DataSourceType +from kiln_ai.datamodel.eval import Eval, EvalConfig +from kiln_ai.datamodel.task import ( + RunConfigProperties, + Task, + TaskOutputRatingType, + TaskRequirement, + TaskRunConfig, +) def test_score_schema_five_star(): @@ -229,3 +237,75 @@ def test_score_schema_no_requirements(): # Should only have overall_rating assert len(schema["properties"]) == 1 assert "overall_rating" in schema["properties"] + + +class TestEval(BaseEval): + """Test implementation of BaseEval""" + + async def run_eval(self, task_run): + return {"overall_rating": 5, "quality": 4} + + +@pytest.mark.paid +@pytest.mark.asyncio +async def test_run_method(): + task = Task( + name="Test Task", + instruction="Test instruction", + requirements=[ + TaskRequirement( + name="Quality", + instruction="Rate quality", + type=TaskOutputRatingType.five_star, + ) + ], + ) + + eval_config = EvalConfig( + name="Test Eval Config", + model=DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt-4o", + "model_provider": "openai", + "adapter_name": "test", + }, + ), + parent=Eval( + name="Test Eval", + parent=task, + eval_set_filter_id="all", + eval_configs_filter_id="all", + ), + prompt=BasePrompt( + name="Test Prompt", + prompt="Test prompt", + ), + properties={"eval_steps": ["test_step"]}, + ) + + run_config = TaskRunConfig( + name="Test Run Config", + run_config_properties=RunConfigProperties( + model_name="llama_3_1_8b", + model_provider_name="groq", + prompt_id="simple_prompt_builder", + ), + parent=task, + ) + + evaluator = TestEval(eval_config, run_config.run_config()) + + # Run the evaluation + task_run, eval_scores = await evaluator.run("test input") + + # Verify task run was created + assert task_run.input == "test input" + assert isinstance(task_run.output.output, str) + + # Verify eval scores match schema and contain expected values + assert eval_scores["overall_rating"] == 5 + assert eval_scores["quality"] == 4 + + # Verify schema validation worked (these keys should exist per schema) + assert set(eval_scores.keys()) == {"overall_rating", "quality"} From 9de461116c28a03eb8f318e5e39f40ec5197033f Mon Sep 17 00:00:00 2001 From: scosman Date: Thu, 20 Feb 2025 11:11:25 -0500 Subject: [PATCH 030/102] Evals now define the scores it should produce! No binding to task requirements, although UI should make it easy to use them. 
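To illustrate the new shape (a minimal sketch based on the tests in this patch, not code taken from the diff itself): an Eval now declares its own list of EvalOutputScore definitions, and BaseEval.build_score_schema builds the judge's scoring schema from those definitions instead of from the task's requirements.

from kiln_ai.datamodel import TaskOutputRatingType
from kiln_ai.datamodel.eval import Eval, EvalOutputScore

eval = Eval(
    name="Tone Eval",
    eval_set_filter_id="tag::eval_set",
    eval_configs_filter_id="tag::golden",
    # Scores are declared on the eval itself; names must reduce to unique JSON keys.
    output_scores=[
        EvalOutputScore(
            name="Appropriateness",
            instruction="Check that the output is appropriate for the audience",
            type=TaskOutputRatingType.pass_fail,
        ),
        EvalOutputScore(
            name="Overall Rating",
            instruction="The overall rating for the task output",
            type=TaskOutputRatingType.five_star,
        ),
    ],
)

BaseEval.build_score_schema(eval, allow_float_scores=True) then yields a JSON schema with "appropriateness" and "overall_rating" properties, and EvalRun.validate_scores checks saved results against those same keys.
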
--- libs/core/kiln_ai/adapters/eval/base_eval.py | 52 +-- libs/core/kiln_ai/adapters/eval/g_eval.py | 20 +- .../kiln_ai/adapters/eval/test_base_eval.py | 243 ++++++------ .../kiln_ai/adapters/eval/test_eval_runner.py | 12 +- .../core/kiln_ai/adapters/eval/test_g_eval.py | 32 +- .../kiln_ai/adapters/test_prompt_builders.py | 14 +- libs/core/kiln_ai/datamodel/eval.py | 131 ++++++- .../core/kiln_ai/datamodel/test_eval_model.py | 345 +++++++++++++++++- 8 files changed, 664 insertions(+), 185 deletions(-) diff --git a/libs/core/kiln_ai/adapters/eval/base_eval.py b/libs/core/kiln_ai/adapters/eval/base_eval.py index 576b9add..cd4f9147 100644 --- a/libs/core/kiln_ai/adapters/eval/base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/base_eval.py @@ -4,7 +4,7 @@ from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.ml_model_list import ModelProviderName from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig -from kiln_ai.datamodel.eval import EvalConfig, EvalScores +from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores from kiln_ai.datamodel.json_schema import string_to_json_key, validate_schema from kiln_ai.datamodel.task import RunConfig, Task, TaskOutputRatingType, TaskRun from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error @@ -21,7 +21,7 @@ def __init__(self, eval_config: EvalConfig, run_config: RunConfig): if not task: raise ValueError("Eval must have a parent task") self.target_task = task - self.score_schema = BaseEval.build_score_schema(task, allow_float_scores=True) + self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True) self.run_config = run_config def model_and_provider(self) -> tuple[str, ModelProviderName]: @@ -62,7 +62,7 @@ async def run_eval(self, task_run: TaskRun) -> EvalScores: pass @classmethod - def build_score_schema(cls, task: Task, allow_float_scores: bool = False) -> str: + def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str: """ Build a JSON schema for the scoring output of the task requirements @@ -74,20 +74,17 @@ def build_score_schema(cls, task: Task, allow_float_scores: bool = False) -> str # Note: python maintains order, which is good as we want the user defined order, and overall last properties = {} - for requirement in task.requirements: - property_key = string_to_json_key(requirement.name) - if property_key in properties or property_key == "overall_rating": - raise ValueError( - f"Duplicate requirement name: {requirement.name}. Can not be used as unique JSON schema key." - ) - if len(property_key) == 0: + for output_score in eval.output_scores: + output_score_json_key = output_score.json_key() + + if len(output_score_json_key) == 0: raise ValueError( - f"Invalid requirement name: {requirement.name}. Can not be used as JSON schema key." + f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key." ) property: dict[str, str | int | float | list[str] | list[int]] = { - "title": requirement.name, + "title": output_score.name, } - match requirement.type: + match output_score.type: case TaskOutputRatingType.five_star: if allow_float_scores: property["type"] = "number" @@ -97,7 +94,7 @@ def build_score_schema(cls, task: Task, allow_float_scores: bool = False) -> str property["enum"] = [1, 2, 3, 4, 5] property["description"] = ( - f"{requirement.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best." 
+ f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best." ) case TaskOutputRatingType.pass_fail: if allow_float_scores: @@ -105,12 +102,12 @@ def build_score_schema(cls, task: Task, allow_float_scores: bool = False) -> str property["minimum"] = 0 property["maximum"] = 1 property["description"] = ( - f"{requirement.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass." + f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass." ) else: property["enum"] = ["pass", "fail"] property["description"] = ( - f"{requirement.instruction}\n\nThe rating should be either 'pass' or 'fail'." + f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'." ) case TaskOutputRatingType.pass_fail_critical: if allow_float_scores: @@ -118,35 +115,20 @@ def build_score_schema(cls, task: Task, allow_float_scores: bool = False) -> str property["minimum"] = -1 property["maximum"] = 1 property["description"] = ( - f"{requirement.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)." + f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)." ) else: property["enum"] = ["pass", "fail", "critical"] property["description"] = ( - f"{requirement.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure." + f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure." ) case TaskOutputRatingType.custom: # Skip custom rating types in evals continue case _: - raise_exhaustive_enum_error(requirement.type) + raise_exhaustive_enum_error(output_score.type) - properties[property_key] = property - - if allow_float_scores: - properties["overall_rating"] = { - "type": "number", - "minimum": 1, - "maximum": 5, - "title": "Overall Rating", - "description": "The overall rating for the task output.\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best.", - } - else: - properties["overall_rating"] = { - "enum": [1, 2, 3, 4, 5], - "title": "Overall Rating", - "description": "The overall rating for the task output.\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best.", - } + properties[output_score_json_key] = property schema = { "type": "object", diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py index 000cb150..f0a12d02 100644 --- a/libs/core/kiln_ai/adapters/eval/g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -6,7 +6,7 @@ from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput from kiln_ai.adapters.prompt_builders import PromptGenerators from kiln_ai.datamodel import Project, Task, TaskRun -from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores +from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType, EvalScores from kiln_ai.datamodel.task import RunConfig from openai.types.chat import ChatCompletionTokenLogprob @@ -30,8 +30,7 @@ class GEvalTask(Task, parent_of={}): Note G-Eval implements both G-Eval and LLM as Judge as they are very similar. """ - def __init__(self, eval_config: EvalConfig, target_task: Task): - # This keep the typechecker happy. 
TODO: shouldn't need this or parent_of above. + def __init__(self, eval_config: EvalConfig): tmp_project = Project(name="GEval") system_instruction = f""" @@ -51,11 +50,14 @@ def __init__(self, eval_config: EvalConfig, target_task: Task): for i, step in enumerate(steps): cot_instructions += f"{i + 1}) {step}\n" - # We restrict the LLM scoring to integer scores (see later logprob calculation, which requires integer scores) - # However, the overall score we output can be a float. - output_schema = BaseEval.build_score_schema( - target_task, allow_float_scores=False - ) + eval = eval_config.parent_eval() + if not eval: + raise ValueError("Eval config must have a parent eval") + + # Build the output schema from the eval's target output scores. + # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False + # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires integer scores) + output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False) super().__init__( name="GEval Task", @@ -86,7 +88,7 @@ def __init__(self, eval_config: EvalConfig, run_config: RunConfig): super().__init__(eval_config, run_config) - self.geval_task = GEvalTask(eval_config, self.target_task) + self.geval_task = GEvalTask(eval_config) async def run_eval(self, task_run: TaskRun) -> EvalScores: """ diff --git a/libs/core/kiln_ai/adapters/eval/test_base_eval.py b/libs/core/kiln_ai/adapters/eval/test_base_eval.py index 276ce102..ecda6ef7 100644 --- a/libs/core/kiln_ai/adapters/eval/test_base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/test_base_eval.py @@ -3,7 +3,7 @@ import pytest from kiln_ai.adapters.eval.base_eval import BaseEval from kiln_ai.datamodel import BasePrompt, DataSource, DataSourceType -from kiln_ai.datamodel.eval import Eval, EvalConfig +from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore from kiln_ai.datamodel.task import ( RunConfigProperties, Task, @@ -14,32 +14,38 @@ def test_score_schema_five_star(): - # Create a task with a five-star requirement - task = Task( - name="Test Task", - instruction="Test instruction", - requirements=[ - TaskRequirement( + # Create an eval with a five-star score + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( name="Quality Score", instruction="Rate the quality", type=TaskOutputRatingType.five_star, - ) + ), + EvalOutputScore( + name="Overall Rating", + instruction="The overall rating for the task output", + type=TaskOutputRatingType.five_star, + ), ], ) - schema_str = BaseEval.build_score_schema(task) + schema_str = BaseEval.build_score_schema(eval) schema = json.loads(schema_str) # Check basic schema structure assert schema["type"] == "object" assert schema["required"] == ["quality_score", "overall_rating"] - # Check requirement property, and that it's an enum of 1-5 - req_prop = schema["properties"]["quality_score"] - assert req_prop["enum"] == [1, 2, 3, 4, 5] - assert "Quality Score" in req_prop["title"] - assert "Rate the quality" in req_prop["description"] - assert "between 1 and 5" in req_prop["description"] + # Check score property, and that it's an enum of 1-5 + score_prop = schema["properties"]["quality_score"] + assert score_prop["enum"] == [1, 2, 3, 4, 5] + assert "Quality Score" in score_prop["title"] + assert "Rate the quality" in score_prop["description"] + assert "between 1 and 5" in score_prop["description"] # Check overall 
rating property, and that it's an enum of 1-5 assert "overall_rating" in schema["properties"] @@ -51,34 +57,40 @@ def test_score_schema_five_star(): def test_score_schema_five_star_float(): - # Create a task with a five-star requirement - task = Task( - name="Test Task", - instruction="Test instruction", - requirements=[ - TaskRequirement( + # Create an eval with a five-star score + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( name="Quality Score", instruction="Rate the quality", type=TaskOutputRatingType.five_star, - ) + ), + EvalOutputScore( + name="Overall Rating", + instruction="The overall rating for the task output", + type=TaskOutputRatingType.five_star, + ), ], ) - schema_str = BaseEval.build_score_schema(task, allow_float_scores=True) + schema_str = BaseEval.build_score_schema(eval, allow_float_scores=True) schema = json.loads(schema_str) # Check basic schema structure assert schema["type"] == "object" assert schema["required"] == ["quality_score", "overall_rating"] - # Check requirement property - req_prop = schema["properties"]["quality_score"] - assert req_prop["type"] == "number" - assert req_prop["minimum"] == 1 - assert req_prop["maximum"] == 5 - assert "Quality Score" in req_prop["title"] - assert "Rate the quality" in req_prop["description"] - assert "between 1 and 5" in req_prop["description"] + # Check score property + score_prop = schema["properties"]["quality_score"] + assert score_prop["type"] == "number" + assert score_prop["minimum"] == 1 + assert score_prop["maximum"] == 5 + assert "Quality Score" in score_prop["title"] + assert "Rate the quality" in score_prop["description"] + assert "between 1 and 5" in score_prop["description"] # Check overall rating property assert "overall_rating" in schema["properties"] @@ -92,101 +104,119 @@ def test_score_schema_five_star_float(): def test_score_schema_pass_fail(): - task = Task( - name="Test Task", - instruction="Test instruction", - requirements=[ - TaskRequirement( + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( name="Pass Fail Test", instruction="Check if it passes", type=TaskOutputRatingType.pass_fail, - ) + ), + EvalOutputScore( + name="Overall Rating", + instruction="The overall rating for the task output", + type=TaskOutputRatingType.five_star, + ), ], ) - schema_str = BaseEval.build_score_schema(task) + schema_str = BaseEval.build_score_schema(eval) schema = json.loads(schema_str) - req_prop = schema["properties"]["pass_fail_test"] - assert req_prop["enum"] == ["pass", "fail"] - assert "Pass Fail Test" in req_prop["title"] - assert "Check if it passes" in req_prop["description"] - assert "'pass' or 'fail'" in req_prop["description"] + score_prop = schema["properties"]["pass_fail_test"] + assert score_prop["enum"] == ["pass", "fail"] + assert "Pass Fail Test" in score_prop["title"] + assert "Check if it passes" in score_prop["description"] + assert "'pass' or 'fail'" in score_prop["description"] assert schema["properties"]["overall_rating"] is not None # Now check that we can allow float scores with the proper float structure - schema_str = BaseEval.build_score_schema(task, allow_float_scores=True) + schema_str = BaseEval.build_score_schema(eval, allow_float_scores=True) schema = json.loads(schema_str) - req_prop = schema["properties"]["pass_fail_test"] - assert req_prop["type"] == "number" - assert req_prop["minimum"] == 0 - 
assert req_prop["maximum"] == 1 + score_prop = schema["properties"]["pass_fail_test"] + assert score_prop["type"] == "number" + assert score_prop["minimum"] == 0 + assert score_prop["maximum"] == 1 assert ( "between 0 and 1, with 0 being a failure and 1 being a pass" - in req_prop["description"] + in score_prop["description"] ) def test_score_schema_pass_fail_critical(): - task = Task( - name="Test Task", - instruction="Test instruction", - requirements=[ - TaskRequirement( + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( name="Critical Test", instruction="Check for critical issues", type=TaskOutputRatingType.pass_fail_critical, - ) + ), + EvalOutputScore( + name="Overall Rating", + instruction="The overall rating for the task output", + type=TaskOutputRatingType.five_star, + ), ], ) - schema_str = BaseEval.build_score_schema(task) + schema_str = BaseEval.build_score_schema(eval) schema = json.loads(schema_str) - req_prop = schema["properties"]["critical_test"] - assert "enum" in req_prop - assert req_prop["enum"] == ["pass", "fail", "critical"] - assert "'pass', 'fail', or 'critical'" in req_prop["description"] + score_prop = schema["properties"]["critical_test"] + assert "enum" in score_prop + assert score_prop["enum"] == ["pass", "fail", "critical"] + assert "'pass', 'fail', or 'critical'" in score_prop["description"] assert schema["properties"]["overall_rating"] is not None # Now check that we can allow float scores with the proper float structure - schema_str = BaseEval.build_score_schema(task, allow_float_scores=True) + schema_str = BaseEval.build_score_schema(eval, allow_float_scores=True) schema = json.loads(schema_str) - req_prop = schema["properties"]["critical_test"] - assert req_prop["type"] == "number" - assert req_prop["minimum"] == -1 - assert req_prop["maximum"] == 1 - assert "between -1 and 1, with 1 being a pass" in req_prop["description"] + score_prop = schema["properties"]["critical_test"] + assert score_prop["type"] == "number" + assert score_prop["minimum"] == -1 + assert score_prop["maximum"] == 1 + assert "between -1 and 1, with 1 being a pass" in score_prop["description"] -def test_score_schema_multiple_requirements(): - task = Task( - name="Test Task", - instruction="Test instruction", - requirements=[ - TaskRequirement( +def test_score_schema_multiple_scores(): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( name="Quality", instruction="Rate quality", type=TaskOutputRatingType.five_star, ), - TaskRequirement( + EvalOutputScore( name="Pass Check", instruction="Basic pass check", type=TaskOutputRatingType.pass_fail, ), - TaskRequirement( + EvalOutputScore( name="Security", instruction="Check security", type=TaskOutputRatingType.pass_fail_critical, ), + EvalOutputScore( + name="Overall Rating", + instruction="The overall rating for the task output", + type=TaskOutputRatingType.five_star, + ), ], ) - schema_str = BaseEval.build_score_schema(task) + schema_str = BaseEval.build_score_schema(eval) schema = json.loads(schema_str) # Verify order is maintained @@ -198,45 +228,16 @@ def test_score_schema_multiple_requirements(): ] -def test_score_schema_custom_type_skipped(): - task = Task( - name="Test Task", - instruction="Test instruction", - requirements=[ - TaskRequirement( - name="Custom Rating", - instruction="Custom rating", - type=TaskOutputRatingType.custom, - ), - 
TaskRequirement( - name="Quality", - instruction="Rate quality", - type=TaskOutputRatingType.five_star, - ), - ], - ) - - schema_str = BaseEval.build_score_schema(task) - schema = json.loads(schema_str) - - # Custom type should be skipped - assert len(schema["properties"]) == 2 # one requirement + overall_rating - - # Verify only non-custom requirement and overall_rating are present - props = list(schema["properties"].keys()) - assert "quality" in props - assert "overall_rating" in props - - -def test_score_schema_no_requirements(): - task = Task(name="Test Task", instruction="Test instruction", requirements=[]) - - schema_str = BaseEval.build_score_schema(task) - schema = json.loads(schema_str) - - # Should only have overall_rating - assert len(schema["properties"]) == 1 - assert "overall_rating" in schema["properties"] +def test_score_schema_no_scores(): + # This should raise an error since at least one score is required + with pytest.raises(ValueError, match="output_scores are required"): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[], + ) + BaseEval.build_score_schema(eval) class TestEval(BaseEval): @@ -257,7 +258,7 @@ async def test_run_method(): name="Quality", instruction="Rate quality", type=TaskOutputRatingType.five_star, - ) + ), ], ) @@ -276,6 +277,18 @@ async def test_run_method(): parent=task, eval_set_filter_id="all", eval_configs_filter_id="all", + output_scores=[ + EvalOutputScore( + name="Quality", + instruction="Rate quality", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore( + name="Overall Rating", + instruction="The overall rating for the task output", + type=TaskOutputRatingType.five_star, + ), + ], ), prompt=BasePrompt( name="Test Prompt", diff --git a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py index 1c9d621a..8aa47ec2 100644 --- a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py +++ b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py @@ -9,9 +9,10 @@ DataSourceType, Task, TaskOutput, + TaskOutputRatingType, TaskRun, ) -from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalRun +from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore, EvalRun from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig @@ -35,6 +36,13 @@ def mock_eval(mock_task): description="test", eval_set_filter_id="all", eval_configs_filter_id="all", + output_scores=[ + EvalOutputScore( + name="Accuracy", + instruction="Check if the output is accurate", + type=TaskOutputRatingType.pass_fail, + ), + ], parent=mock_task, ) eval.save_to_file() @@ -190,7 +198,7 @@ def test_collect_tasks_excludes_already_run(mock_eval_runner, mock_task, data_so task_run_config_id=mock_eval_runner.run_configs[0].id, input="test", output="test", - scores={"score": 1.0}, + scores={"accuracy": 1.0}, ).save_to_file() # Set filter to match the task diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval.py b/libs/core/kiln_ai/adapters/eval/test_g_eval.py index 9806479e..e24fcb8b 100644 --- a/libs/core/kiln_ai/adapters/eval/test_g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval.py @@ -16,7 +16,7 @@ TaskRequirement, TaskRun, ) -from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType +from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType, EvalOutputScore from kiln_ai.datamodel.task import RunConfig @@ -53,6 +53,20 @@ def test_eval_config(test_task): parent=test_task, eval_set_filter_id="tag::tag1", 
eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="appropriateness", + type=TaskOutputRatingType.pass_fail, + ), + EvalOutputScore( + name="topic_alignment", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore( + name="overall_rating", + type=TaskOutputRatingType.five_star, + ), + ], ) eval.save_to_file() @@ -163,23 +177,23 @@ async def test_run_g_eval_e2e( g_eval = GEval(test_eval_config, test_run_config) # Run the evaluation - eval_result = await g_eval.run("chickens") + task_run, scores = await g_eval.run("chickens") # Verify the evaluation results - assert isinstance(eval_result, dict) + assert isinstance(scores, dict) - assert "topic_alignment" in eval_result - topic_alignment = eval_result["topic_alignment"] + assert "topic_alignment" in scores + topic_alignment = scores["topic_alignment"] assert isinstance(topic_alignment, float) assert 1 <= topic_alignment <= 5 - assert "appropriateness" in eval_result - appropriateness = eval_result["appropriateness"] + assert "appropriateness" in scores + appropriateness = scores["appropriateness"] assert isinstance(appropriateness, float) assert appropriateness >= 0.0 and appropriateness <= 1.0 - assert "overall_rating" in eval_result - overall = eval_result["overall_rating"] + assert "overall_rating" in scores + overall = scores["overall_rating"] assert isinstance(overall, float) assert 1.0 <= overall <= 5.0 diff --git a/libs/core/kiln_ai/adapters/test_prompt_builders.py b/libs/core/kiln_ai/adapters/test_prompt_builders.py index 231c7330..bad1d1e4 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_builders.py +++ b/libs/core/kiln_ai/adapters/test_prompt_builders.py @@ -36,7 +36,7 @@ TaskOutputRating, TaskRun, ) -from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType +from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType, EvalOutputScore def test_simple_prompt_builder(tmp_path): @@ -610,6 +610,12 @@ def test_eval_prompt_builder(tmp_path, valid_eval_config_datasource): parent=task, eval_set_filter_id="tag::tag1", eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="accuracy", + type="five_star", + ), + ], ) eval.save_to_file() @@ -673,6 +679,12 @@ def test_eval_prompt_builder_validation_errors(tmp_path): parent=task, eval_set_filter_id="tag::tag1", eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="accuracy", + type="five_star", + ), + ], ) eval.save_to_file() diff --git a/libs/core/kiln_ai/datamodel/eval.py b/libs/core/kiln_ai/datamodel/eval.py index 89edd610..12c153f5 100644 --- a/libs/core/kiln_ai/datamodel/eval.py +++ b/libs/core/kiln_ai/datamodel/eval.py @@ -1,8 +1,8 @@ import json from enum import Enum -from typing import TYPE_CHECKING, Any, Dict, Union +from typing import TYPE_CHECKING, Any, Dict, List, Union -from pydantic import Field, model_validator +from pydantic import BaseModel, Field, model_validator from typing_extensions import Self from kiln_ai.datamodel.basemodel import ( @@ -11,9 +11,12 @@ KilnParentedModel, KilnParentModel, ) +from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType from kiln_ai.datamodel.dataset_filters import DatasetFilterId +from kiln_ai.datamodel.json_schema import string_to_json_key from kiln_ai.datamodel.prompt import BasePrompt from kiln_ai.datamodel.task_output import DataSource, DataSourceType +from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error if TYPE_CHECKING: from kiln_ai.datamodel.task import Task @@ -31,6 +34,36 @@ class EvalConfigType(str, 
Enum): llm_as_judge = "llm_as_judge" +class EvalOutputScore(BaseModel): + """ + A definition of a score that an evaluator will produce. + + Very similar to TaskRequirement, but conceptually different so separate models. + """ + + name: str = Field( + description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance." + ) + instruction: str | None = Field( + default=None, + description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.", + ) + type: TaskOutputRatingType = Field( + description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical')." + ) + + def json_key(self) -> str: + return string_to_json_key(self.name) + + @model_validator(mode="after") + def validate_type(self) -> Self: + if self.type == TaskOutputRatingType.custom: + raise ValueError( + f"Custom scores are not supported in evaluators. '{self.json_key}' was set to a custom score." + ) + return self + + class EvalRun(KilnParentedModel): """ The results of running an eval on a single dataset item, with a specific TaskRunConfig and EvalConfig. @@ -53,11 +86,71 @@ class EvalRun(KilnParentedModel): description="The scores of the evaluator (specifically the EvalConfig this object is a child of)." ) - def parent_eval_config(self) -> "EvalConfig": - if self.parent is None or self.parent.__class__.__name__ != "EvalConfig": + def parent_eval_config(self) -> Union["EvalConfig", None]: + if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig": raise ValueError("parent must be an EvalConfig") return self.parent # type: ignore + @model_validator(mode="after") + def validate_scores(self) -> Self: + # We're checking the scores have the expected keys from the grand-parent eval + if self.scores is None or len(self.scores) == 0: + raise ValueError("scores are required, and must have at least one score.") + + parent_eval_config = self.parent_eval_config() + eval = parent_eval_config.parent_eval() if parent_eval_config else None + if not eval: + # Can't validate without the grand-parent eval, allow it to be validated later + return self + + output_score_keys = [score.json_key() for score in eval.output_scores] + if set(output_score_keys) != set(self.scores.keys()): + raise ValueError( + f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]" + ) + + # Check that each score is expected in this eval and the correct type + for output_score in eval.output_scores: + match output_score.type: + case TaskOutputRatingType.five_star: + five_star_score = self.scores[output_score.json_key()] + if ( + not isinstance(five_star_score, float) + or five_star_score < 1.0 + or five_star_score > 5.0 + ): + raise ValueError( + f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}" + ) + case TaskOutputRatingType.pass_fail: + pass_fail_score = self.scores[output_score.json_key()] + if ( + not isinstance(pass_fail_score, float) + or pass_fail_score < 0.0 + or pass_fail_score > 1.0 + ): + raise ValueError( + f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. 
Got: {pass_fail_score}" + ) + case TaskOutputRatingType.pass_fail_critical: + pass_fail_critical_score = self.scores[output_score.json_key()] + if ( + not isinstance(pass_fail_critical_score, float) + or pass_fail_critical_score < -1.0 + or pass_fail_critical_score > 1.0 + ): + raise ValueError( + f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}" + ) + case TaskOutputRatingType.custom: + raise ValueError( + f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score." + ) + case _: + # Catch missing cases + raise_exhaustive_enum_error(output_score.type) + return self + class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}): """ @@ -76,10 +169,12 @@ class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun} default={}, description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.", ) - prompt: BasePrompt = Field(description="The prompt to use for this eval config.") + prompt: BasePrompt = Field( + description="The prompt to use for this eval config. Both when running the task to generate outputs to evaluate and when explaining to the eval model what the goal of the task was. This is a frozen prompt, so this eval config is consistent over time (for example, if the user selects multi-shot prompting, this saves that dynamic prompt at the point the eval config is created). Freezing the prompt ensures consistent evals." + ) - def parent_eval(self) -> "Eval": - if self.parent is None or self.parent.__class__.__name__ != "Eval": + def parent_eval(self) -> Union["Eval", None]: + if self.parent is not None and self.parent.__class__.__name__ != "Eval": raise ValueError("parent must be an Eval") return self.parent # type: ignore @@ -135,12 +230,30 @@ class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig} eval_configs_filter_id: DatasetFilterId = Field( description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id." ) + output_scores: List[EvalOutputScore] = Field( + description="The scores this evaluator should produce." + ) # Workaround to return typed parent without importing Task def parent_task(self) -> Union["Task", None]: - if self.parent is None or self.parent.__class__.__name__ != "Task": - return None + if self.parent is not None and self.parent.__class__.__name__ != "Task": + raise ValueError("parent must be a Task") return self.parent # type: ignore def configs(self, readonly: bool = False) -> list[EvalConfig]: return super().configs(readonly=readonly) # type: ignore + + @model_validator(mode="after") + def validate_scores(self) -> Self: + if self.output_scores is None or len(self.output_scores) == 0: + raise ValueError( + "output_scores are required, and must have at least one score." + ) + + # check for duplicate names (once transformed to JSON keys) + output_score_keys = [score.json_key() for score in self.output_scores] + if len(output_score_keys) != len(set(output_score_keys)): + raise ValueError( + f"output_scores must have unique names (once transformed to JSON keys). 
Got: [{', '.join(output_score_keys)}]" + ) + return self diff --git a/libs/core/kiln_ai/datamodel/test_eval_model.py b/libs/core/kiln_ai/datamodel/test_eval_model.py index 30ba6845..0aacdf16 100644 --- a/libs/core/kiln_ai/datamodel/test_eval_model.py +++ b/libs/core/kiln_ai/datamodel/test_eval_model.py @@ -7,11 +7,16 @@ Eval, EvalConfig, EvalConfigType, + EvalOutputScore, EvalRun, EvalState, ) from kiln_ai.datamodel.task import Task -from kiln_ai.datamodel.task_output import DataSource, DataSourceType +from kiln_ai.datamodel.task_output import ( + DataSource, + DataSourceType, + TaskOutputRatingType, +) @pytest.fixture @@ -116,12 +121,20 @@ def test_eval_basic_properties(): current_config_id="config123", eval_set_filter_id="tag::tag1", eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="accuracy", + type=TaskOutputRatingType.five_star, + ) + ], ) assert eval.name == "Test Eval" assert eval.description == "Test Description" assert eval.state == EvalState.enabled assert eval.current_config_id == "config123" + assert eval.output_scores[0].name == "accuracy" + assert eval.output_scores[0].type == TaskOutputRatingType.five_star def test_eval_default_values(): @@ -129,6 +142,12 @@ def test_eval_default_values(): name="Test Eval", eval_set_filter_id="tag::tag1", eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="quality", + type=TaskOutputRatingType.pass_fail, + ) + ], ) assert eval.description is None @@ -142,6 +161,12 @@ def test_eval_parent_task_relationship(mock_task, valid_eval_config_data): parent=mock_task, eval_set_filter_id="tag::tag1", eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="score", + type=TaskOutputRatingType.pass_fail, + ) + ], ) config = EvalConfig(parent=eval, **valid_eval_config_data) @@ -156,6 +181,12 @@ def test_eval_parent_task_none(): name="Test Eval", eval_set_filter_id="tag::tag1", eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="score", + type=TaskOutputRatingType.pass_fail, + ) + ], ) assert eval.parent_task() is None @@ -179,6 +210,12 @@ def test_eval_with_persisted_children(mock_task, valid_eval_config_data, tmp_pat parent=mock_task, eval_set_filter_id="tag::tag1", eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="accuracy", + type=TaskOutputRatingType.pass_fail, + ) + ], ) eval.save_to_file() @@ -192,7 +229,7 @@ def test_eval_with_persisted_children(mock_task, valid_eval_config_data, tmp_pat task_run_config_id="config456", input='{"key": "value"}', output='{"result": "success"}', - scores={"accuracy": 0.95, "f1": 0.88}, + scores={"accuracy": 0.95}, ) run.save_to_file() @@ -215,7 +252,7 @@ def test_eval_with_persisted_children(mock_task, valid_eval_config_data, tmp_pat assert runs[0].task_run_config_id == "config456" assert runs[0].input == '{"key": "value"}' assert runs[0].output == '{"result": "success"}' - assert runs[0].scores == {"accuracy": 0.95, "f1": 0.88} + assert runs[0].scores == {"accuracy": 0.95} # and back up assert runs[0].parent_eval_config().parent_eval().parent_task().path == task_path @@ -228,14 +265,14 @@ def test_eval_run_valid_creation(): task_run_config_id="config456", input='{"key": "value"}', # JSON formatted input output='{"result": "success"}', # JSON formatted output - scores={"accuracy": 0.95, "f1": 0.88}, + scores={"accuracy": 0.95}, ) assert eval_run.dataset_id == "dataset123" assert eval_run.task_run_config_id == "config456" assert eval_run.input == '{"key": "value"}' assert 
eval_run.output == '{"result": "success"}' - assert eval_run.scores == {"accuracy": 0.95, "f1": 0.88} + assert eval_run.scores == {"accuracy": 0.95} def test_eval_run_plaintext(): @@ -276,3 +313,301 @@ def test_eval_run_invalid_scores(): output="test", scores={"score": "not a float"}, # invalid score type ) + + +def test_eval_missing_output_scores(): + """Test that eval creation fails when output_scores is missing""" + with pytest.raises(ValidationError) as exc_info: + Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + ) + assert "output_scores" in str(exc_info.value) + + +def test_eval_empty_output_scores(): + """Test that eval creation fails when output_scores is empty""" + with pytest.raises( + ValueError, match="output_scores are required, and must have at least one score" + ): + Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[], + ) + + +def test_eval_duplicate_output_scores(): + """Test that eval creation fails when output_scores has duplicate names""" + with pytest.raises( + ValueError, + match="must have unique names", + ): + Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="score", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore(name="SCORE", type=TaskOutputRatingType.pass_fail), + ], + ) + + +def test_eval_invalid_score_type(): + """Test that eval creation fails with invalid rating type in output_scores""" + with pytest.raises( + ValueError, + match="Input should be 'five_star', 'pass_fail', 'pass_fail_critical'", + ): + Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="score", + type="invalid_type", + ) + ], + ) + + +def test_eval_valid_output_scores(): + """Test that eval creation succeeds with valid output_scores""" + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="accuracy", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore( + name="critical_check", + type=TaskOutputRatingType.pass_fail_critical, + ), + EvalOutputScore(name="basic_check", type=TaskOutputRatingType.pass_fail), + ], + ) + assert len(eval.output_scores) == 3 + assert eval.output_scores[0].type == TaskOutputRatingType.five_star + assert eval.output_scores[0].name == "accuracy" + assert eval.output_scores[1].type == TaskOutputRatingType.pass_fail_critical + assert eval.output_scores[1].name == "critical_check" + assert eval.output_scores[2].type == TaskOutputRatingType.pass_fail + assert eval.output_scores[2].name == "basic_check" + + +@pytest.fixture +def valid_eval_run_data(): + return { + "dataset_id": "dataset123", + "task_run_config_id": "config456", + "input": "test input", + "output": "test output", + "scores": {"accuracy": 4.5}, + } + + +def test_eval_run_five_star_score_validation(valid_eval_config, valid_eval_run_data): + # Setup eval with five_star rating + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="accuracy", + type=TaskOutputRatingType.five_star, + ) + ], + ) + valid_eval_config.parent = eval + + # Valid score + run = EvalRun(parent=valid_eval_config, **valid_eval_run_data) + assert run.scores["accuracy"] == 4.5 + + # Invalid scores + with pytest.raises(ValueError, 
match="must be a float between 1.0 and 5.0"): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"accuracy": 0.5}}, + ) + + with pytest.raises(ValueError, match="must be a float between 1.0 and 5.0"): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"accuracy": 5.5}}, + ) + + +def test_eval_run_pass_fail_score_validation(valid_eval_config, valid_eval_run_data): + # Setup eval with pass_fail rating + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="check", + type=TaskOutputRatingType.pass_fail, + ) + ], + ) + valid_eval_config.parent = eval + + # Valid scores + run = EvalRun( + parent=valid_eval_config, **{**valid_eval_run_data, "scores": {"check": 1.0}} + ) + assert run.scores["check"] == 1.0 + + run = EvalRun( + parent=valid_eval_config, **{**valid_eval_run_data, "scores": {"check": 0.0}} + ) + assert run.scores["check"] == 0.0 + + # Invalid scores + with pytest.raises(ValueError, match="must be a float between 0.0 and 1.0"): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"check": -0.1}}, + ) + + with pytest.raises(ValueError, match="must be a float between 0.0 and 1.0"): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"check": 1.1}}, + ) + + +def test_eval_run_pass_fail_critical_score_validation( + valid_eval_config, valid_eval_run_data +): + # Setup eval with pass_fail_critical rating + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="critical", + type=TaskOutputRatingType.pass_fail_critical, + ) + ], + ) + valid_eval_config.parent = eval + + # Valid scores + run = EvalRun( + parent=valid_eval_config, **{**valid_eval_run_data, "scores": {"critical": 1.0}} + ) + assert run.scores["critical"] == 1.0 + + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"critical": -1.0}}, + ) + assert run.scores["critical"] == -1.0 + + # Invalid scores + with pytest.raises(ValueError, match="must be a float between -1.0 and 1.0"): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"critical": -1.1}}, + ) + + with pytest.raises(ValueError, match="must be a float between -1.0 and 1.0"): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"critical": 1.1}}, + ) + + +def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="accuracy", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore( + name="critical", + type=TaskOutputRatingType.pass_fail_critical, + ), + ], + ) + valid_eval_config.parent = eval + + # Correct + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "critical": 1.0}}, + ) + + # Correct but wrong order still okay + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"critical": 1.0, "accuracy": 4.5}}, + ) + + # Missing score + with pytest.raises( + ValueError, + match="The scores produced by the evaluator must match the scores expected by the eval", + ): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"accuracy": 4.5}}, + ) + + # Extra score + with pytest.raises( + 
ValueError, + match="The scores produced by the evaluator must match the scores expected by the eval", + ): + run = EvalRun( + parent=valid_eval_config, + **{ + **valid_eval_run_data, + "scores": {"accuracy": 4.5, "critical": 1.0, "extra": 1.0}, + }, + ) + + # Missing score w matching count + with pytest.raises( + ValueError, + match="The scores produced by the evaluator must match the scores expected by the eval", + ): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "wrong": 1.0}}, + ) + + +def test_eval_run_custom_scores_not_allowed(valid_eval_config, valid_eval_run_data): + with pytest.raises( + ValueError, match="Custom scores are not supported in evaluators" + ): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="custom", + type=TaskOutputRatingType.custom, + ) + ], + ) From d8da2ca534cddd4efa3b116fb3bc95c573a7dabe Mon Sep 17 00:00:00 2001 From: scosman Date: Thu, 20 Feb 2025 13:08:21 -0500 Subject: [PATCH 031/102] Checkpoint of UI. CI won't pass, but saving considerable progress. --- app/desktop/desktop_server.py | 2 + app/web_ui/src/lib/api_schema.d.ts | 69 +++++++ app/web_ui/src/lib/stores.ts | 37 ++-- app/web_ui/src/lib/types.ts | 1 + app/web_ui/src/lib/ui/warning.svelte | 8 +- app/web_ui/src/lib/utils/form_list.svelte | 5 +- app/web_ui/src/routes/(app)/+layout.svelte | 52 +++++ .../evals/[project_id]/[task_id]/+page.svelte | 112 +++++++++++ .../evals/[project_id]/[task_id]/+page.ts | 1 + .../[task_id]/create_evaluator/+page.svelte | 180 ++++++++++++++++++ .../[task_id]/create_evaluator/+page.ts | 1 + .../create_evaluator/eval_template.ts | 8 + .../select_eval_template.svelte | 140 ++++++++++++++ .../[project_id]/[task_id]/empty_eval.svelte | 74 +++++++ .../[project_id]/[task_id]/+page.svelte | 2 +- .../[task_id]/empty_finetune.svelte | 7 +- 16 files changed, 676 insertions(+), 23 deletions(-) create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.ts create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.ts create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/eval_template.ts create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/select_eval_template.svelte create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/empty_eval.svelte diff --git a/app/desktop/desktop_server.py b/app/desktop/desktop_server.py index b9d617d7..c05cfcc2 100644 --- a/app/desktop/desktop_server.py +++ b/app/desktop/desktop_server.py @@ -9,6 +9,7 @@ from fastapi import FastAPI from app.desktop.studio_server.data_gen_api import connect_data_gen_api +from app.desktop.studio_server.evals_api import connect_evals_api from app.desktop.studio_server.finetune_api import connect_fine_tune_api from app.desktop.studio_server.prompt_api import connect_prompt_api from app.desktop.studio_server.provider_api import connect_provider_api @@ -35,6 +36,7 @@ def make_app(): connect_settings(app) connect_data_gen_api(app) connect_fine_tune_api(app) + connect_evals_api(app) # Important: webhost must be last, it handles all other URLs connect_webhost(app) diff --git a/app/web_ui/src/lib/api_schema.d.ts 
b/app/web_ui/src/lib/api_schema.d.ts index cd6ce7eb..a757f49e 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -657,6 +657,23 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/sdf": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Generate Evaluator */ + post: operations["generate_evaluator_api_projects__project_id__tasks__task_id__sdf_post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; } export type webhooks = Record; export interface components { @@ -937,6 +954,26 @@ export interface components { * @enum {string} */ DatasetSplitType: "train_test" | "train_test_val" | "train_test_val_80" | "all"; + /** + * EvalOutputScore + * @description A definition of a score that an evaluator will produce. + * + * Very similar to TaskRequirement, but conceptually different so separate models. + */ + EvalOutputScore: { + /** + * Name + * @description The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance. + */ + name: string; + /** + * Instruction + * @description A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user. + */ + instruction?: string | null; + /** @description The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical'). */ + type: components["schemas"]["TaskOutputRatingType"]; + }; /** * FineTuneParameter * @description A parameter for a fine-tune. Hyperparameters, etc. 
@@ -3265,4 +3302,36 @@ export interface operations { }; }; }; + generate_evaluator_api_projects__project_id__tasks__task_id__sdf_post: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["EvalOutputScore"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; } diff --git a/app/web_ui/src/lib/stores.ts b/app/web_ui/src/lib/stores.ts index dbd4d8f6..74e946d1 100644 --- a/app/web_ui/src/lib/stores.ts +++ b/app/web_ui/src/lib/stores.ts @@ -119,6 +119,27 @@ function localStorageStore(key: string, initialValue: T) { return store } +export async function load_task( + project_id: string, + task_id: string, +): Promise { + const { + data, // only present if 2XX response + error, // only present if 4XX or 5XX response + } = await client.GET("/api/projects/{project_id}/tasks/{task_id}", { + params: { + path: { + project_id: project_id, + task_id: task_id, + }, + }, + }) + if (error) { + throw error + } + return data +} + export async function load_current_task(project: Project | null) { let task: Task | null = null try { @@ -126,21 +147,7 @@ export async function load_current_task(project: Project | null) { if (!project || !project?.id || !task_id) { return } - const { - data, // only present if 2XX response - error, // only present if 4XX or 5XX response - } = await client.GET("/api/projects/{project_id}/tasks/{task_id}", { - params: { - path: { - project_id: project.id, - task_id: task_id, - }, - }, - }) - if (error) { - throw error - } - task = data + task = await load_task(project.id, task_id) // Load the current task's prompts after 50ms, as it's not the most critical data setTimeout(() => { diff --git a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts index e5f98175..c29ef5a3 100644 --- a/app/web_ui/src/lib/types.ts +++ b/app/web_ui/src/lib/types.ts @@ -19,3 +19,4 @@ export type OllamaConnection = components["schemas"]["OllamaConnection"] export type RunSummary = components["schemas"]["RunSummary"] export type PromptResponse = components["schemas"]["PromptResponse"] export type FinetuneDataStrategy = components["schemas"]["FinetuneDataStrategy"] +export type EvalOutputScore = components["schemas"]["EvalOutputScore"] diff --git a/app/web_ui/src/lib/ui/warning.svelte b/app/web_ui/src/lib/ui/warning.svelte index a4861728..32635241 100644 --- a/app/web_ui/src/lib/ui/warning.svelte +++ b/app/web_ui/src/lib/ui/warning.svelte @@ -1,11 +1,15 @@ {#if warning_message}
-
+
{warning_message}
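
The EvalOutputScore schema added to api_schema.d.ts above is the generated counterpart of the Pydantic model in libs/core/kiln_ai/datamodel/eval.py (the docstrings match). As a quick illustration of the JSON shape the web UI receives for a score definition (a sketch assuming pydantic v2's model_dump_json, not part of this patch):

from kiln_ai.datamodel import TaskOutputRatingType
from kiln_ai.datamodel.eval import EvalOutputScore

score = EvalOutputScore(
    name="Overall Rating",
    instruction="The overall rating for the task output",
    type=TaskOutputRatingType.five_star,
)

# The rating enum serializes to its string value, matching the generated TypeScript union, e.g.:
# {"name": "Overall Rating", "instruction": "The overall rating for the task output", "type": "five_star"}
print(score.model_dump_json())
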
diff --git a/app/web_ui/src/lib/utils/form_list.svelte b/app/web_ui/src/lib/utils/form_list.svelte index 34783165..79a55b21 100644 --- a/app/web_ui/src/lib/utils/form_list.svelte +++ b/app/web_ui/src/lib/utils/form_list.svelte @@ -3,6 +3,7 @@ export let content_label: string = "Item" export let start_with_one: boolean = true export let empty_content: unknown = {} + export let frozen: boolean = false // Unique ID for the list, for scrolling to top after removal let id = "form_list_" + Math.random().toString(36).substring(2, 15) @@ -79,7 +80,7 @@ {content_label} #{item_index + 1}
+ {/each} + diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/empty_eval.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/empty_eval.svelte new file mode 100644 index 00000000..b69a41d2 --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/empty_eval.svelte @@ -0,0 +1,74 @@ + + + diff --git a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/+page.svelte b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/+page.svelte index b5c5adcb..3a720aa3 100644 --- a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/+page.svelte @@ -12,7 +12,7 @@ $: project_id = $page.params.project_id $: task_id = $page.params.task_id - $: is_empty = !!finetunes && finetunes.length == 0 + $: is_empty = !finetunes || finetunes.length == 0 let finetunes: Finetune[] | null = null let finetunes_error: KilnError | null = null diff --git a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/empty_finetune.svelte b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/empty_finetune.svelte index 9ef51548..9779c733 100644 --- a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/empty_finetune.svelte +++ b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/empty_finetune.svelte @@ -76,16 +76,17 @@
- Fine tuning learns from your dataset to create custom models. + Fine-Tuning Learns from Your Dataset to Create Custom Models
- Fine tunes can be faster, cheaper and more accurate than standard models. + Fine-tuned models can be faster, cheaper and more accurate than standard + models.
- Create Fine-Tune + Create a Fine-Tune Date: Thu, 20 Feb 2025 22:01:57 -0500 Subject: [PATCH 032/102] Huge update: - Create evals and eval config UI. Nice and clean. But the main evals page is still a mess. - Tempaltes for things like bias, maliciousness, jailbreaking, etc - Templates for Kiln tasks, using requirements for a robust - Eval API with tests - GEval prompt selection --- app/desktop/studio_server/evals_api.py | 114 ++++++ app/desktop/studio_server/provider_api.py | 7 + app/desktop/studio_server/test_eval_api.py | 198 ++++++++++ .../studio_server/test_provider_api.py | 5 + app/web_ui/src/lib/api_schema.d.ts | 294 ++++++++++++++- app/web_ui/src/lib/types.ts | 3 + app/web_ui/src/lib/utils/form_element.svelte | 3 +- .../[eval_id]/create_eval_config/+page.svelte | 340 ++++++++++++++++++ .../[eval_id]/create_eval_config/+page.ts | 1 + .../[task_id]/create_evaluator/+page.svelte | 185 +++++++++- .../create_evaluator/eval_template.ts | 5 +- .../select_eval_template.svelte | 123 +++++-- .../run/available_models_dropdown.svelte | 22 +- libs/core/kiln_ai/adapters/ml_model_list.py | 5 + libs/core/kiln_ai/datamodel/eval.py | 18 +- .../core/kiln_ai/datamodel/test_eval_model.py | 3 - 16 files changed, 1275 insertions(+), 51 deletions(-) create mode 100644 app/desktop/studio_server/evals_api.py create mode 100644 app/desktop/studio_server/test_eval_api.py create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.ts diff --git a/app/desktop/studio_server/evals_api.py b/app/desktop/studio_server/evals_api.py new file mode 100644 index 00000000..114e5343 --- /dev/null +++ b/app/desktop/studio_server/evals_api.py @@ -0,0 +1,114 @@ +from typing import Any + +from fastapi import FastAPI, HTTPException +from kiln_ai.adapters.ml_model_list import ModelProviderName +from kiln_ai.adapters.prompt_builders import prompt_builder_from_id +from kiln_ai.datamodel import ( + BasePrompt, + DataSource, + DataSourceType, + PromptId, +) +from kiln_ai.datamodel.dataset_filters import DatasetFilterId +from kiln_ai.datamodel.eval import ( + Eval, + EvalConfig, + EvalConfigType, + EvalOutputScore, + EvalTemplate, +) +from kiln_server.task_api import task_from_id +from pydantic import BaseModel + + +def eval_from_id(project_id: str, task_id: str, eval_id: str) -> Eval: + task = task_from_id(project_id, task_id) + for eval in task.evals(): + if eval.id == eval_id: + return eval + + raise HTTPException( + status_code=404, + detail=f"Task not found. 
ID: {task_id}", + ) + + +class CreateEvaluatorRequest(BaseModel): + name: str + description: str + template: EvalTemplate | None + output_scores: list[EvalOutputScore] + eval_set_filter_id: DatasetFilterId + eval_configs_filter_id: DatasetFilterId + + +class CreateEvalConfigRequest(BaseModel): + type: EvalConfigType + properties: dict[str, Any] + model_name: str + provider: ModelProviderName + prompt_id: PromptId + + +def connect_evals_api(app: FastAPI): + @app.post("/api/projects/{project_id}/tasks/{task_id}/create_evaluator") + async def create_evaluator( + project_id: str, + task_id: str, + request: CreateEvaluatorRequest, + ) -> Eval: + task = task_from_id(project_id, task_id) + eval = Eval( + name=request.name, + description=request.description, + template=request.template, + output_scores=request.output_scores, + eval_set_filter_id=request.eval_set_filter_id, + eval_configs_filter_id=request.eval_configs_filter_id, + parent=task, + ) + eval.save_to_file() + return eval + + @app.get("/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}") + async def get_eval(project_id: str, task_id: str, eval_id: str) -> Eval: + return eval_from_id(project_id, task_id, eval_id) + + @app.post( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/create_eval_config" + ) + async def create_eval_config( + project_id: str, + task_id: str, + eval_id: str, + request: CreateEvalConfigRequest, + ) -> EvalConfig: + task = task_from_id(project_id, task_id) + eval = eval_from_id(project_id, task_id, eval_id) + + # Create a prompt instance to save to the eval config + prompt_builder = prompt_builder_from_id(request.prompt_id, task) + prompt = BasePrompt( + name=request.prompt_id, + generator_id=request.prompt_id, + prompt=prompt_builder.build_base_prompt(), + chain_of_thought_instructions=prompt_builder.chain_of_thought_prompt(), + ) + + eval_config = EvalConfig( + config_type=request.type, + properties=request.properties, + model=DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": request.model_name, + "model_provider": request.provider, + # TODO remove this + "adapter_name": "eval", + }, + ), + prompt=prompt, + parent=eval, + ) + eval_config.save_to_file() + return eval_config diff --git a/app/desktop/studio_server/provider_api.py b/app/desktop/studio_server/provider_api.py index 610d77f5..6c6d395c 100644 --- a/app/desktop/studio_server/provider_api.py +++ b/app/desktop/studio_server/provider_api.py @@ -75,6 +75,7 @@ class ModelDetails(BaseModel): name: str supports_structured_output: bool supports_data_gen: bool + supports_logprobs: bool # True if this is a untested model (typically user added). We don't know if these support structured output, data gen, etc. They should appear in their own section in the UI. 
untested_model: bool = Field(default=False) task_filter: List[str] | None = Field(default=None) @@ -139,6 +140,7 @@ async def get_available_models() -> List[AvailableModels]: name=model.friendly_name, supports_structured_output=provider.supports_structured_output, supports_data_gen=provider.supports_data_gen, + supports_logprobs=provider.supports_logprobs, ) ) @@ -534,6 +536,7 @@ async def available_ollama_models() -> AvailableModels | None: name=model.friendly_name, supports_structured_output=ollama_provider.supports_structured_output, supports_data_gen=ollama_provider.supports_data_gen, + supports_logprobs=False, # Ollama doesn't support logprobs https://github.com/ollama/ollama/issues/2415 ) ) for ollama_model in ollama_connection.untested_models: @@ -543,6 +546,7 @@ async def available_ollama_models() -> AvailableModels | None: name=ollama_model, supports_structured_output=False, supports_data_gen=False, + supports_logprobs=False, untested_model=True, ) ) @@ -595,6 +599,7 @@ def custom_models() -> AvailableModels | None: name=f"{provider_name_from_id(provider_id)}: {model_name}", supports_structured_output=False, supports_data_gen=False, + supports_logprobs=False, untested_model=True, ) ) @@ -626,6 +631,7 @@ def all_fine_tuned_models() -> AvailableModels | None: # YMMV, but we'll assume all fine tuned models support structured output and data gen supports_structured_output=True, supports_data_gen=True, + supports_logprobs=False, task_filter=[str(task.id)], ) ) @@ -725,6 +731,7 @@ def openai_compatible_providers_load_cache() -> OpenAICompatibleProviderCache | name=model.id, supports_structured_output=False, supports_data_gen=False, + supports_logprobs=False, untested_model=True, ) ) diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py new file mode 100644 index 00000000..e4fe793d --- /dev/null +++ b/app/desktop/studio_server/test_eval_api.py @@ -0,0 +1,198 @@ +from unittest.mock import Mock, patch + +import pytest +from fastapi import FastAPI, HTTPException +from fastapi.testclient import TestClient +from kiln_ai.adapters.ml_model_list import ModelProviderName +from kiln_ai.datamodel import ( + BasePrompt, + DataSource, + DataSourceType, + PromptId, + Task, +) +from kiln_ai.datamodel.dataset_filters import DatasetFilterId +from kiln_ai.datamodel.eval import ( + Eval, + EvalConfig, + EvalConfigType, + EvalOutputScore, + EvalTemplate, +) + +from app.desktop.studio_server.evals_api import ( + CreateEvalConfigRequest, + CreateEvaluatorRequest, + connect_evals_api, +) + + +@pytest.fixture +def app(): + app = FastAPI() + connect_evals_api(app) + return app + + +@pytest.fixture +def client(app): + return TestClient(app) + + +@pytest.fixture +def mock_task(tmp_path): + task = Task( + id="task1", + name="Test Task", + description="Test Description", + instruction="Test Instructions", + path=tmp_path / "task.kiln", + ) + task.save_to_file() + return task + + +@pytest.fixture +def mock_eval(mock_task): + eval = Eval( + id="eval1", + name="Test Eval", + description="Test Description", + template=EvalTemplate.bias, + output_scores=[ + EvalOutputScore(name="score1", description="desc1", type="five_star"), + ], + eval_set_filter_id="tag::eval_set", + eval_configs_filter_id="tag::golden", + parent=mock_task, + ) + eval.save_to_file() + return eval + + +@pytest.fixture +def mock_task_from_id(mock_task): + with patch("app.desktop.studio_server.evals_api.task_from_id") as mock: + mock.return_value = mock_task + yield mock + + +def 
test_get_eval_success(client, mock_task, mock_task_from_id, mock_eval): + mock_task_from_id.return_value = mock_task + + response = client.get("/api/projects/project1/tasks/task1/eval/eval1") + + assert response.status_code == 200 + result = response.json() + assert result["id"] == "eval1" + assert result["name"] == "Test Eval" + mock_task_from_id.assert_called_once_with("project1", "task1") + + +def test_get_eval_not_found(client, mock_task, mock_task_from_id): + mock_task_from_id.return_value = mock_task + + response = client.get("/api/projects/project1/tasks/task1/eval/non_existent") + + assert response.status_code == 404 + assert response.json()["detail"] == "Task not found. ID: task1" + + +@pytest.fixture +def valid_evaluator_request(): + return CreateEvaluatorRequest( + name="Test Evaluator", + description="Test Description", + template=None, + output_scores=[ + EvalOutputScore(name="score1", description="desc1", type="five_star"), + ], + eval_set_filter_id="tag::eval_set", + eval_configs_filter_id="tag::golden", + ) + + +@pytest.fixture +def valid_eval_config_request(): + return CreateEvalConfigRequest( + type=EvalConfigType.g_eval, + properties={"eval_steps": ["step1", "step2"]}, + model_name="gpt-4", + provider=ModelProviderName.openai, + prompt_id="simple_chain_of_thought_prompt_builder", + ) + + +@pytest.mark.asyncio +async def test_create_evaluator( + client, mock_task_from_id, valid_evaluator_request, mock_task +): + mock_task_from_id.return_value = mock_task + + with patch.object(Eval, "save_to_file") as mock_save: + response = client.post( + "/api/projects/project1/tasks/task1/create_evaluator", + json=valid_evaluator_request.model_dump(), + ) + + assert response.status_code == 200 + result = response.json() + assert result["name"] == valid_evaluator_request.name + assert result["description"] == valid_evaluator_request.description + mock_save.assert_called_once() + + +@pytest.mark.asyncio +async def test_create_eval_config( + client, mock_task_from_id, valid_eval_config_request, mock_eval, mock_task +): + mock_task_from_id.return_value = mock_task + + with ( + patch("app.desktop.studio_server.evals_api.eval_from_id") as mock_eval_from_id, + patch( + "app.desktop.studio_server.evals_api.prompt_builder_from_id" + ) as mock_prompt_builder, + # patch.object(EvalConfig, "save_to_file") as mock_save, + ): + mock_eval_from_id.return_value = mock_eval + mock_prompt_builder.return_value.build_base_prompt.return_value = "base prompt" + mock_prompt_builder.return_value.chain_of_thought_prompt.return_value = ( + "cot prompt" + ) + + response = client.post( + "/api/projects/project1/tasks/task1/eval/eval1/create_eval_config", + json=valid_eval_config_request.model_dump(), + ) + + assert response.status_code == 200 + result = response.json() + assert result["config_type"] == valid_eval_config_request.type + assert result["properties"] == valid_eval_config_request.properties + assert result["model"]["type"] == DataSourceType.synthetic + assert ( + result["model"]["properties"]["model_name"] + == valid_eval_config_request.model_name + ) + assert ( + result["model"]["properties"]["model_provider"] + == valid_eval_config_request.provider + ) + assert isinstance(result["prompt"], dict) + # mock_save.assert_called_once() + + # Fetch disk + assert len(mock_eval.configs()) == 1 + config = mock_eval.configs()[0] + assert config.config_type == valid_eval_config_request.type + assert config.properties == valid_eval_config_request.properties + assert config.model.type == DataSourceType.synthetic + 
assert config.model.properties["model_name"] == valid_eval_config_request.model_name + assert ( + config.model.properties["model_provider"] == valid_eval_config_request.provider + ) + assert config.prompt.prompt == "base prompt" + assert config.prompt.chain_of_thought_instructions == "cot prompt" + assert config.properties["eval_steps"][0] == "step1" + assert config.properties["eval_steps"][1] == "step2" diff --git a/app/desktop/studio_server/test_provider_api.py b/app/desktop/studio_server/test_provider_api.py index 1e909778..f3e5dd9e 100644 --- a/app/desktop/studio_server/test_provider_api.py +++ b/app/desktop/studio_server/test_provider_api.py @@ -405,6 +405,7 @@ async def test_get_available_models(app, client): "name": "Model 2", "supports_structured_output": True, "supports_data_gen": True, + "supports_logprobs": False, "task_filter": None, "untested_model": False, } @@ -419,6 +420,7 @@ async def test_get_available_models(app, client): "name": "Model 1", "supports_structured_output": True, "supports_data_gen": True, + "supports_logprobs": False, "task_filter": None, "untested_model": False, } @@ -433,6 +435,7 @@ async def test_get_available_models(app, client): "name": "Model 2", "supports_structured_output": False, "supports_data_gen": False, + "supports_logprobs": False, "task_filter": None, "untested_model": False, } @@ -495,6 +498,7 @@ async def test_get_available_models_ollama_exception(app, client): "name": "Model 1", "supports_structured_output": True, "supports_data_gen": True, + "supports_logprobs": False, "task_filter": None, "untested_model": False, } @@ -1214,6 +1218,7 @@ def test_openai_compatible_providers(): name="model1", supports_structured_output=False, supports_data_gen=False, + supports_logprobs=False, untested_model=True, ) ], diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index a757f49e..3cf38fff 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -657,7 +657,7 @@ export interface paths { patch?: never; trace?: never; }; - "/api/projects/{project_id}/tasks/{task_id}/sdf": { + "/api/projects/{project_id}/tasks/{task_id}/create_evaluator": { parameters: { query?: never; header?: never; @@ -666,8 +666,42 @@ export interface paths { }; get?: never; put?: never; - /** Generate Evaluator */ - post: operations["generate_evaluator_api_projects__project_id__tasks__task_id__sdf_post"]; + /** Create Evaluator */ + post: operations["create_evaluator_api_projects__project_id__tasks__task_id__create_evaluator_post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Eval */ + get: operations["get_eval_api_projects__project_id__tasks__task_id__eval__eval_id__get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/create_eval_config": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Create Eval Config */ + post: operations["create_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__create_eval_config_post"]; delete?: never; options?: never; head?: never; @@ -687,6 +721,34 @@ export interface components { /** Models */ models: components["schemas"]["ModelDetails"][]; }; + 
/** + * BasePrompt + * @description A prompt for a task. This is the basic data storage format which can be used throughout a project. + * + * The "Prompt" model name is reserved for the custom prompts parented by a task. + */ + BasePrompt: { + /** + * Name + * @description A name for this entity. + */ + name: string; + /** + * Generator Id + * @description The id of the generator that created this prompt. + */ + generator_id?: string | null; + /** + * Prompt + * @description The prompt for the task. + */ + prompt: string; + /** + * Chain Of Thought Instructions + * @description Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided. + */ + chain_of_thought_instructions?: string | null; + }; /** Body_edit_tags_api_projects__project_id__tasks__task_id__runs_edit_tags_post */ Body_edit_tags_api_projects__project_id__tasks__task_id__runs_edit_tags_post: { /** Run Ids */ @@ -709,6 +771,31 @@ export interface components { /** Description */ description?: string | null; }; + /** CreateEvalConfigRequest */ + CreateEvalConfigRequest: { + type: components["schemas"]["EvalConfigType"]; + /** Properties */ + properties: Record; + /** Model Name */ + model_name: string; + provider: components["schemas"]["ModelProviderName"]; + /** Prompt Id */ + prompt_id: string; + }; + /** CreateEvaluatorRequest */ + CreateEvaluatorRequest: { + /** Name */ + name: string; + /** Description */ + description: string; + template: components["schemas"]["EvalTemplate"] | null; + /** Output Scores */ + output_scores: components["schemas"]["EvalOutputScore"][]; + /** Eval Set Filter Id */ + eval_set_filter_id: string; + /** Eval Configs Filter Id */ + eval_configs_filter_id: string; + }; /** * CreateFinetuneRequest * @description Request to create a finetune @@ -954,6 +1041,110 @@ export interface components { * @enum {string} */ DatasetSplitType: "train_test" | "train_test_val" | "train_test_val_80" | "all"; + /** Eval */ + Eval: { + /** + * V + * @default 1 + */ + v: number; + /** Id */ + id?: string | null; + /** Path */ + path?: string | null; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** Created By */ + created_by?: string; + /** + * Name + * @description A name for this entity. + */ + name: string; + /** + * Description + * @description The description of the eval + */ + description?: string | null; + /** + * @description The state of the eval: enabled or disabled. + * @default enabled + */ + state: components["schemas"]["EvalState"]; + /** @description The template selected when creating this eval. Useful for suggesting eval steps and output scores. */ + template?: components["schemas"]["EvalTemplate"] | null; + /** + * Current Config Id + * @description The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs. + */ + current_config_id?: string | null; + /** + * Eval Set Filter Id + * @description The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id. + */ + eval_set_filter_id: string; + /** + * Eval Configs Filter Id + * @description The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id. 
+ */ + eval_configs_filter_id: string; + /** + * Output Scores + * @description The scores this evaluator should produce. + */ + output_scores: components["schemas"]["EvalOutputScore"][]; + /** Model Type */ + readonly model_type: string; + }; + /** + * EvalConfig + * @description A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc. + * + * A eval might have many configs, example running the same eval with 2 different models. Comparing eval results is only valid when the same eval is run with the same config. + */ + EvalConfig: { + /** + * V + * @default 1 + */ + v: number; + /** Id */ + id?: string | null; + /** Path */ + path?: string | null; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** Created By */ + created_by?: string; + /** @description The model to use for this eval config. */ + model: components["schemas"]["DataSource"]; + /** + * @description This is used to determine the type of eval to run. + * @default g_eval + */ + config_type: components["schemas"]["EvalConfigType"]; + /** + * Properties + * @description Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict. + * @default {} + */ + properties: Record; + /** @description The prompt to use for this eval config. Both when running the task to generate outputs to evaluate and when explaining to the eval model what the goal of the task was. This is a frozen prompt, so this eval config is consistent over time (for example, if the user selects multi-shot prompting, this saves that dynamic prompt at the point the eval config is created). Freezing the prompt ensures consistent evals. */ + prompt: components["schemas"]["BasePrompt"]; + /** Model Type */ + readonly model_type: string; + }; + /** + * EvalConfigType + * @enum {string} + */ + EvalConfigType: "g_eval" | "llm_as_judge"; /** * EvalOutputScore * @description A definition of a score that an evaluator will produce. @@ -974,6 +1165,17 @@ export interface components { /** @description The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical'). */ type: components["schemas"]["TaskOutputRatingType"]; }; + /** + * EvalState + * @enum {string} + */ + EvalState: "enabled" | "disabled"; + /** + * EvalTemplate + * @description An eval template is a pre-defined eval that can be used as a starting point for a new eval. + * @enum {string} + */ + EvalTemplate: "kiln_requirements" | "toxicity" | "bias" | "maliciousness" | "factual_correctness" | "jailbreak"; /** * FineTuneParameter * @description A parameter for a fine-tune. Hyperparameters, etc. 
@@ -1200,6 +1402,8 @@ export interface components { supports_structured_output: boolean; /** Supports Data Gen */ supports_data_gen: boolean; + /** Supports Logprobs */ + supports_logprobs: boolean; /** * Untested Model * @default false @@ -1215,6 +1419,12 @@ export interface components { * @enum {string} */ ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b" | "deepseek_r1_distill_qwen_1p5b" | "deepseek_r1_distill_qwen_7b" | "deepseek_r1_distill_llama_8b"; + /** + * ModelProviderName + * @description Enumeration of supported AI model providers. + * @enum {string} + */ + ModelProviderName: "openai" | "groq" | "amazon_bedrock" | "ollama" | "openrouter" | "fireworks_ai" | "kiln_fine_tune" | "kiln_custom_registry" | "openai_compatible"; /** OllamaConnection */ OllamaConnection: { /** Message */ @@ -3302,7 +3512,7 @@ export interface operations { }; }; }; - generate_evaluator_api_projects__project_id__tasks__task_id__sdf_post: { + create_evaluator_api_projects__project_id__tasks__task_id__create_evaluator_post: { parameters: { query?: never; header?: never; @@ -3312,6 +3522,43 @@ export interface operations { }; cookie?: never; }; + requestBody: { + content: { + "application/json": components["schemas"]["CreateEvaluatorRequest"]; + }; + }; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["Eval"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + get_eval_api_projects__project_id__tasks__task_id__eval__eval_id__get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + eval_id: string; + }; + cookie?: never; + }; requestBody?: never; responses: { /** @description Successful Response */ @@ -3320,7 +3567,44 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["EvalOutputScore"]; + "application/json": components["schemas"]["Eval"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + create_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__create_eval_config_post: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + eval_id: string; + }; + cookie?: never; + }; + requestBody: { + content: { + "application/json": components["schemas"]["CreateEvalConfigRequest"]; + }; + }; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["EvalConfig"]; }; }; /** @description Validation Error */ diff --git 
a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts index c29ef5a3..516acc44 100644 --- a/app/web_ui/src/lib/types.ts +++ b/app/web_ui/src/lib/types.ts @@ -20,3 +20,6 @@ export type RunSummary = components["schemas"]["RunSummary"] export type PromptResponse = components["schemas"]["PromptResponse"] export type FinetuneDataStrategy = components["schemas"]["FinetuneDataStrategy"] export type EvalOutputScore = components["schemas"]["EvalOutputScore"] +export type EvalTemplate = components["schemas"]["EvalTemplate"] +export type Eval = components["schemas"]["Eval"] +export type EvalConfigType = components["schemas"]["EvalConfigType"] diff --git a/app/web_ui/src/lib/utils/form_element.svelte b/app/web_ui/src/lib/utils/form_element.svelte index fd4fc903..6329fe56 100644 --- a/app/web_ui/src/lib/utils/form_element.svelte +++ b/app/web_ui/src/lib/utils/form_element.svelte @@ -12,6 +12,7 @@ export let max_length: number | null = null export let error_message: string | null = null // start null because they haven't had a chance to edit it yet export let light_label: boolean = false // styling + export let hide_label: boolean = false export let select_options: [unknown, string][] = [] export let select_options_grouped: [string, [unknown, string][]][] = [] export let on_select: (e: Event) => void = () => {} @@ -75,7 +76,7 @@ for={id} class="text-sm font-medium text-left flex flex-col gap-1 pb-[4px]" > -
+
{label} diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte new file mode 100644 index 00000000..7bbff52d --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte @@ -0,0 +1,340 @@ + + +
+ + {#if loading} +
+
+
+ {:else if loading_error} +
+
Error Loading Task Information
+
+ {loading_error?.getMessage() || "An unknown error occurred"} +
+
+ {:else} + +
Part 1: Select Evaluator Algorithm
+ +
+ {#each evaluator_algorithms as evaluator} + + {/each} +
+ + {#if selected_algo} +
+
+ Part 2: Select Prompt and Model +
+
+ Specify which prompt and model will be used to run the eval. +
+
+ + + + + {/if} + + {#if selected_algo && model && prompt_method} +
+
+ Part 3: Evaluation Instructions +
+
+ This is a list of instructions to be used by the evaluator's + model. It will 'think' through each of these steps in order before + generating final scores. +
+ {#if evaluator?.template} +
+ We've pre-populated the evaluation steps for you based on the + template you selected ({evaluator.template}). Feel free to edit. +
+ {/if} +
+ + + + + {/if} +
+ {/if} +
+
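The page above ultimately submits the chosen evaluator algorithm, prompt/model pair, and evaluation steps to the new create_eval_config endpoint. A minimal sketch of that request follows, assuming the shared openapi-fetch client exported from $lib/api_client; the ids, model name, and step text are illustrative, not values from this patch:

    import { client } from "$lib/api_client"

    // Sketch only: create a G-Eval config for an existing eval.
    // The body mirrors CreateEvalConfigRequest (type, properties, model_name,
    // provider, prompt_id); "eval_steps" matches the list used in the tests.
    async function createGEvalConfig(
      project_id: string,
      task_id: string,
      eval_id: string,
    ) {
      const { data, error } = await client.POST(
        "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/create_eval_config",
        {
          params: { path: { project_id, task_id, eval_id } },
          body: {
            type: "g_eval",
            properties: { eval_steps: ["Check factual accuracy", "Check tone"] },
            model_name: "gpt_4o", // plain string; any available model id
            provider: "openai",
            prompt_id: "simple_chain_of_thought_prompt_builder",
          },
        },
      )
      if (error) {
        throw error
      }
      return data // the saved EvalConfig, including its generated id
    }
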
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.ts b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.ts new file mode 100644 index 00000000..9786e09d --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.ts @@ -0,0 +1 @@ +export const prerender = false diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte index b7ce6987..345e6d37 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte @@ -1,8 +1,8 @@
{#if loading}
@@ -80,11 +161,16 @@ /> {:else} 0 && output_scores[0].name)) + )} >
Part 1: Evaluator Details
Define the scores that the evaluator will output.
- {#if selected_template !== "custom"} + {#if selected_template !== "none"}
@@ -133,7 +219,7 @@ light_label={true} bind:value={output_scores[item_index].name} max_length={32} - disabled={selected_template !== "custom"} + disabled={selected_template !== "none"} />
@@ -148,7 +234,7 @@ ["pass_fail_critical", "Pass / Fail / Critical"], ]} bind:value={output_scores[item_index].type} - disabled={selected_template !== "custom"} + disabled={selected_template !== "none"} />
@@ -159,7 +245,7 @@ id="score_instructions_{item_index}" light_label={true} bind:value={output_scores[item_index].instruction} - disabled={selected_template !== "custom"} + disabled={selected_template !== "none"} />
@@ -167,13 +253,82 @@
- Part 3: Evaluation Datasets + Part 3: Evaluation Dataset
- Specify which which parts of your dataset this evaluator should run + Specify which part of your dataset this evaluator should run on.
+ + + {#if eval_dataset === "custom_tag"} + + {/if} + +
+
Part 4: Dataset to Evaluate Evaluation Configs
+
Specify which part of your dataset this evaluator should run + on when attempting to find the ideal evaluation config (prompt, + model, etc).
+
+ + + {#if config_dataset === "custom_tag"} + + {/if} {/if} diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/eval_template.ts b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/eval_template.ts index 77884823..3a36e57b 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/eval_template.ts +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/eval_template.ts @@ -1,7 +1,8 @@ -import type { EvalOutputScore } from "$lib/types" +import type { EvalOutputScore, EvalTemplate } from "$lib/types" export type EvalTemplateResult = { - template_id: string + // Server IDs are EvalTemplate. We have a custom "none" value for the UI. + template_id: EvalTemplate | "none" name: string description: string output_scores: EvalOutputScore[] diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/select_eval_template.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/select_eval_template.svelte index 84dc59e1..efac69e3 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/select_eval_template.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/select_eval_template.svelte @@ -1,25 +1,46 @@ -
-
+
+
Select Evaluator Template
{#each evaluator_template_descriptions as template_description} @@ -125,12 +202,16 @@
Recommended
+ {:else if template_description.highlight_title} +
+ {template_description.highlight_title} +
{/if} -
+
{template_description.name}
-
+
{template_description.description}
diff --git a/app/web_ui/src/routes/(app)/run/available_models_dropdown.svelte b/app/web_ui/src/routes/(app)/run/available_models_dropdown.svelte index 4350eab7..917a2182 100644 --- a/app/web_ui/src/routes/(app)/run/available_models_dropdown.svelte +++ b/app/web_ui/src/routes/(app)/run/available_models_dropdown.svelte @@ -12,12 +12,14 @@ export let model: string = $ui_state.selected_model export let requires_structured_output: boolean = false export let requires_data_gen: boolean = false + export let requires_logprobs: boolean = false export let error_message: string | null = null $: $ui_state.selected_model = model $: model_options = format_model_options( $available_models || {}, requires_structured_output, requires_data_gen, + requires_logprobs, $ui_state.current_task_id, ) @@ -31,6 +33,7 @@ providers: AvailableModels[], structured_output: boolean, requires_data_gen: boolean, + requires_logprobs: boolean, current_task_id: string | null, ): [string, [unknown, string][]][] { let options = [] @@ -63,6 +66,10 @@ unsupported_models.push([id, long_label]) continue } + if (requires_logprobs && !model.supports_logprobs) { + unsupported_models.push([id, long_label]) + continue + } model_list.push([id, model.name]) } if (model_list.length > 0) { @@ -75,9 +82,14 @@ } if (unsupported_models.length > 0) { - const not_recommended_label = requires_data_gen - ? "Not Recommended - Data Gen Not Supported" - : "Not Recommended - Structured Output Fails" + let not_recommended_label = "Not Recommended" + if (requires_data_gen) { + not_recommended_label = "Not Recommended - Data Gen Not Supported" + } else if (requires_structured_output) { + not_recommended_label = "Not Recommended - Structured Output Fails" + } else if (requires_logprobs) { + not_recommended_label = "Not Recommended - Logprobs Not Supported" + } options.push([not_recommended_label, unsupported_models]) } @@ -118,6 +130,10 @@ + {:else if requires_logprobs} + {:else if requires_structured_output} Date: Thu, 20 Feb 2025 22:35:29 -0500 Subject: [PATCH 033/102] Fix up main evals screen. Design far from final but it's functional. 
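This commit adds a list endpoint for evals and rewires the main evals page to fetch from it. A minimal sketch of that fetch, assuming the shared openapi-fetch client from $lib/api_client and the Eval type alias added in the previous commit (error handling trimmed to the essentials):

    import { client } from "$lib/api_client"
    import type { Eval } from "$lib/types"

    // Sketch only: load all evals for a task via the new list endpoint.
    async function loadEvals(
      project_id: string,
      task_id: string,
    ): Promise<Eval[]> {
      const { data, error } = await client.GET(
        "/api/projects/{project_id}/tasks/{task_id}/evals",
        { params: { path: { project_id, task_id } } },
      )
      if (error) {
        // The page surfaces this via KilnError; rethrowing keeps the sketch short.
        throw error
      }
      return data
    }
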
--- app/desktop/studio_server/evals_api.py | 5 + app/desktop/studio_server/test_eval_api.py | 13 +++ app/web_ui/src/lib/api_schema.d.ts | 49 ++++++++++ .../evals/[project_id]/[task_id]/+page.svelte | 91 +++++++++---------- 4 files changed, 111 insertions(+), 47 deletions(-) diff --git a/app/desktop/studio_server/evals_api.py b/app/desktop/studio_server/evals_api.py index 114e5343..4b0c7abf 100644 --- a/app/desktop/studio_server/evals_api.py +++ b/app/desktop/studio_server/evals_api.py @@ -74,6 +74,11 @@ async def create_evaluator( async def get_eval(project_id: str, task_id: str, eval_id: str) -> Eval: return eval_from_id(project_id, task_id, eval_id) + @app.get("/api/projects/{project_id}/tasks/{task_id}/evals") + async def get_evals(project_id: str, task_id: str) -> list[Eval]: + task = task_from_id(project_id, task_id) + return task.evals() + @app.post( "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/create_eval_config" ) diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index e4fe793d..1175077b 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -77,6 +77,19 @@ def mock_task_from_id(mock_task): yield mock +def test_get_evals_success(client, mock_task, mock_task_from_id, mock_eval): + mock_task_from_id.return_value = mock_task + + response = client.get("/api/projects/project1/tasks/task1/evals") + + assert response.status_code == 200 + result = response.json() + assert len(result) == 1 + assert result[0]["id"] == "eval1" + assert result[0]["name"] == "Test Eval" + mock_task_from_id.assert_called_once_with("project1", "task1") + + def test_get_eval_success(client, mock_task, mock_task_from_id, mock_eval): mock_task_from_id.return_value = mock_task diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index 3cf38fff..f957cf3b 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -691,6 +691,23 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/evals": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Evals */ + get: operations["get_evals_api_projects__project_id__tasks__task_id__evals_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/create_eval_config": { parameters: { query?: never; @@ -3581,6 +3598,38 @@ export interface operations { }; }; }; + get_evals_api_projects__project_id__tasks__task_id__evals_get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["Eval"][]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; create_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__create_eval_config_post: { parameters: { query?: never; diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte index 800d162d..45c572b6 100644 --- 
a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte @@ -1,53 +1,58 @@ - {#if loading} + {#if evals_loading}
@@ -67,46 +72,38 @@
- {:else if evaluators} + {:else if evals_error} +
+
Error Loading Evaluators
+
+ {evals_error.getMessage() || "An unknown error occurred"} +
+
+ {:else if evals}
- - - - {#each evaluators as evaluator} + {#each evals as evaluator} { - goto( - `/evals/${project_id}/${task_id}/evaluator/${evaluator.id}`, - ) + goto(`/evals/${project_id}/${task_id}/eval/${evaluator.id}`) }} > - - - {/each}
ID Name Provider Base Model Created At
{evaluator.id} {evaluator.name} {provider_name_from_id(evaluator.provider)} {evaluator.base_model_id} {formatDate(evaluator.created_at)}
- {:else if evaluators_error} -
-
Error Loading Evaluators
-
- {evaluators_error.getMessage() || "An unknown error occurred"} -
-
{/if}
From a9a6bb51757debd615b4d58168842657f6a4fd68 Mon Sep 17 00:00:00 2001 From: scosman Date: Fri, 21 Feb 2025 12:07:55 -0500 Subject: [PATCH 034/102] WIP evaluator view --- app/desktop/studio_server/evals_api.py | 7 + app/desktop/studio_server/test_eval_api.py | 51 ++++ app/web_ui/src/lib/api_schema.d.ts | 50 ++++ app/web_ui/src/lib/types.ts | 1 + .../evals/[project_id]/[task_id]/+page.svelte | 2 +- .../[task_id]/[eval_id]/+page.svelte | 277 ++++++++++++++++++ .../[project_id]/[task_id]/[eval_id]/+page.ts | 1 + 7 files changed, 388 insertions(+), 1 deletion(-) create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.ts diff --git a/app/desktop/studio_server/evals_api.py b/app/desktop/studio_server/evals_api.py index 4b0c7abf..f967a5f9 100644 --- a/app/desktop/studio_server/evals_api.py +++ b/app/desktop/studio_server/evals_api.py @@ -79,6 +79,13 @@ async def get_evals(project_id: str, task_id: str) -> list[Eval]: task = task_from_id(project_id, task_id) return task.evals() + @app.get("/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_configs") + async def get_eval_configs( + project_id: str, task_id: str, eval_id: str + ) -> list[EvalConfig]: + eval = eval_from_id(project_id, task_id, eval_id) + return eval.configs() + @app.post( "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/create_eval_config" ) diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index 1175077b..78e304eb 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -70,6 +70,32 @@ def mock_eval(mock_task): return eval +@pytest.fixture +def mock_eval_config(mock_eval): + eval_config = EvalConfig( + id="eval_config1", + config_type=EvalConfigType.g_eval, + properties={"eval_steps": ["step1", "step2"]}, + parent=mock_eval, + model=DataSource( + id="model1", + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt-4", + "model_provider": "openai", + "adapter_name": "TODO", + }, + ), + prompt=BasePrompt( + name="test", + prompt="base prompt", + chain_of_thought_instructions="cot prompt", + ), + ) + eval_config.save_to_file() + return eval_config + + @pytest.fixture def mock_task_from_id(mock_task): with patch("app.desktop.studio_server.evals_api.task_from_id") as mock: @@ -209,3 +235,28 @@ async def test_create_eval_config( assert config.prompt.chain_of_thought_instructions == "cot prompt" assert config.properties["eval_steps"][0] == "step1" assert config.properties["eval_steps"][1] == "step2" + + +def test_get_eval_configs( + client, mock_task_from_id, mock_eval, mock_task, mock_eval_config +): + mock_task_from_id.return_value = mock_task + + with patch("app.desktop.studio_server.evals_api.eval_from_id") as mock_eval_from_id: + mock_eval_from_id.return_value = mock_eval + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1/eval_configs" + ) + + assert response.status_code == 200 + configs = response.json() + assert isinstance(configs, list) + assert len(configs) == 1 + + config = configs[0] + assert config["config_type"] == mock_eval_config.config_type + assert config["properties"] == mock_eval_config.properties + assert config["model"]["type"] == mock_eval_config.model.type + assert isinstance(config["prompt"], dict) + + mock_eval_from_id.assert_called_once_with("project1", "task1", "eval1") diff --git a/app/web_ui/src/lib/api_schema.d.ts 
b/app/web_ui/src/lib/api_schema.d.ts index f957cf3b..aa0b336c 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -708,6 +708,23 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_configs": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Eval Configs */ + get: operations["get_eval_configs_api_projects__project_id__tasks__task_id__eval__eval_id__eval_configs_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/create_eval_config": { parameters: { query?: never; @@ -3630,6 +3647,39 @@ export interface operations { }; }; }; + get_eval_configs_api_projects__project_id__tasks__task_id__eval__eval_id__eval_configs_get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + eval_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["EvalConfig"][]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; create_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__create_eval_config_post: { parameters: { query?: never; diff --git a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts index 516acc44..7aad5ae2 100644 --- a/app/web_ui/src/lib/types.ts +++ b/app/web_ui/src/lib/types.ts @@ -23,3 +23,4 @@ export type EvalOutputScore = components["schemas"]["EvalOutputScore"] export type EvalTemplate = components["schemas"]["EvalTemplate"] export type Eval = components["schemas"]["Eval"] export type EvalConfigType = components["schemas"]["EvalConfigType"] +export type EvalConfig = components["schemas"]["EvalConfig"] diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte index 45c572b6..012c49c3 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte @@ -95,7 +95,7 @@ { - goto(`/evals/${project_id}/${task_id}/eval/${evaluator.id}`) + goto(`/evals/${project_id}/${task_id}/${evaluator.id}`) }} > {evaluator.id} diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte new file mode 100644 index 00000000..6680d79c --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -0,0 +1,277 @@ + + + + {#if loading} +
+
+
+ {:else if error} +
+
Error Loading Evaluators
+
+ {error.getMessage() || "An unknown error occurred"} +
+
+ {:else if evaluator} +
+
+
Properties
+
+ {#each get_eval_properties(evaluator) as property} +
{property.name}
+
+ {property.value} +
+ {/each} +
+
+
+
+
Eval Config
+
+ How this evaluator will be run. +
+
+ +
+ {#each get_eval_config_properties(current_eval_config_id) as property} +
{property.name}
+
+ {property.value} +
+ {/each} +
+
+
+ {/if} +
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.ts b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.ts new file mode 100644 index 00000000..9786e09d --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.ts @@ -0,0 +1 @@ +export const prerender = false From 1ffa3e85bc1c86130dd2ff59f3d033f34e473259 Mon Sep 17 00:00:00 2001 From: scosman Date: Fri, 21 Feb 2025 15:59:56 -0500 Subject: [PATCH 035/102] Make the "add config" functional, and better labels/code sharing --- app/web_ui/src/lib/stores.ts | 23 +++++ .../[task_id]/[run_id]/run/+page.svelte | 24 +----- .../[task_id]/[eval_id]/+page.svelte | 85 ++++++++++++------- 3 files changed, 79 insertions(+), 53 deletions(-) diff --git a/app/web_ui/src/lib/stores.ts b/app/web_ui/src/lib/stores.ts index 74e946d1..5aefc889 100644 --- a/app/web_ui/src/lib/stores.ts +++ b/app/web_ui/src/lib/stores.ts @@ -229,6 +229,29 @@ export function provider_name_from_id(provider_id: string): string { return provider?.provider_name || provider_id } +export function prompt_name_from_id(prompt_id: string): string { + // Attempt to lookup a nice name for the prompt. First from named prompts, then from generators + // Special case for fine-tuned prompts + let prompt_name: string | undefined = undefined + if (prompt_id && prompt_id.startsWith("fine_tune_prompt::")) { + prompt_name = "Fine-Tune Prompt" + } + if (!prompt_name) { + prompt_name = get(current_task_prompts)?.prompts.find( + (prompt) => "id::" + prompt.id === prompt_id, + )?.name + } + if (!prompt_name) { + prompt_name = get(current_task_prompts)?.generators.find( + (generator) => generator.id === prompt_id, + )?.name + } + if (!prompt_name) { + prompt_name = prompt_id + } + return prompt_name +} + // Available prompts for the current export async function load_available_prompts() { const project = get(current_project) diff --git a/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte b/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte index 41b87ee3..85daf6c8 100644 --- a/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte +++ b/app/web_ui/src/routes/(app)/dataset/[project_id]/[task_id]/[run_id]/run/+page.svelte @@ -7,7 +7,7 @@ current_task, model_name, model_info, - current_task_prompts, + prompt_name_from_id, } from "$lib/stores" import { page } from "$app/stores" import { onMount } from "svelte" @@ -37,26 +37,6 @@ "" ).toString() - let prompt_name: string | undefined = undefined - // Attempt to lookup a nice name for the prompt. 
First from named prompts, then from generators - // Special case for fine-tuned prompts - if (prompt_id && prompt_id.startsWith("fine_tune_prompt::")) { - prompt_name = "Fine-Tune Prompt" - } - if (!prompt_name) { - prompt_name = $current_task_prompts?.prompts.find( - (prompt) => "id::" + prompt.id === prompt_id, - )?.name - } - if (!prompt_name) { - prompt_name = $current_task_prompts?.generators.find( - (generator) => generator.id === prompt_id, - )?.name - } - if (!prompt_name) { - prompt_name = prompt_id - } - let topic_path: string | undefined = undefined if ( run?.input_source?.properties?.topic_path && @@ -80,7 +60,7 @@ $model_info, ), "Model Provider": run?.output?.source?.properties?.model_provider, - Prompt: prompt_name, + Prompt: prompt_name_from_id(prompt_id), "Created By": run?.input_source?.properties?.created_by, "Created At": formatDate(run?.created_at), Topic: topic_path, diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index 6680d79c..51596841 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -7,7 +7,17 @@ import { page } from "$app/stores" import { formatDate } from "$lib/utils/formatters" import FormElement from "$lib/utils/form_element.svelte" - import type { EvalConfig, EvalConfigType } from "$lib/types" + import type { EvalConfig, EvalConfigType, ProviderModels } from "$lib/types" + import { goto } from "$app/navigation" + import { + model_info, + load_model_info, + model_name, + provider_name_from_id, + prompt_name_from_id, + load_available_prompts, + load_available_models, + } from "$lib/stores" $: project_id = $page.params.project_id $: task_id = $page.params.task_id @@ -31,6 +41,10 @@ // Can actually do these in parallel get_eval() get_eval_configs() + // These are all just needed for better labels + load_model_info() + load_available_prompts() + load_available_models() }) async function get_eval() { @@ -97,17 +111,11 @@ } $: add_eval_config(current_eval_config_id) - let last_selected_valid_id: string | null = null function add_eval_config(selected_id: string | null) { - if (selected_id !== "add_config") { - last_selected_valid_id = selected_id - return + if (selected_id === "add_config") { + goto(`/evals/${project_id}/${task_id}/${eval_id}/create_eval_config`) } - - // Reset the selected id, so we don't leave "add_config" selected - current_eval_config_id = last_selected_valid_id - alert("Not implemented") } type UiProperty = { @@ -124,10 +132,15 @@ ) } - function get_eval_config_name(eval_config: EvalConfig): string { + function get_eval_config_name( + eval_config: EvalConfig, + model_info: ProviderModels | null, + ): string { let name = eval_config_to_ui_name(eval_config.config_type) let parts = [] - parts.push(eval_config.model.properties["model_name"]) + parts.push( + model_name(eval_config.model.properties["model_name"], model_info), + ) parts.push(eval_config.prompt.name) return name + " (" + parts.join(", ") + ")" } @@ -169,6 +182,7 @@ } function get_eval_config_properties( eval_config_id: string | null, + model_info: ProviderModels | null, ): UiProperty[] { if (!eval_config_id) { return [] @@ -187,32 +201,39 @@ }) properties.push({ name: "Model", - value: eval_config.model.properties["model_name"] + "", + value: model_name( + eval_config.model.properties["model_name"] + "", + model_info, + ), }) 
properties.push({ name: "Provider", - value: eval_config.model.properties["model_provider"] + "", + value: provider_name_from_id( + eval_config.model.properties["model_provider"] + "", + ), }) properties.push({ name: "Prompt", - value: eval_config.prompt.name + "", + value: prompt_name_from_id(eval_config.prompt.name + ""), }) return properties } function get_eval_config_select_options( configs: EvalConfig[] | null, - ): [string, string][] { - if (!configs) { - return [] - } - const results: [string, string][] = [] - for (const c of configs) { + ): [string, [unknown, string][]][] { + const configs_options: [string, string][] = [] + for (const c of configs || []) { if (c.id) { - results.push([c.id, get_eval_config_name(c)]) + configs_options.push([c.id, get_eval_config_name(c, $model_info)]) } } - results.push(["add_config", "Add Config"]) + + const results: [string, [unknown, string][]][] = [] + if (configs_options.length > 0) { + results.push(["Eval Configs", configs_options]) + } + results.push(["Manage", [["add_config", "Add Config"]]]) return results } @@ -252,19 +273,21 @@
How this evaluator will be run.
+
-
- {#each get_eval_config_properties(current_eval_config_id) as property} + {#each get_eval_config_properties(current_eval_config_id, $model_info) as property}
{property.name}
{property.value} From 170cb34cc5ff0e1ca5c5f887c89f9cfd3c7cbd3e Mon Sep 17 00:00:00 2001 From: scosman Date: Fri, 21 Feb 2025 21:34:48 -0500 Subject: [PATCH 036/102] Checkpoint: create task_runs, async run API (not done), list task_runs, improved UI --- app/desktop/studio_server/evals_api.py | 126 ++++++- app/desktop/studio_server/test_eval_api.py | 39 ++- app/web_ui/src/lib/api_client.ts | 4 +- app/web_ui/src/lib/api_schema.d.ts | 239 ++++++++++++++ app/web_ui/src/lib/ui/dialog.svelte | 5 +- .../[task_id]/[eval_id]/+page.svelte | 309 ++++++++++++++++-- .../[eval_id]/create_eval_config/+page.svelte | 8 +- .../run/available_models_dropdown.svelte | 11 + libs/core/kiln_ai/datamodel/eval.py | 1 + libs/core/kiln_ai/datamodel/task.py | 3 + .../core/kiln_ai/datamodel/test_eval_model.py | 2 + 11 files changed, 711 insertions(+), 36 deletions(-) diff --git a/app/desktop/studio_server/evals_api.py b/app/desktop/studio_server/evals_api.py index f967a5f9..401f191b 100644 --- a/app/desktop/studio_server/evals_api.py +++ b/app/desktop/studio_server/evals_api.py @@ -1,6 +1,9 @@ +import asyncio +import json from typing import Any -from fastapi import FastAPI, HTTPException +from fastapi import FastAPI, HTTPException, Query +from fastapi.responses import StreamingResponse from kiln_ai.adapters.ml_model_list import ModelProviderName from kiln_ai.adapters.prompt_builders import prompt_builder_from_id from kiln_ai.datamodel import ( @@ -17,6 +20,8 @@ EvalOutputScore, EvalTemplate, ) +from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig +from kiln_ai.utils.name_generator import generate_memorable_name from kiln_server.task_api import task_from_id from pydantic import BaseModel @@ -33,6 +38,34 @@ def eval_from_id(project_id: str, task_id: str, eval_id: str) -> Eval: ) +def eval_config_from_id( + project_id: str, task_id: str, eval_id: str, eval_config_id: str +) -> EvalConfig: + eval = eval_from_id(project_id, task_id, eval_id) + for config in eval.configs(): + if config.id == eval_config_id: + return config + + raise HTTPException( + status_code=404, + detail=f"Eval config not found. ID: {eval_config_id}", + ) + + +def task_run_config_from_id( + project_id: str, task_id: str, run_config_id: str +) -> TaskRunConfig: + task = task_from_id(project_id, task_id) + for run_config in task.run_configs(): + if run_config.id == run_config_id: + return run_config + + raise HTTPException( + status_code=404, + detail=f"Task run config not found. 
ID: {run_config_id}", + ) + + class CreateEvaluatorRequest(BaseModel): name: str description: str @@ -43,6 +76,7 @@ class CreateEvaluatorRequest(BaseModel): class CreateEvalConfigRequest(BaseModel): + name: str | None = None type: EvalConfigType properties: dict[str, Any] model_name: str @@ -50,6 +84,18 @@ class CreateEvalConfigRequest(BaseModel): prompt_id: PromptId +class CreateTaskRunConfigRequest(BaseModel): + name: str | None = None + description: str | None = None + model_name: str + model_provider_name: ModelProviderName + prompt_id: PromptId + + +class RunEvalConfigRequest(BaseModel): + run_config_ids: list[str] + + def connect_evals_api(app: FastAPI): @app.post("/api/projects/{project_id}/tasks/{task_id}/create_evaluator") async def create_evaluator( @@ -70,6 +116,13 @@ async def create_evaluator( eval.save_to_file() return eval + @app.get("/api/projects/{project_id}/tasks/{task_id}/task_run_configs") + async def get_task_run_configs( + project_id: str, task_id: str + ) -> list[TaskRunConfig]: + task = task_from_id(project_id, task_id) + return task.run_configs() + @app.get("/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}") async def get_eval(project_id: str, task_id: str, eval_id: str) -> Eval: return eval_from_id(project_id, task_id, eval_id) @@ -86,6 +139,27 @@ async def get_eval_configs( eval = eval_from_id(project_id, task_id, eval_id) return eval.configs() + @app.post("/api/projects/{project_id}/tasks/{task_id}/task_run_config") + async def create_task_run_config( + project_id: str, + task_id: str, + request: CreateTaskRunConfigRequest, + ) -> TaskRunConfig: + task = task_from_id(project_id, task_id) + name = request.name or generate_memorable_name() + task_run_config = TaskRunConfig( + parent=task, + name=name, + description=request.description, + run_config_properties=RunConfigProperties( + model_name=request.model_name, + model_provider_name=request.model_provider_name, + prompt_id=request.prompt_id, + ), + ) + task_run_config.save_to_file() + return task_run_config + @app.post( "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/create_eval_config" ) @@ -97,6 +171,7 @@ async def create_eval_config( ) -> EvalConfig: task = task_from_id(project_id, task_id) eval = eval_from_id(project_id, task_id, eval_id) + name = request.name or generate_memorable_name() # Create a prompt instance to save to the eval config prompt_builder = prompt_builder_from_id(request.prompt_id, task) @@ -108,6 +183,7 @@ async def create_eval_config( ) eval_config = EvalConfig( + name=name, config_type=request.type, properties=request.properties, model=DataSource( @@ -124,3 +200,51 @@ async def create_eval_config( ) eval_config.save_to_file() return eval_config + + # JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better + @app.get( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run" + ) + async def run_eval_config( + project_id: str, + task_id: str, + eval_id: str, + eval_config_id: str, + run_config_ids: list[str] = Query([]), + all_run_configs: bool = Query(False), + ) -> StreamingResponse: + # TODO a lock by eval_id -> error if one is already running + + eval_config = eval_config_from_id(project_id, task_id, eval_id, eval_config_id) + + # Load the list of run configs to use. 
Two options: + run_configs: list[TaskRunConfig] = [] + if all_run_configs: + run_configs = task_from_id(project_id, task_id).run_configs() + else: + if len(run_config_ids) == 0: + raise HTTPException( + status_code=400, + detail="No run config ids provided. At least one run config id is required.", + ) + run_configs = [ + task_run_config_from_id(project_id, task_id, run_config_id) + for run_config_id in run_config_ids + ] + + async def event_generator(): + for i in range(10): # Simulate 10 steps + await asyncio.sleep(0.2) # Simulate work + data = { + "progress": i + 1, + "total": 10, + "status": "processing" if i < 9 else "complete", + } + print(data) + yield f"data: {json.dumps(data)}\n\n" + yield "data: complete\n\n" + + return StreamingResponse( + content=event_generator(), + media_type="text/event-stream", + ) diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index 78e304eb..1ab4ae52 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -74,6 +74,7 @@ def mock_eval(mock_task): def mock_eval_config(mock_eval): eval_config = EvalConfig( id="eval_config1", + name="Test Eval Config", config_type=EvalConfigType.g_eval, properties={"eval_steps": ["step1", "step2"]}, parent=mock_eval, @@ -154,6 +155,7 @@ def valid_evaluator_request(): @pytest.fixture def valid_eval_config_request(): return CreateEvalConfigRequest( + name="Test Eval Config", type=EvalConfigType.g_eval, properties={"eval_steps": ["step1", "step2"]}, model_name="gpt-4", @@ -181,6 +183,41 @@ async def test_create_evaluator( mock_save.assert_called_once() +@pytest.mark.asyncio +async def test_create_task_run_config(client, mock_task_from_id, mock_task): + mock_task_from_id.return_value = mock_task + + response = client.post( + "/api/projects/project1/tasks/task1/task_run_config", + json={ + "name": "Test Task Run Config", + "description": "Test Description", + "model_name": "gpt-4o", + "model_provider_name": "openai", + "prompt_id": "simple_chain_of_thought_prompt_builder", + }, + ) + + assert response.status_code == 200 + result = response.json() + assert result["name"] == "Test Task Run Config" + assert result["description"] == "Test Description" + assert result["run_config_properties"]["model_name"] == "gpt-4o" + assert result["run_config_properties"]["model_provider_name"] == "openai" + assert ( + result["run_config_properties"]["prompt_id"] + == "simple_chain_of_thought_prompt_builder" + ) + + # Fetch it from API + fetch_response = client.get("/api/projects/project1/tasks/task1/task_run_configs") + assert fetch_response.status_code == 200 + configs = fetch_response.json() + assert len(configs) == 1 + assert configs[0]["id"] == result["id"] + assert configs[0]["name"] == result["name"] + + @pytest.mark.asyncio async def test_create_eval_config( client, mock_task_from_id, valid_eval_config_request, mock_eval, mock_task @@ -192,7 +229,6 @@ async def test_create_eval_config( patch( "app.desktop.studio_server.evals_api.prompt_builder_from_id" ) as mock_prompt_builder, - # patch.object(EvalConfig, "save_to_file") as mock_save, ): mock_eval_from_id.return_value = mock_eval mock_prompt_builder.return_value.build_base_prompt.return_value = "base prompt" @@ -207,6 +243,7 @@ async def test_create_eval_config( assert response.status_code == 200 result = response.json() + assert result["name"] == valid_eval_config_request.name assert result["config_type"] == valid_eval_config_request.type assert result["properties"] == 
valid_eval_config_request.properties assert result["model"]["type"] == DataSourceType.synthetic diff --git a/app/web_ui/src/lib/api_client.ts b/app/web_ui/src/lib/api_client.ts index a39cf3dd..8b4e9e0e 100644 --- a/app/web_ui/src/lib/api_client.ts +++ b/app/web_ui/src/lib/api_client.ts @@ -1,6 +1,8 @@ import createClient from "openapi-fetch" import type { paths } from "./api_schema" +export const base_url = "http://localhost:8757" + export const client = createClient({ - baseUrl: "http://localhost:8757", + baseUrl: base_url, }) diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index aa0b336c..8bd29475 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -674,6 +674,23 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/task_run_configs": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Task Run Configs */ + get: operations["get_task_run_configs_api_projects__project_id__tasks__task_id__task_run_configs_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}": { parameters: { query?: never; @@ -725,6 +742,23 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/task_run_config": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Create Task Run Config */ + post: operations["create_task_run_config_api_projects__project_id__tasks__task_id__task_run_config_post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/create_eval_config": { parameters: { query?: never; @@ -742,6 +776,23 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Run Eval Config */ + get: operations["run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; } export type webhooks = Record; export interface components { @@ -807,6 +858,8 @@ export interface components { }; /** CreateEvalConfigRequest */ CreateEvalConfigRequest: { + /** Name */ + name?: string | null; type: components["schemas"]["EvalConfigType"]; /** Properties */ properties: Record; @@ -861,6 +914,18 @@ export interface components { custom_thinking_instructions?: string | null; data_strategy: components["schemas"]["FinetuneDataStrategy"]; }; + /** CreateTaskRunConfigRequest */ + CreateTaskRunConfigRequest: { + /** Name */ + name?: string | null; + /** Description */ + description?: string | null; + /** Model Name */ + model_name: string; + model_provider_name: components["schemas"]["ModelProviderName"]; + /** Prompt Id */ + prompt_id: string; + }; /** DataGenCategoriesApiInput */ DataGenCategoriesApiInput: { /** @@ -1156,6 +1221,11 @@ export interface components { created_at?: string; /** Created By */ created_by?: string; + /** + * Name + * @description A name for this entity. 
+ */ + name: string; /** @description The model to use for this eval config. */ model: components["schemas"]["DataSource"]; /** @@ -1677,6 +1747,30 @@ export interface components { /** @description The type of rating */ type: components["schemas"]["TaskOutputRatingType"]; }; + /** + * RunConfigProperties + * @description A configuration for running a task. + * + * This includes everything needed to run a task, except the input and task ID. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). + */ + RunConfigProperties: { + /** + * Model Name + * @description The model to use for this run config. + */ + model_name: string; + /** + * Model Provider Name + * @description The provider to use for this run config. + */ + model_provider_name: string; + /** + * Prompt Id + * @description The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided. + * @default simple_prompt_builder + */ + prompt_id: string; + }; /** RunSummary */ RunSummary: { /** Id */ @@ -2087,6 +2181,46 @@ export interface components { /** Model Type */ readonly model_type: string; }; + /** + * TaskRunConfig + * @description A Kiln model for persisting a run config in a Kiln Project, nested under a task. + * + * Typically used to save a method of running a task for evaluation. + * + * A run config includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). + */ + TaskRunConfig: { + /** + * V + * @default 1 + */ + v: number; + /** Id */ + id?: string | null; + /** Path */ + path?: string | null; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** Created By */ + created_by?: string; + /** + * Name + * @description A name for this entity. + */ + name: string; + /** + * Description + * @description The description of the task run config. + */ + description?: string | null; + /** @description The run config properties to use for this task run. 
*/ + run_config_properties: components["schemas"]["RunConfigProperties"]; + /** Model Type */ + readonly model_type: string; + }; /** ValidationError */ ValidationError: { /** Location */ @@ -3582,6 +3716,38 @@ export interface operations { }; }; }; + get_task_run_configs_api_projects__project_id__tasks__task_id__task_run_configs_get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["TaskRunConfig"][]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; get_eval_api_projects__project_id__tasks__task_id__eval__eval_id__get: { parameters: { query?: never; @@ -3680,6 +3846,42 @@ export interface operations { }; }; }; + create_task_run_config_api_projects__project_id__tasks__task_id__task_run_config_post: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + }; + cookie?: never; + }; + requestBody: { + content: { + "application/json": components["schemas"]["CreateTaskRunConfigRequest"]; + }; + }; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["TaskRunConfig"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; create_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__create_eval_config_post: { parameters: { query?: never; @@ -3717,4 +3919,41 @@ export interface operations { }; }; }; + run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_get: { + parameters: { + query?: { + run_config_ids?: string[]; + all_run_configs?: boolean; + }; + header?: never; + path: { + project_id: string; + task_id: string; + eval_id: string; + eval_config_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": unknown; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; } diff --git a/app/web_ui/src/lib/ui/dialog.svelte b/app/web_ui/src/lib/ui/dialog.svelte index 9645b6f0..ffd23807 100644 --- a/app/web_ui/src/lib/ui/dialog.svelte +++ b/app/web_ui/src/lib/ui/dialog.svelte @@ -9,6 +9,7 @@ asyncAction?: () => Promise action?: () => boolean isCancel?: boolean + isPrimary?: boolean disabled?: boolean } export let action_buttons: ActionButton[] = [] @@ -91,7 +92,9 @@ {:else}
Results
+
+ Filtered by the selected eval config and grouped by task run config. +
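Note on the run endpoint added above: it streams Server-Sent Events over GET (the inline comment explains that the browser EventSource client cannot POST), and the run configs to evaluate are selected with the run_config_ids or all_run_configs query parameters. A minimal client sketch for exercising it, assuming httpx is installed and the desktop server is listening on the local port used by api_client.ts; the function name is illustrative, and the fields mirror the placeholder generator in this commit (progress, total, status) plus its final "complete" sentinel:

import json

import httpx


def stream_eval_progress(
    project_id: str, task_id: str, eval_id: str, eval_config_id: str
) -> None:
    # Path mirrors the route added in this commit; all_run_configs=True evaluates
    # every task run config instead of passing an explicit run_config_ids list.
    url = (
        f"http://localhost:8757/api/projects/{project_id}/tasks/{task_id}"
        f"/eval/{eval_id}/eval_config/{eval_config_id}/run"
    )
    with httpx.stream(
        "GET", url, params={"all_run_configs": True}, timeout=None
    ) as response:
        for line in response.iter_lines():
            if not line.startswith("data: "):
                continue  # skip the blank separator lines between SSE events
            payload = line[len("data: ") :]
            if payload == "complete":
                break  # final sentinel event emitted after the loop finishes
            progress = json.loads(payload)
            print(progress["progress"], "of", progress["total"], progress["status"])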
- + From 5ae11a27d18d37cbd36f0e1d1852199764db7c53 Mon Sep 17 00:00:00 2001 From: scosman Date: Sat, 22 Feb 2025 11:02:02 -0500 Subject: [PATCH 038/102] Nice eval progress UI, and fix a bug where the eval runner didn't work with structured tasks --- .../[task_id]/[eval_id]/+page.svelte | 279 +++++++++++++----- libs/core/kiln_ai/adapters/eval/base_eval.py | 7 +- 2 files changed, 208 insertions(+), 78 deletions(-) diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index bfb7f8c0..c9adaf77 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -25,6 +25,7 @@ import Dialog from "$lib/ui/dialog.svelte" import AvailableModelsDropdown from "../../../../run/available_models_dropdown.svelte" import PromptTypeSelector from "../../../../run/prompt_type_selector.svelte" + import Warning from "$lib/ui/warning.svelte" $: project_id = $page.params.project_id $: task_id = $page.params.task_id @@ -244,14 +245,14 @@ value: eval_config_to_ui_name(eval_config.config_type), }) properties.push({ - name: "Model", + name: "Eval Model", value: model_name( eval_config.model.properties["model_name"] + "", model_info, ), }) properties.push({ - name: "Provider", + name: "Eval Provider", value: provider_name_from_id( eval_config.model.properties["model_provider"] + "", ), @@ -282,17 +283,30 @@ } let run_dialog: Dialog | null = null + let running_progress_dialog: Dialog | null = null - let eval_running = false let eval_run_error: KilnError | null = null - let progress = "not_started" + let eval_state: + | "not_started" + | "running" + | "complete" + | "complete_with_errors" = "not_started" + let eval_complete_count = 0 + let eval_total_count = 0 + let eval_error_count = 0 + function run_eval(): boolean { - progress = "starting" if (!current_eval_config_id) { - throw new Error("No eval config selected") + eval_run_error = new KilnError("No eval config selected", null) + eval_state = "complete_with_errors" + return false } - eval_running = true + eval_state = "running" + eval_complete_count = 0 + eval_total_count = 0 + eval_error_count = 0 + const eventSource = new EventSource( `${base_url}/api/projects/${project_id}/tasks/${task_id}/eval/${eval_id}/eval_config/${current_eval_config_id}/run?all_run_configs=true`, ) @@ -300,27 +314,31 @@ eventSource.onmessage = (event) => { try { if (event.data === "complete") { - progress = "complete" eventSource.close() - eval_running = false + eval_state = + eval_error_count > 0 ? 
"complete_with_errors" : "complete" } else { const data = JSON.parse(event.data) - progress = data.progress + eval_complete_count = data.progress + eval_total_count = data.total + eval_error_count = data.errors + eval_state = "running" } } catch (error) { - console.error("Error parsing SSE data:", error) + eval_run_error = createKilnError(error) + eval_state = "complete_with_errors" } } // Don't restart on an error eventSource.onerror = (error) => { - console.error("SSE error:", error) eventSource.close() - progress = "error" - eval_running = false + eval_state = "complete_with_errors" eval_run_error = createKilnError(error) } + // Switch over to the progress dialog + running_progress_dialog?.show() return true } @@ -374,21 +392,10 @@ { - add_task_config_dialog?.show() - }, - primary: true, - }, - { - label: "Run Evals", - handler: () => { - run_dialog?.show() - }, - primary: true, + label: "Evaluate Eval Configs", + href: `/evals/${project_id}/${task_id}/${eval_id}/TODO`, }, ]} > @@ -422,12 +429,8 @@
-
Config
- +
Config
+ {/each} +
Config Quality
+
-
-
Results
-
- Filtered by the selected eval config and grouped by task run config. -
-
-
Name Prompt Model Provider Prompt
- - - - - - - - - - {#each task_run_configs || [] as task_run_config} - + {#if task_run_configs?.length} +
+
+
Results
+
+ Filtered by the selected eval config and grouped by task run + config. +
+
+
+ {#if eval_state === "not_started"} + + + {:else} +
- - - + {#if eval_state === "running"} +
+ Running... + {:else if eval_state === "complete"} + Eval Complete + {:else if eval_state === "complete_with_errors"} + Eval Complete with Errors + {:else} + Eval Status + {/if} + + {/if} + + +
+
Name Model Provider Prompt
{task_run_config.name} - {model_name( - task_run_config?.run_config_properties?.model_name, - $model_info, - )} - - {provider_name_from_id( - task_run_config?.run_config_properties?.model_provider_name, - )} - - {prompt_name_from_id( - task_run_config?.run_config_properties?.prompt_id, - )} -
+ + + + + + - {/each} - -
Run Config Name Task Model Task Provider Task Prompt
-
+ + + {#each task_run_configs || [] as task_run_config} + { + console.log("TODO: link") + }} + > + {task_run_config.name} + + {model_name( + task_run_config?.run_config_properties?.model_name, + $model_info, + )} + + + {provider_name_from_id( + task_run_config?.run_config_properties + ?.model_provider_name, + )} + + + {prompt_name_from_id( + task_run_config?.run_config_properties?.prompt_id, + )} + + + {/each} + + +
+ {:else} +
Results
+
+
Create a Run Config
+
+ A task run config defines how the task is run, such as which model + and prompt to use. Create one to run this evaluator. +
+ +
+ {/if}
{/if} @@ -536,6 +604,56 @@
+ +
+ {#if eval_state === "complete"} +
Eval Complete 🎉
+ {#if eval_total_count == 0} +
+ No evals were run, because everything was already up to date! +
+ {/if} + {:else if eval_state === "complete_with_errors"} +
Eval Complete with Errors
+ {:else if eval_state === "running"} +
+
Running...
+ {/if} +
+ {#if eval_total_count > 0} +
+ {eval_complete_count + eval_error_count} of {eval_total_count} +
+ {/if} + {#if eval_error_count > 0} +
+ {eval_error_count} error{eval_error_count === 1 ? "" : "s"} +
+ {/if} + {#if eval_run_error} +
+ {eval_run_error.getMessage() || "An unknown error occurred"} +
+ {/if} +
+
+
+ -
-
Run Eval
+
+
Run this eval on the selected eval configuration?
+
Don't close this page if you want to monitor progress.
+
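The progress dialog added above expects each SSE message to carry progress, total, and errors counts, and treats the bare "complete" event as the end marker; the placeholder generator from the earlier commit still reports a status field rather than errors, so the real runner endpoint is expected to emit this richer shape. A sketch of a generator producing what this UI parses, assuming a runner object exposing an async progress() iterator with complete/total/error counts (that runner API is a stand-in, not something defined in this patch):

import json
from typing import Any, AsyncGenerator


async def eval_progress_events(runner: Any) -> AsyncGenerator[str, None]:
    # "runner" is a stand-in for the eventual eval runner behind this endpoint.
    async for update in runner.progress():
        payload = {
            "progress": update.complete,  # parsed into eval_complete_count
            "total": update.total,  # parsed into eval_total_count
            "errors": update.errors,  # parsed into eval_error_count
        }
        yield f"data: {json.dumps(payload)}\n\n"
    # Bare sentinel string the EventSource onmessage handler closes on.
    yield "data: complete\n\n"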
diff --git a/libs/core/kiln_ai/adapters/eval/base_eval.py b/libs/core/kiln_ai/adapters/eval/base_eval.py index cd4f9147..70bff103 100644 --- a/libs/core/kiln_ai/adapters/eval/base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/base_eval.py @@ -48,8 +48,13 @@ async def run(self, input: str) -> tuple[TaskRun, EvalScores]: base_adapter_config=AdapterConfig(allow_saving=False), ) + # Parse stuctured input if needed + parsed_input = input + if self.target_task.output_json_schema is not None: + parsed_input = json.loads(input) + # we don't save by default here. We'll save manually after validating the output - run_output = await run_adapter.invoke(input) + run_output = await run_adapter.invoke(parsed_input) eval_output = await self.run_eval(run_output) validate_schema(eval_output, self.score_schema) From f419006dedcda3d774365412c94a9270df175f25 Mon Sep 17 00:00:00 2001 From: scosman Date: Sat, 22 Feb 2025 11:14:40 -0500 Subject: [PATCH 039/102] Fix all linter warnings/errors --- app/desktop/studio_server/evals_api.py | 1 - app/web_ui/src/lib/types.ts | 1 + .../evals/[project_id]/[task_id]/[eval_id]/+page.svelte | 2 +- libs/core/kiln_ai/adapters/eval/base_eval.py | 6 +++--- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/app/desktop/studio_server/evals_api.py b/app/desktop/studio_server/evals_api.py index e72af284..6d67eb26 100644 --- a/app/desktop/studio_server/evals_api.py +++ b/app/desktop/studio_server/evals_api.py @@ -1,4 +1,3 @@ -import asyncio import json from typing import Any diff --git a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts index 7aad5ae2..7ca9ee2a 100644 --- a/app/web_ui/src/lib/types.ts +++ b/app/web_ui/src/lib/types.ts @@ -24,3 +24,4 @@ export type EvalTemplate = components["schemas"]["EvalTemplate"] export type Eval = components["schemas"]["Eval"] export type EvalConfigType = components["schemas"]["EvalConfigType"] export type EvalConfig = components["schemas"]["EvalConfig"] +export type TaskRunConfig = components["schemas"]["TaskRunConfig"] diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index c9adaf77..c7619a9d 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -394,7 +394,7 @@ subtitle={evaluator?.name} action_buttons={[ { - label: "Evaluate Eval Configs", + label: "Evaluate Eval Quality", href: `/evals/${project_id}/${task_id}/${eval_id}/TODO`, }, ]} diff --git a/libs/core/kiln_ai/adapters/eval/base_eval.py b/libs/core/kiln_ai/adapters/eval/base_eval.py index 70bff103..c8a2dd7f 100644 --- a/libs/core/kiln_ai/adapters/eval/base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/base_eval.py @@ -5,8 +5,8 @@ from kiln_ai.adapters.ml_model_list import ModelProviderName from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores -from kiln_ai.datamodel.json_schema import string_to_json_key, validate_schema -from kiln_ai.datamodel.task import RunConfig, Task, TaskOutputRatingType, TaskRun +from kiln_ai.datamodel.json_schema import validate_schema +from kiln_ai.datamodel.task import RunConfig, TaskOutputRatingType, TaskRun from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error @@ -48,7 +48,7 @@ async def run(self, input: str) -> tuple[TaskRun, EvalScores]: base_adapter_config=AdapterConfig(allow_saving=False), ) - 
# Parse stuctured input if needed + # Parse structured input if needed parsed_input = input if self.target_task.output_json_schema is not None: parsed_input = json.loads(input) From 897a086ef72021986d873d0644b3dafe62a38036 Mon Sep 17 00:00:00 2001 From: scosman Date: Sat, 22 Feb 2025 11:18:28 -0500 Subject: [PATCH 040/102] reaname evals_api to eval_api for consistency --- app/desktop/desktop_server.py | 2 +- .../studio_server/{evals_api.py => eval_api.py} | 0 app/desktop/studio_server/test_eval_api.py | 16 ++++++++-------- 3 files changed, 9 insertions(+), 9 deletions(-) rename app/desktop/studio_server/{evals_api.py => eval_api.py} (100%) diff --git a/app/desktop/desktop_server.py b/app/desktop/desktop_server.py index c05cfcc2..b8b10b87 100644 --- a/app/desktop/desktop_server.py +++ b/app/desktop/desktop_server.py @@ -9,7 +9,7 @@ from fastapi import FastAPI from app.desktop.studio_server.data_gen_api import connect_data_gen_api -from app.desktop.studio_server.evals_api import connect_evals_api +from app.desktop.studio_server.eval_api import connect_evals_api from app.desktop.studio_server.finetune_api import connect_fine_tune_api from app.desktop.studio_server.prompt_api import connect_prompt_api from app.desktop.studio_server.provider_api import connect_provider_api diff --git a/app/desktop/studio_server/evals_api.py b/app/desktop/studio_server/eval_api.py similarity index 100% rename from app/desktop/studio_server/evals_api.py rename to app/desktop/studio_server/eval_api.py diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index 68da549d..5adc3f70 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -22,7 +22,7 @@ ) from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig -from app.desktop.studio_server.evals_api import ( +from app.desktop.studio_server.eval_api import ( CreateEvalConfigRequest, CreateEvaluatorRequest, connect_evals_api, @@ -118,7 +118,7 @@ def mock_run_config(mock_task): @pytest.fixture def mock_task_from_id(mock_task): - with patch("app.desktop.studio_server.evals_api.task_from_id") as mock: + with patch("app.desktop.studio_server.eval_api.task_from_id") as mock: mock.return_value = mock_task yield mock @@ -244,9 +244,9 @@ async def test_create_eval_config( mock_task_from_id.return_value = mock_task with ( - patch("app.desktop.studio_server.evals_api.eval_from_id") as mock_eval_from_id, + patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id, patch( - "app.desktop.studio_server.evals_api.prompt_builder_from_id" + "app.desktop.studio_server.eval_api.prompt_builder_from_id" ) as mock_prompt_builder, ): mock_eval_from_id.return_value = mock_eval @@ -298,7 +298,7 @@ def test_get_eval_configs( ): mock_task_from_id.return_value = mock_task - with patch("app.desktop.studio_server.evals_api.eval_from_id") as mock_eval_from_id: + with patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id: mock_eval_from_id.return_value = mock_eval response = client.get( "/api/projects/project1/tasks/task1/eval/eval1/eval_configs" @@ -338,9 +338,9 @@ async def mock_run(): with ( patch( - "app.desktop.studio_server.evals_api.task_run_config_from_id" + "app.desktop.studio_server.eval_api.task_run_config_from_id" ) as mock_run_config_from_id, - patch("app.desktop.studio_server.evals_api.EvalRunner") as MockEvalRunner, + patch("app.desktop.studio_server.eval_api.EvalRunner") as MockEvalRunner, ): mock_run_config_from_id.return_value = 
mock_run_config mock_eval_runner = Mock() @@ -380,7 +380,7 @@ async def test_run_eval_config_no_run_configs_error( mock_task_from_id.return_value = mock_task with patch( - "app.desktop.studio_server.evals_api.eval_config_from_id" + "app.desktop.studio_server.eval_api.eval_config_from_id" ) as mock_eval_config_from_id: mock_eval_config_from_id.return_value = mock_eval_config From e0510bb381a465d3d612eddc8fa2c1c777d2fcd9 Mon Sep 17 00:00:00 2001 From: scosman Date: Sat, 22 Feb 2025 11:29:31 -0500 Subject: [PATCH 041/102] CR feedback --- app/desktop/studio_server/eval_api.py | 2 - app/desktop/studio_server/test_eval_api.py | 37 +++++++++++++++++++ .../[task_id]/create_finetune/+page.svelte | 6 +-- .../connect_providers.svelte | 25 ++++++------- 4 files changed, 50 insertions(+), 20 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index 6d67eb26..bbae3d1d 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -213,8 +213,6 @@ async def run_eval_config( run_config_ids: list[str] = Query([]), all_run_configs: bool = Query(False), ) -> StreamingResponse: - # TODO a lock by eval_id -> error if one is already running - eval_config = eval_config_from_id(project_id, task_id, eval_id, eval_config_id) # Load the list of run configs to use. Two options: diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index 5adc3f70..e76a0bef 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -26,6 +26,8 @@ CreateEvalConfigRequest, CreateEvaluatorRequest, connect_evals_api, + eval_config_from_id, + task_run_config_from_id, ) @@ -394,3 +396,38 @@ async def test_run_eval_config_no_run_configs_error( response.json()["detail"] == "No run config ids provided. At least one run config id is required." ) + + +@pytest.mark.asyncio +async def test_eval_config_from_id( + client, mock_task_from_id, mock_task, mock_eval, mock_eval_config +): + mock_task_from_id.return_value = mock_task + + eval_config = eval_config_from_id("project1", "task1", "eval1", "eval_config1") + + assert eval_config.id == "eval_config1" + assert eval_config.name == "Test Eval Config" + assert eval_config.config_type == EvalConfigType.g_eval + assert eval_config.properties == {"eval_steps": ["step1", "step2"]} + + with pytest.raises(HTTPException, match="Eval config not found. ID: non_existent"): + eval_config_from_id("project1", "task1", "eval1", "non_existent") + + +@pytest.mark.asyncio +async def test_task_run_config_from_id( + client, mock_task_from_id, mock_task, mock_run_config +): + mock_task_from_id.return_value = mock_task + + run_config = task_run_config_from_id("project1", "task1", "run_config1") + + assert run_config.id == "run_config1" + assert run_config.name == "Test Run Config" + assert run_config.description == "Test Description" + + with pytest.raises( + HTTPException, match="Task run config not found. 
ID: non_existent" + ): + task_run_config_from_id("project1", "task1", "non_existent") diff --git a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte index 1724638c..40441dac 100644 --- a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte +++ b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte @@ -3,7 +3,7 @@ import FormContainer from "$lib/utils/form_container.svelte" import FormElement from "$lib/utils/form_element.svelte" import { page } from "$app/stores" - import { client } from "$lib/api_client" + import { client, base_url } from "$lib/api_client" import { KilnError, createKilnError } from "$lib/utils/error_handlers" import { onMount } from "svelte" import { formatDate } from "$lib/utils/formatters" @@ -473,9 +473,7 @@ .map(([key, value]) => `${key}=${encodeURIComponent(value || "")}`) .join("&") - window.open( - "http://localhost:8757/api/download_dataset_jsonl?" + query_string, - ) + window.open(base_url + "/api/download_dataset_jsonl?" + query_string) } diff --git a/app/web_ui/src/routes/(fullscreen)/setup/(setup)/connect_providers/connect_providers.svelte b/app/web_ui/src/routes/(fullscreen)/setup/(setup)/connect_providers/connect_providers.svelte index bd490638..b4ac6f9b 100644 --- a/app/web_ui/src/routes/(fullscreen)/setup/(setup)/connect_providers/connect_providers.svelte +++ b/app/web_ui/src/routes/(fullscreen)/setup/(setup)/connect_providers/connect_providers.svelte @@ -5,7 +5,7 @@ import FormElement from "$lib/utils/form_element.svelte" import FormContainer from "$lib/utils/form_container.svelte" import { KilnError, createKilnError } from "$lib/utils/error_handlers" - import { client } from "$lib/api_client" + import { client, base_url } from "$lib/api_client" type Provider = { name: string @@ -309,19 +309,16 @@ api_key_submitting = true try { const provider_id = api_key_provider ? 
api_key_provider.id : "" - let res = await fetch( - "http://localhost:8757/api/provider/connect_api_key", - { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ - provider: provider_id, - key_data: apiKeyData, - }), + let res = await fetch(base_url + "/api/provider/connect_api_key", { + method: "POST", + headers: { + "Content-Type": "application/json", }, - ) + body: JSON.stringify({ + provider: provider_id, + key_data: apiKeyData, + }), + }) let data = await res.json() if (res.status !== 200) { @@ -354,7 +351,7 @@ let custom_openai_compatible_providers: CustomOpenAICompatibleProvider[] = [] const check_existing_providers = async () => { try { - let res = await fetch("http://localhost:8757/api/settings") + let res = await fetch(base_url + "/api/settings") let data = await res.json() if (data["open_ai_api_key"]) { status.openai.connected = true From 34868d084bc1dbea0063c6556d8c1fdef18d3468 Mon Sep 17 00:00:00 2001 From: scosman Date: Sat, 22 Feb 2025 11:41:24 -0500 Subject: [PATCH 042/102] CR feedback: better links, better errors --- .../(app)/evals/[project_id]/[task_id]/+page.svelte | 4 ++-- .../[project_id]/[task_id]/[eval_id]/+page.svelte | 10 ++++++++-- .../[eval_id]/create_eval_config/+page.svelte | 4 ++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte index 012c49c3..b094f8d7 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte @@ -86,8 +86,8 @@ - + @@ -98,8 +98,8 @@ goto(`/evals/${project_id}/${task_id}/${evaluator.id}`) }} > - + {/each} diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index c7619a9d..4ee913d3 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -50,6 +50,9 @@ onMount(async () => { // Wait for page params to load await tick() + // Load the selected eval config from the query params if it exists + current_eval_config_id = + $page.url.searchParams.get("selected_eval_config") || null // Wait for these 3 to load, as they are needed for better labels. Usually already cached and instant. 
await Promise.all([ load_model_info(), @@ -81,7 +84,8 @@ throw error } evaluator = data - if (evaluator.current_config_id) { + // Use the eval's default, unless we already have a selected eval config (eg from query params) + if (evaluator.current_config_id && !current_eval_config_id) { current_eval_config_id = evaluator.current_config_id } } catch (error) { @@ -299,7 +303,9 @@ if (!current_eval_config_id) { eval_run_error = new KilnError("No eval config selected", null) eval_state = "complete_with_errors" - return false + // True to close the dialog, and show the error in the progress dialog + running_progress_dialog?.show() + return true } eval_state = "running" diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte index d78ef090..7cd38d1f 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte @@ -180,7 +180,7 @@ } create_evaluator_loading = true - const { error } = await client.POST( + const { data, error } = await client.POST( "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/create_eval_config", { params: { @@ -208,7 +208,7 @@ } complete = true goto( - `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}`, + `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}?selected_eval_config=${data.id}`, ) } catch (e) { create_evaluator_error = createKilnError(e) From 91e775bf724cda1caa0211555ec8c5d08fe9e141 Mon Sep 17 00:00:00 2001 From: scosman Date: Sat, 22 Feb 2025 12:17:01 -0500 Subject: [PATCH 043/102] CR feedback --- .../[task_id]/[eval_id]/+page.svelte | 67 +++++++++---------- 1 file changed, 33 insertions(+), 34 deletions(-) diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index 4ee913d3..da0cf053 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -50,17 +50,14 @@ onMount(async () => { // Wait for page params to load await tick() - // Load the selected eval config from the query params if it exists - current_eval_config_id = - $page.url.searchParams.get("selected_eval_config") || null // Wait for these 3 to load, as they are needed for better labels. Usually already cached and instant. 
await Promise.all([ load_model_info(), load_available_prompts(), load_available_models(), ]) - // Can load the actual data in parallel - get_eval() + // Get the eval first (want it to set the current config id), then the rest in parallel + await get_eval() get_eval_configs() get_task_run_configs() }) @@ -84,10 +81,11 @@ throw error } evaluator = data - // Use the eval's default, unless we already have a selected eval config (eg from query params) - if (evaluator.current_config_id && !current_eval_config_id) { - current_eval_config_id = evaluator.current_config_id - } + // Set the selected eval config: prefer query params, then eval's default, then + current_eval_config_id = + $page.url.searchParams.get("selected_eval_config") || + evaluator.current_config_id || + null } catch (error) { eval_error = createKilnError(error) } finally { @@ -114,7 +112,7 @@ throw error } eval_configs = data - // This may be already set by evaluator.current_config_id, if so we prioritize that + // This may be already set by evaluator loading, if so we prioritize that, but fallback to first if ( !current_eval_config_id && eval_configs.length > 0 && @@ -154,8 +152,8 @@ } } + // Watches the current eval config id, and if it's "add_config" then navigates to the create eval config page $: check_add_eval_config(current_eval_config_id) - function check_add_eval_config(selected_id: string | null) { if (selected_id === "add_config") { goto(`/evals/${project_id}/${task_id}/${eval_id}/create_eval_config`) @@ -176,7 +174,7 @@ ) } - // A name for the eval config that is human readable + // A name for the eval config that is human readable and helpful // Combine's it's memorable name with it's properties function get_eval_config_name( eval_config: EvalConfig, @@ -198,10 +196,6 @@ name: "Name", value: evaluator.name, }) - properties.push({ - name: "ID", - value: evaluator.id || "unknown", - }) if (evaluator.description) { properties.push({ name: "Description", @@ -218,7 +212,6 @@ value: outputs.join(", "), }) } - // TODO nicer labels here properties.push({ name: "Eval Set", value: evaluator.eval_set_filter_id, @@ -229,19 +222,23 @@ }) return properties } + function get_eval_config_properties( eval_config_id: string | null, model_info: ProviderModels | null, ): UiProperty[] { - if (!eval_config_id) { - return [] - } const eval_config = eval_configs?.find( (config) => config.id === eval_config_id, ) if (!eval_config) { - return [] + return [ + { + name: "No Config Selected", + value: "Select a config from dropdown above", + }, + ] } + const properties: UiProperty[] = [] properties.push({ @@ -261,6 +258,7 @@ eval_config.model.properties["model_provider"] + "", ), }) + // TODO remove this once we consolidate prompts properties.push({ name: "Prompt", value: prompt_name_from_id(eval_config.prompt.name + ""), @@ -303,7 +301,7 @@ if (!current_eval_config_id) { eval_run_error = new KilnError("No eval config selected", null) eval_state = "complete_with_errors" - // True to close the dialog, and show the error in the progress dialog + // True to close the run dialog, and then show the error in the progress dialog running_progress_dialog?.show() return true } @@ -320,6 +318,7 @@ eventSource.onmessage = (event) => { try { if (event.data === "complete") { + // Special end message eventSource.close() eval_state = eval_error_count > 0 ? 
"complete_with_errors" : "complete" @@ -336,14 +335,14 @@ } } - // Don't restart on an error + // Don't restart on an error (default SSE behavior) eventSource.onerror = (error) => { eventSource.close() eval_state = "complete_with_errors" eval_run_error = createKilnError(error) } - // Switch over to the progress dialog + // Switch over to the progress dialog, closing the run dialog running_progress_dialog?.show() return true } @@ -376,7 +375,7 @@ }, body: { model_name: task_run_config_model_name, - // @ts-expect-error not checking values + // @ts-expect-error not checking types here, server will check them model_provider_name: task_run_config_provider_name, prompt_id: task_run_config_prompt_method, }, @@ -385,7 +384,7 @@ if (error) { throw error } - // Load the updated list of task run configs + // Load the updated list of task run configs after success get_task_run_configs() } catch (error) { add_task_config_error = createKilnError(error) @@ -413,7 +412,7 @@
-
Error Loading Evaluators
+
Error Loading Evaluator
{error.getMessage() || "An unknown error occurred"}
@@ -464,13 +463,13 @@
-
+
{#if task_run_configs?.length} -
+
Results
- Filtered by the selected eval config and grouped by task run + Filtered by the selected eval config. Rows are grouped by task run config.
@@ -576,7 +575,7 @@

- Create a task config, defining how to run this task (model+prompt). + Create a task run config, defining a way to run this task (model+prompt).

- Your evaluator can compare multiple task configs to find the best one for + Your evaluator can compare multiple run configs to find the best one for running this task.

@@ -676,7 +675,7 @@ ]} >
-
Run this eval on the selected eval configuration?
+
Run this eval with the selected configuration?
Don't close this page if you want to monitor progress.
Date: Sat, 22 Feb 2025 15:28:37 -0500 Subject: [PATCH 044/102] Eval results!! --- app/desktop/studio_server/eval_api.py | 62 +++++++++++- app/desktop/studio_server/test_eval_api.py | 95 ++++++++++++++++++ app/web_ui/src/lib/api_schema.d.ts | 65 +++++++++++++ app/web_ui/src/lib/types.ts | 1 + .../[task_id]/[eval_id]/+page.svelte | 96 +++++++++++++++++-- 5 files changed, 312 insertions(+), 7 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index bbae3d1d..8666c373 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -1,5 +1,5 @@ import json -from typing import Any +from typing import Any, Dict from fastapi import FastAPI, HTTPException, Query from fastapi.responses import StreamingResponse @@ -96,6 +96,15 @@ class RunEvalConfigRequest(BaseModel): run_config_ids: list[str] +class ScoreSummary(BaseModel): + mean_score: float + + +class EvalResultSummary(BaseModel): + # run_config_id -> output_score_id -> ScoreSummary + results: Dict[str, Dict[str, ScoreSummary]] + + def connect_evals_api(app: FastAPI): @app.post("/api/projects/{project_id}/tasks/{task_id}/create_evaluator") async def create_evaluator( @@ -252,3 +261,54 @@ async def event_generator(): content=event_generator(), media_type="text/event-stream", ) + + @app.get( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/score_summary" + ) + async def get_eval_config_score_summary( + project_id: str, + task_id: str, + eval_id: str, + eval_config_id: str, + ) -> EvalResultSummary: + eval = eval_from_id(project_id, task_id, eval_id) + eval_config = eval_config_from_id(project_id, task_id, eval_id, eval_config_id) + + # task_run_config_id -> output_score_id -> score/total + total_scores: Dict[str, Dict[str, float]] = {} + score_counts: Dict[str, Dict[str, int]] = {} + + # TODO: is the dataset item still in the dataset? 
They can add/remove tags + # TODO: is the score for each run_config complete + + # important: readonly makes this much faster + for eval_run in eval_config.runs(readonly=True): + for output_score in eval.output_scores: + score_key = output_score.json_key() + run_config_id = str(eval_run.task_run_config_id) + if run_config_id not in total_scores: + total_scores[run_config_id] = {} + score_counts[run_config_id] = {} + if score_key not in total_scores[run_config_id]: + total_scores[run_config_id][score_key] = 0 + score_counts[run_config_id][score_key] = 0 + if score_key in eval_run.scores: + total_scores[run_config_id][score_key] += eval_run.scores[score_key] + score_counts[run_config_id][score_key] += 1 + print( + f"adding score to {run_config_id} {score_key} = {eval_run.scores[score_key]}" + ) + + # Convert to score summaries + results: Dict[str, Dict[str, ScoreSummary]] = {} + for run_config_id, output_scores in total_scores.items(): + results[run_config_id] = {} + for output_score_id, score in output_scores.items(): + if score_counts[run_config_id][output_score_id] > 0: + results[run_config_id][output_score_id] = ScoreSummary( + mean_score=score / score_counts[run_config_id][output_score_id] + ) + + return EvalResultSummary( + results=results, + ) diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index e76a0bef..2193140b 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -1,4 +1,5 @@ import json +from typing import Dict, Tuple from unittest.mock import Mock, patch import pytest @@ -18,6 +19,7 @@ EvalConfig, EvalConfigType, EvalOutputScore, + EvalRun, EvalTemplate, ) from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig @@ -431,3 +433,96 @@ async def test_task_run_config_from_id( HTTPException, match="Task run config not found. 
ID: non_existent" ): task_run_config_from_id("project1", "task1", "non_existent") + + +@pytest.fixture +def mock_eval_for_score_summary(): + eval = Mock(spec=Eval) + eval.output_scores = [ + EvalOutputScore(name="accuracy", description="Test accuracy", type="pass_fail"), + EvalOutputScore( + name="relevance", description="Test relevance", type="pass_fail" + ), + ] + return eval + + +@pytest.fixture +def mock_eval_config_for_score_summary(): + config = Mock(spec=EvalConfig) + + scores: Tuple[str, Dict[str, float]] = [ + # Run 1 - normal + ("run1", {"accuracy": 0.8, "relevance": 0.9}), + ("run1", {"accuracy": 0.6, "relevance": 0.7}), + # Run 2 - only 1 score + ("run2", {"accuracy": 0.9, "relevance": 0.85}), + # Run 3 - no valid scores + ("run3", {"other": 0.5}), + # Run 4 - ensure no divide by zero + ("run4", {"accuracy": 0.5}), + ] + runs = [] + + id = 0 + for run_id, score in scores: + id += 1 + runs.append( + EvalRun( + task_run_config_id=run_id, + scores=score, + input="input", + output="output", + dataset_id=f"dataset_id_{id}", + ) + ) + + config.runs.return_value = runs + return config + + +@pytest.mark.asyncio +async def test_get_eval_config_score_summary( + client, mock_eval_for_score_summary, mock_eval_config_for_score_summary +): + with ( + patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id, + patch( + "app.desktop.studio_server.eval_api.eval_config_from_id" + ) as mock_eval_config_from_id, + ): + mock_eval_from_id.return_value = mock_eval_for_score_summary + mock_eval_config_from_id.return_value = mock_eval_config_for_score_summary + + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/score_summary" + ) + + assert response.status_code == 200 + top_level_result = response.json() + + # Verify the structure of the response + assert "results" in top_level_result + results = top_level_result["results"] + + # Check average scores for run1 + assert results["run1"]["accuracy"]["mean_score"] == 0.7 # (0.8 + 0.6) / 2 + assert results["run1"]["relevance"]["mean_score"] == 0.8 # Only one valid score + + # Check average scores for run2 + assert results["run2"]["accuracy"]["mean_score"] == 0.9 + assert results["run2"]["relevance"]["mean_score"] == 0.85 + + # run 3 has non valid scores + assert results["run3"] == {} + + # run 4 has no scores + assert results["run4"]["accuracy"]["mean_score"] == 0.5 + assert "relevance" not in results["run4"] + + # Verify the mocks were called correctly + mock_eval_from_id.assert_called_once_with("project1", "task1", "eval1") + mock_eval_config_from_id.assert_called_once_with( + "project1", "task1", "eval1", "eval_config1" + ) + mock_eval_config_for_score_summary.runs.assert_called_once_with(readonly=True) diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index 8bd29475..f6f59c98 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -793,6 +793,23 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/score_summary": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Build Score Summary */ + get: operations["build_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__score_summary_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; } export type webhooks = 
Record; export interface components { @@ -1269,6 +1286,15 @@ export interface components { /** @description The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical'). */ type: components["schemas"]["TaskOutputRatingType"]; }; + /** EvalResultSummary */ + EvalResultSummary: { + /** Results */ + results: { + [key: string]: { + [key: string]: components["schemas"]["ScoreSummary"]; + }; + }; + }; /** * EvalState * @enum {string} @@ -1809,6 +1835,11 @@ export interface components { /** Tags */ tags?: string[] | null; }; + /** ScoreSummary */ + ScoreSummary: { + /** Mean Score */ + mean_score: number; + }; /** * StructuredOutputMode * @description Enumeration of supported structured output modes. @@ -3956,4 +3987,38 @@ export interface operations { }; }; }; + build_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__score_summary_get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + eval_id: string; + eval_config_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["EvalResultSummary"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; } diff --git a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts index 7ca9ee2a..7da878dd 100644 --- a/app/web_ui/src/lib/types.ts +++ b/app/web_ui/src/lib/types.ts @@ -25,3 +25,4 @@ export type Eval = components["schemas"]["Eval"] export type EvalConfigType = components["schemas"]["EvalConfigType"] export type EvalConfig = components["schemas"]["EvalConfig"] export type TaskRunConfig = components["schemas"]["TaskRunConfig"] +export type EvalResultSummary = components["schemas"]["EvalResultSummary"] diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index da0cf053..5d2256c8 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -11,6 +11,7 @@ EvalConfigType, ProviderModels, TaskRunConfig, + EvalResultSummary, } from "$lib/types" import { goto } from "$app/navigation" import { @@ -26,6 +27,7 @@ import AvailableModelsDropdown from "../../../../run/available_models_dropdown.svelte" import PromptTypeSelector from "../../../../run/prompt_type_selector.svelte" import Warning from "$lib/ui/warning.svelte" + import { title_to_name } from "$lib/utils/json_schema_editor/json_schema_templates" $: project_id = $page.params.project_id $: task_id = $page.params.task_id @@ -44,8 +46,17 @@ let task_run_configs_error: KilnError | null = null let task_run_configs_loading = true - $: loading = eval_loading || eval_configs_loading || task_run_configs_loading + let score_summary: EvalResultSummary | null = null + let score_summary_error: KilnError | null = null + let score_summary_loading = false + + $: loading = + eval_loading || + eval_configs_loading || + task_run_configs_loading || + score_summary_loading $: error = eval_error || eval_configs_error || task_run_configs_error + // Note: not including score_summary_error, because it's not a critical error we should block the UI for 
onMount(async () => { // Wait for page params to load @@ -58,8 +69,10 @@ ]) // Get the eval first (want it to set the current config id), then the rest in parallel await get_eval() - get_eval_configs() - get_task_run_configs() + // These two can be parallel + await Promise.all([get_eval_configs(), get_task_run_configs()]) + // This needs the selected eval config id + get_score_summary() }) async function get_eval() { @@ -152,11 +165,49 @@ } } - // Watches the current eval config id, and if it's "add_config" then navigates to the create eval config page - $: check_add_eval_config(current_eval_config_id) - function check_add_eval_config(selected_id: string | null) { + async function get_score_summary() { + if (!current_eval_config_id) { + score_summary_error = new KilnError("No eval config selected", null) + return + } + try { + score_summary_loading = true + const { data, error } = await client.GET( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/score_summary", + { + params: { + path: { + project_id, + task_id, + eval_id, + eval_config_id: current_eval_config_id, + }, + }, + }, + ) + if (error) { + throw error + } + score_summary = data + } catch (error) { + score_summary_error = createKilnError(error) + } finally { + score_summary_loading = false + } + } + + // Watches the current eval config id + $: watch_selected_eval_config(current_eval_config_id) + function watch_selected_eval_config(selected_id: string | null) { if (selected_id === "add_config") { + // if it's "add_config" then navigates to the create eval config page goto(`/evals/${project_id}/${task_id}/${eval_id}/create_eval_config`) + return + } + // If the selected id is not null, then get the score summary + score_summary = null + if (selected_id) { + get_score_summary() } } @@ -306,6 +357,7 @@ return true } + score_summary = null eval_state = "running" eval_complete_count = 0 eval_total_count = 0 @@ -322,6 +374,7 @@ eventSource.close() eval_state = eval_error_count > 0 ? "complete_with_errors" : "complete" + get_score_summary() } else { const data = JSON.parse(event.data) eval_complete_count = data.progress @@ -332,6 +385,7 @@ } catch (error) { eval_run_error = createKilnError(error) eval_state = "complete_with_errors" + get_score_summary() } } @@ -340,6 +394,7 @@ eventSource.close() eval_state = "complete_with_errors" eval_run_error = createKilnError(error) + get_score_summary() } // Switch over to the progress dialog, closing the run dialog @@ -472,6 +527,12 @@ Filtered by the selected eval config. Rows are grouped by task run config.
+ {#if score_summary_error} +
+ {score_summary_error.getMessage() || + "An unknown error occurred fetching scores."} +
+ {/if}
{#if eval_state === "not_started"} @@ -516,6 +577,20 @@
+ {#each evaluator.output_scores as output_score} + + {/each} @@ -544,6 +619,15 @@ task_run_config?.run_config_properties?.prompt_id, )} + {#each evaluator.output_scores as output_score} + {@const score = + score_summary?.results?.["" + task_run_config.id]?.[ + title_to_name(output_score.name) + ]?.mean_score} + + {/each} {/each} From 02792e6834efb89bdcfb28eade2d54310e77185c Mon Sep 17 00:00:00 2001 From: scosman Date: Sat, 22 Feb 2025 17:22:50 -0500 Subject: [PATCH 045/102] Check completeness, for out UI Music is amazing --- app/desktop/studio_server/eval_api.py | 71 +++++++++++++++++++--- app/desktop/studio_server/test_eval_api.py | 58 ++++++++++++++---- 2 files changed, 109 insertions(+), 20 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index 8666c373..1b70ea66 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -1,5 +1,5 @@ import json -from typing import Any, Dict +from typing import Any, Dict, Set from fastapi import FastAPI, HTTPException, Query from fastapi.responses import StreamingResponse @@ -11,8 +11,9 @@ DataSource, DataSourceType, PromptId, + Task, ) -from kiln_ai.datamodel.dataset_filters import DatasetFilterId +from kiln_ai.datamodel.dataset_filters import DatasetFilterId, dataset_filter_from_id from kiln_ai.datamodel.eval import ( Eval, EvalConfig, @@ -103,6 +104,14 @@ class ScoreSummary(BaseModel): class EvalResultSummary(BaseModel): # run_config_id -> output_score_id -> ScoreSummary results: Dict[str, Dict[str, ScoreSummary]] + # run_config_id -> percent of the dataset that has been processed + run_config_percent_complete: Dict[str, float] + + +def dataset_ids_in_filter(task: Task, filter_id: DatasetFilterId) -> Set[str]: + # Fetch all the dataset items IDs in a filter + filter = dataset_filter_from_id(filter_id) + return {run.dataset_id for run in task.runs() if filter(run)} def connect_evals_api(app: FastAPI): @@ -271,21 +280,50 @@ async def get_eval_config_score_summary( eval_id: str, eval_config_id: str, ) -> EvalResultSummary: + task = task_from_id(project_id, task_id) eval = eval_from_id(project_id, task_id, eval_id) eval_config = eval_config_from_id(project_id, task_id, eval_id, eval_config_id) + task_runs_configs = task.run_configs() + + # Build a set of all the dataset items IDs we expect to have scores for + expected_dataset_ids = dataset_ids_in_filter(task, eval.eval_set_filter_id) + if len(expected_dataset_ids) == 0: + raise HTTPException( + status_code=400, + detail="No dataset ids in eval set filter. Cannot compute score summary.", + ) + + # save a copy of the expected dataset ids for each run config, we'll update each as we process each eval run + remaining_expected_dataset_ids: Dict[str, Set[str]] = { + str(run_config.id): set(expected_dataset_ids) + for run_config in task_runs_configs + } + # Track how often we are missing scores in a eval_config. Should be 0 for a complete eval_config + partial_incomplete_counts: Dict[str, int] = { + str(run_config.id): 0 for run_config in task_runs_configs + } # task_run_config_id -> output_score_id -> score/total total_scores: Dict[str, Dict[str, float]] = {} score_counts: Dict[str, Dict[str, int]] = {} - # TODO: is the dataset item still in the dataset? 
They can add/remove tags - # TODO: is the score for each run_config complete - # important: readonly makes this much faster for eval_run in eval_config.runs(readonly=True): + run_config_id = str(eval_run.task_run_config_id) + + # Check if we should count this eval_run. Not every eval_run has to go into the stats: + # - a dataset_id can be removed from the dataset filter (removed a tag) + # - this dataset_id was already counted (okay there are dupes, but shouldn't be double counted) + if eval_run.dataset_id not in remaining_expected_dataset_ids[run_config_id]: + continue + else: + remaining_expected_dataset_ids[run_config_id].remove( + eval_run.dataset_id + ) + + incomplete = False for output_score in eval.output_scores: score_key = output_score.json_key() - run_config_id = str(eval_run.task_run_config_id) if run_config_id not in total_scores: total_scores[run_config_id] = {} score_counts[run_config_id] = {} @@ -295,9 +333,12 @@ async def get_eval_config_score_summary( if score_key in eval_run.scores: total_scores[run_config_id][score_key] += eval_run.scores[score_key] score_counts[run_config_id][score_key] += 1 - print( - f"adding score to {run_config_id} {score_key} = {eval_run.scores[score_key]}" - ) + else: + # We're missing a required score, so this eval_run is incomplete + incomplete = True + + if incomplete: + partial_incomplete_counts[run_config_id] += 1 # Convert to score summaries results: Dict[str, Dict[str, ScoreSummary]] = {} @@ -309,6 +350,18 @@ async def get_eval_config_score_summary( mean_score=score / score_counts[run_config_id][output_score_id] ) + # Calculate the percent of the dataset that has been processed + run_config_percent_complete: Dict[str, float] = {} + for run_config in task_runs_configs: + run_config_id = str(run_config.id) + # Partial incomplete (missing scores), and fully incomplete (no eval_run) + incomplete_count = partial_incomplete_counts[run_config_id] + len( + remaining_expected_dataset_ids[run_config_id] + ) + percent_incomplete = incomplete_count / len(expected_dataset_ids) + run_config_percent_complete[str(run_config.id)] = 1 - percent_incomplete + return EvalResultSummary( results=results, + run_config_percent_complete=run_config_percent_complete, ) diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index 2193140b..841ea051 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -444,6 +444,7 @@ def mock_eval_for_score_summary(): name="relevance", description="Test relevance", type="pass_fail" ), ] + eval.eval_set_filter_id = "tag::eval_set" return eval @@ -451,21 +452,26 @@ def mock_eval_for_score_summary(): def mock_eval_config_for_score_summary(): config = Mock(spec=EvalConfig) - scores: Tuple[str, Dict[str, float]] = [ + scores: Tuple[str, str, Dict[str, float]] = [ # Run 1 - normal - ("run1", {"accuracy": 0.8, "relevance": 0.9}), - ("run1", {"accuracy": 0.6, "relevance": 0.7}), - # Run 2 - only 1 score - ("run2", {"accuracy": 0.9, "relevance": 0.85}), - # Run 3 - no valid scores - ("run3", {"other": 0.5}), - # Run 4 - ensure no divide by zero - ("run4", {"accuracy": 0.5}), + ("run1", "dataset_id_1", {"accuracy": 0.8, "relevance": 0.9}), + ("run1", "dataset_id_2", {"accuracy": 0.6, "relevance": 0.7}), + # Run 2 - only 1 score, should be 0.5 complete + ("run2", "dataset_id_1", {"accuracy": 0.9, "relevance": 0.85}), + # Run 3 - no valid scores, 0.0 complete + ("run3", "dataset_id_1", {"other": 0.5}), + # Run 4 - Partial incomplete doesn't divide 
by zero, still 0.0 complete + ("run4", "dataset_id_1", {"accuracy": 0.5}), + # Run 5 - duplicate dataset_id not double counted, item not in dataset filter ignored + ("run5", "dataset_id_1", {"accuracy": 0.8, "relevance": 0.9}), + ("run5", "dataset_id_1", {"accuracy": 0.8, "relevance": 0.9}), + ("run5", "dataset_id_2", {"accuracy": 0.6, "relevance": 0.7}), + ("run5", "not_in_filter", {"accuracy": 0.1, "relevance": 0.1}), ] runs = [] id = 0 - for run_id, score in scores: + for run_id, dataset_id, score in scores: id += 1 runs.append( EvalRun( @@ -473,7 +479,7 @@ def mock_eval_config_for_score_summary(): scores=score, input="input", output="output", - dataset_id=f"dataset_id_{id}", + dataset_id=dataset_id, ) ) @@ -487,12 +493,30 @@ async def test_get_eval_config_score_summary( ): with ( patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id, + patch( + "app.desktop.studio_server.eval_api.dataset_ids_in_filter" + ) as mock_dataset_ids_in_filter, patch( "app.desktop.studio_server.eval_api.eval_config_from_id" ) as mock_eval_config_from_id, + patch("app.desktop.studio_server.eval_api.task_from_id") as mock_task_from_id, ): mock_eval_from_id.return_value = mock_eval_for_score_summary mock_eval_config_from_id.return_value = mock_eval_config_for_score_summary + mock_dataset_ids_in_filter.return_value = { + "dataset_id_1", + "dataset_id_2", + } + + mock_task = Mock(spec=Task) + mock_task.run_configs.return_value = [ + Mock(spec=TaskRunConfig, id="run1"), + Mock(spec=TaskRunConfig, id="run2"), + Mock(spec=TaskRunConfig, id="run3"), + Mock(spec=TaskRunConfig, id="run4"), + Mock(spec=TaskRunConfig, id="run5"), + ] + mock_task_from_id.return_value = mock_task response = client.get( "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/score_summary" @@ -504,21 +528,32 @@ async def test_get_eval_config_score_summary( # Verify the structure of the response assert "results" in top_level_result results = top_level_result["results"] + assert "run_config_percent_complete" in top_level_result + run_config_percent_complete = top_level_result["run_config_percent_complete"] # Check average scores for run1 assert results["run1"]["accuracy"]["mean_score"] == 0.7 # (0.8 + 0.6) / 2 assert results["run1"]["relevance"]["mean_score"] == 0.8 # Only one valid score + assert run_config_percent_complete["run1"] == 1.0 # Check average scores for run2 assert results["run2"]["accuracy"]["mean_score"] == 0.9 assert results["run2"]["relevance"]["mean_score"] == 0.85 + assert run_config_percent_complete["run2"] == 0.5 # run 3 has non valid scores assert results["run3"] == {} + assert run_config_percent_complete["run3"] == 0.0 # run 4 has no scores assert results["run4"]["accuracy"]["mean_score"] == 0.5 assert "relevance" not in results["run4"] + assert run_config_percent_complete["run4"] == 0.0 + + # Check average scores for run5 - duplicate dataset_id not double counted + assert results["run5"]["accuracy"]["mean_score"] == 0.7 # (0.8 + 0.6) / 2 + assert results["run5"]["relevance"]["mean_score"] == 0.8 # Only one valid score + assert run_config_percent_complete["run5"] == 1.0 # Verify the mocks were called correctly mock_eval_from_id.assert_called_once_with("project1", "task1", "eval1") @@ -526,3 +561,4 @@ async def test_get_eval_config_score_summary( "project1", "task1", "eval1", "eval_config1" ) mock_eval_config_for_score_summary.runs.assert_called_once_with(readonly=True) + mock_dataset_ids_in_filter.assert_called_once_with(mock_task, "tag::eval_set") From 
c73b0e62ed53d902a4735ee5c8caa9a9ff00fef3 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 23 Feb 2025 09:26:29 -0500 Subject: [PATCH 046/102] Nice UI for eval incomplete warnings --- app/desktop/studio_server/eval_api.py | 7 +- app/web_ui/src/lib/api_schema.d.ts | 10 +- .../[task_id]/[eval_id]/+page.svelte | 92 ++++++++++++++----- 3 files changed, 81 insertions(+), 28 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index 1b70ea66..5f2d7ee1 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -13,6 +13,7 @@ PromptId, Task, ) +from kiln_ai.datamodel.basemodel import ID_TYPE from kiln_ai.datamodel.dataset_filters import DatasetFilterId, dataset_filter_from_id from kiln_ai.datamodel.eval import ( Eval, @@ -108,10 +109,10 @@ class EvalResultSummary(BaseModel): run_config_percent_complete: Dict[str, float] -def dataset_ids_in_filter(task: Task, filter_id: DatasetFilterId) -> Set[str]: +def dataset_ids_in_filter(task: Task, filter_id: DatasetFilterId) -> Set[ID_TYPE]: # Fetch all the dataset items IDs in a filter filter = dataset_filter_from_id(filter_id) - return {run.dataset_id for run in task.runs() if filter(run)} + return {run.id for run in task.runs() if filter(run)} def connect_evals_api(app: FastAPI): @@ -294,7 +295,7 @@ async def get_eval_config_score_summary( ) # save a copy of the expected dataset ids for each run config, we'll update each as we process each eval run - remaining_expected_dataset_ids: Dict[str, Set[str]] = { + remaining_expected_dataset_ids: Dict[str, Set[ID_TYPE]] = { str(run_config.id): set(expected_dataset_ids) for run_config in task_runs_configs } diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index f6f59c98..0d707853 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -800,8 +800,8 @@ export interface paths { path?: never; cookie?: never; }; - /** Build Score Summary */ - get: operations["build_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__score_summary_get"]; + /** Get Eval Config Score Summary */ + get: operations["get_eval_config_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__score_summary_get"]; put?: never; post?: never; delete?: never; @@ -1294,6 +1294,10 @@ export interface components { [key: string]: components["schemas"]["ScoreSummary"]; }; }; + /** Run Config Percent Complete */ + run_config_percent_complete: { + [key: string]: number; + }; }; /** * EvalState @@ -3987,7 +3991,7 @@ export interface operations { }; }; }; - build_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__score_summary_get: { + get_eval_config_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__score_summary_get: { parameters: { query?: never; header?: never; diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index 5d2256c8..53902919 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -447,6 +447,21 @@ } return true } + + function show_incomplete_warning( + score_summary: EvalResultSummary | null, + ): boolean { + if 
(!score_summary?.run_config_percent_complete) { + return false + } + + const values = Object.values(score_summary.run_config_percent_complete) + const minComplete = + values.length > 0 + ? values.reduce((min, val) => Math.min(min, val), 1.0) + : 1.0 + return minComplete < 1.0 + } + + + {#if show_incomplete_warning(score_summary)} +
+ +
+ {/if} +
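An aside for readers of this patch series: below is a minimal, standalone sketch of the completion math behind this warning, mirroring the score-summary changes earlier in the series. The EvalRunStub class and percent_complete helper are invented for illustration only and are not part of the Kiln codebase.

from dataclasses import dataclass

@dataclass
class EvalRunStub:
    # Hypothetical stand-in for the EvalRun fields the summary math uses
    task_run_config_id: str
    dataset_id: str
    scores: dict[str, float]

def percent_complete(
    expected_dataset_ids: set[str],
    required_score_keys: list[str],
    eval_runs: list[EvalRunStub],
    run_config_id: str,
) -> float:
    if not expected_dataset_ids:
        return 0.0  # the real endpoint returns HTTP 400 before reaching this point
    remaining = set(expected_dataset_ids)  # expected items not yet seen for this run config
    partial_incomplete = 0  # items seen, but missing at least one required score
    for run in eval_runs:
        if run.task_run_config_id != run_config_id:
            continue
        if run.dataset_id not in remaining:
            continue  # duplicate item, or item no longer in the eval set filter
        remaining.remove(run.dataset_id)
        if any(key not in run.scores for key in required_score_keys):
            partial_incomplete += 1
    incomplete = partial_incomplete + len(remaining)
    return 1.0 - incomplete / len(expected_dataset_ids)

runs = [
    EvalRunStub("run_cfg_1", "item_1", {"accuracy": 0.9, "relevance": 0.8}),
    EvalRunStub("run_cfg_1", "item_2", {"accuracy": 0.7}),  # missing "relevance"
]
print(percent_complete({"item_1", "item_2"}, ["accuracy", "relevance"], runs, "run_cfg_1"))
# Prints 0.5, so the warning above would be shown for this run config.

The warning itself fires when the smallest of these per-run-config values is below 1.0, which is what show_incomplete_warning computes on the client.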
ID Name Description
{evaluator.id} {evaluator.name} {evaluator.description}
Task Model Task Provider Task Prompt + {output_score.name} + {#if output_score.type === "five_star"} + (1 to 5) + {:else if output_score.type === "pass_fail"} + (0 to 1) + {:else if output_score.type === "pass_fail_critical"} + (-1 to 1) + {:else} + ({output_score.type}) + {/if} +
+ {score != null ? score.toFixed(2) : "unknown"} +
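For reference, here is a hypothetical example of the score_summary payload this cell reads, shaped like EvalResultSummary in the API change above; the IDs and numbers are made up for illustration.

# Example JSON response, expressed as a Python dict for brevity:
score_summary = {
    "results": {
        # run_config_id -> score JSON key -> summary
        "run_config_1": {
            "accuracy": {"mean_score": 0.7},   # mean over eval runs that include this key
            "relevance": {"mean_score": 0.8},
        },
    },
    "run_config_percent_complete": {"run_config_1": 0.5},
}

# The table cell above is roughly equivalent to:
mean = (
    score_summary.get("results", {})
    .get("run_config_1", {})
    .get("accuracy", {})
    .get("mean_score")
)
print(f"{mean:.2f}" if mean is not None else "unknown")  # -> 0.70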
- - - - + {#each evaluator.output_scores as output_score} - {#each task_run_configs || [] as task_run_config} + {@const percent_complete = + score_summary?.run_config_percent_complete?.[ + "" + task_run_config.id + ]} { console.log("TODO: link") }} > - - - {#each evaluator.output_scores as output_score} {@const score = score_summary?.results?.["" + task_run_config.id]?.[ title_to_name(output_score.name) ]?.mean_score} - {/each} From dcf3a00d18ac1e4100a34eb7ba150b2894b58599 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 23 Feb 2025 10:49:09 -0500 Subject: [PATCH 047/102] Show eval set size in UI --- app/desktop/studio_server/eval_api.py | 5 ++++- app/desktop/studio_server/test_eval_api.py | 2 ++ app/web_ui/src/lib/api_schema.d.ts | 2 ++ .../[project_id]/[task_id]/[eval_id]/+page.svelte | 13 ++++++++++--- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index 5f2d7ee1..ea3d2ca6 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -107,6 +107,8 @@ class EvalResultSummary(BaseModel): results: Dict[str, Dict[str, ScoreSummary]] # run_config_id -> percent of the dataset that has been processed run_config_percent_complete: Dict[str, float] + # The total size of the dataset used for the eval + dataset_size: int def dataset_ids_in_filter(task: Task, filter_id: DatasetFilterId) -> Set[ID_TYPE]: @@ -291,7 +293,7 @@ async def get_eval_config_score_summary( if len(expected_dataset_ids) == 0: raise HTTPException( status_code=400, - detail="No dataset ids in eval set filter. Cannot compute score summary.", + detail="No dataset ids in eval set filter. Add items to your dataset matching the eval set filter.", ) # save a copy of the expected dataset ids for each run config, we'll update each as we process each eval run @@ -365,4 +367,5 @@ async def get_eval_config_score_summary( return EvalResultSummary( results=results, run_config_percent_complete=run_config_percent_complete, + dataset_size=len(expected_dataset_ids), ) diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index 841ea051..009671e5 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -530,6 +530,8 @@ async def test_get_eval_config_score_summary( results = top_level_result["results"] assert "run_config_percent_complete" in top_level_result run_config_percent_complete = top_level_result["run_config_percent_complete"] + assert "dataset_size" in top_level_result + assert top_level_result["dataset_size"] == 2 # Check average scores for run1 assert results["run1"]["accuracy"]["mean_score"] == 0.7 # (0.8 + 0.6) / 2 diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index 0d707853..2e44b7d3 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -1298,6 +1298,8 @@ export interface components { run_config_percent_complete: { [key: string]: number; }; + /** Dataset Size */ + dataset_size: number; }; /** * EvalState diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index 53902919..c8d3e914 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -240,7 +240,10 @@ return eval_config.name + " — " + 
parts.join(", ") } - function get_eval_properties(evaluator: Eval): UiProperty[] { + function get_eval_properties( + evaluator: Eval, + score_summary: EvalResultSummary | null, + ): UiProperty[] { const properties: UiProperty[] = [] properties.push({ @@ -263,9 +266,13 @@ value: outputs.join(", "), }) } + let eval_set_size = "" + if (score_summary) { + eval_set_size = " (" + score_summary.dataset_size + " items)" + } properties.push({ name: "Eval Set", - value: evaluator.eval_set_filter_id, + value: evaluator.eval_set_filter_id + eval_set_size, }) properties.push({ name: "Config Eval Set", @@ -494,7 +501,7 @@
- {#each get_eval_properties(evaluator) as property} + {#each get_eval_properties(evaluator, score_summary) as property}
{property.name}
{property.value} From 13755c71e181bbf1e2e68b3a5af37e8b153e2b66 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 23 Feb 2025 11:17:30 -0500 Subject: [PATCH 048/102] CR feedback: better names and strings --- .../json_schema_templates.test.ts | 24 +++++++++++-------- .../json_schema_templates.ts | 6 ++--- .../[task_id]/[eval_id]/+page.svelte | 7 +++--- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/app/web_ui/src/lib/utils/json_schema_editor/json_schema_templates.test.ts b/app/web_ui/src/lib/utils/json_schema_editor/json_schema_templates.test.ts index 6d47c363..f34191b5 100644 --- a/app/web_ui/src/lib/utils/json_schema_editor/json_schema_templates.test.ts +++ b/app/web_ui/src/lib/utils/json_schema_editor/json_schema_templates.test.ts @@ -1,5 +1,5 @@ import { - title_to_name, + string_to_json_key, schema_from_model, model_from_schema, typed_json_from_schema_model, @@ -8,37 +8,41 @@ import type { SchemaModel, JsonSchema } from "./json_schema_templates" import { describe, it, expect } from "vitest" import { KilnError } from "$lib/utils/error_handlers" -describe("title_to_name", () => { +describe("string_to_json_key", () => { it("converts spaces to underscores", () => { - expect(title_to_name("Hello World")).toBe("hello_world") + expect(string_to_json_key("Hello World")).toBe("hello_world") }) it("converts to lowercase", () => { - expect(title_to_name("UPPERCASE")).toBe("uppercase") + expect(string_to_json_key("UPPERCASE")).toBe("uppercase") }) it("removes special characters", () => { - expect(title_to_name("Special@#$Characters!")).toBe("specialcharacters") + expect(string_to_json_key("Special@#$Characters!")).toBe( + "specialcharacters", + ) }) it("keeps alphanumeric characters, underscores, and dots", () => { - expect(title_to_name("alpha123_numeric.test")).toBe("alpha123_numeric.test") + expect(string_to_json_key("alpha123_numeric.test")).toBe( + "alpha123_numeric.test", + ) }) it("handles empty string", () => { - expect(title_to_name("")).toBe("") + expect(string_to_json_key("")).toBe("") }) it("handles string with only special characters", () => { - expect(title_to_name("@#$%^&*")).toBe("") + expect(string_to_json_key("@#$%^&*")).toBe("") }) it("handles mixed case and special characters", () => { - expect(title_to_name("User Name (Display)")).toBe("user_name_display") + expect(string_to_json_key("User Name (Display)")).toBe("user_name_display") }) it("handles leading and trailing spaces", () => { - expect(title_to_name(" Trim Me ")).toBe("trim_me") + expect(string_to_json_key(" Trim Me ")).toBe("trim_me") }) }) diff --git a/app/web_ui/src/lib/utils/json_schema_editor/json_schema_templates.ts b/app/web_ui/src/lib/utils/json_schema_editor/json_schema_templates.ts index 4068ba4c..bf693735 100644 --- a/app/web_ui/src/lib/utils/json_schema_editor/json_schema_templates.ts +++ b/app/web_ui/src/lib/utils/json_schema_editor/json_schema_templates.ts @@ -41,8 +41,8 @@ export function model_from_schema_string(s: string): SchemaModel { return model_from_schema(JSON.parse(s)) } -export function title_to_name(title: string): string { - return title +export function string_to_json_key(s: string): string { + return s .trim() .toLowerCase() .replace(/ /g, "_") @@ -60,7 +60,7 @@ export function schema_from_model( if (!title) { throw new KilnError("Property is empty. 
Please provide a name.", null) } - const safe_name = title_to_name(m.properties[i].title) + const safe_name = string_to_json_key(m.properties[i].title) if (!safe_name) { throw new KilnError( "Property name only contains special characters. Must be alphanumeric. Provided name with issues: " + diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index c8d3e914..e8ede737 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -27,7 +27,7 @@ import AvailableModelsDropdown from "../../../../run/available_models_dropdown.svelte" import PromptTypeSelector from "../../../../run/prompt_type_selector.svelte" import Warning from "$lib/ui/warning.svelte" - import { title_to_name } from "$lib/utils/json_schema_editor/json_schema_templates" + import { string_to_json_key } from "$lib/utils/json_schema_editor/json_schema_templates" $: project_id = $page.params.project_id $: task_id = $page.params.task_id @@ -166,6 +166,7 @@ } async function get_score_summary() { + score_summary = null if (!current_eval_config_id) { score_summary_error = new KilnError("No eval config selected", null) return @@ -597,7 +598,7 @@
Run Config Name Task Model Task Provider Task Prompt Run Config + {output_score.name} {#if output_score.type === "five_star"} (1 to 5) @@ -595,36 +623,56 @@
{task_run_config.name} - {model_name( - task_run_config?.run_config_properties?.model_name, - $model_info, - )} - - {provider_name_from_id( - task_run_config?.run_config_properties - ?.model_provider_name, - )} - - {prompt_name_from_id( - task_run_config?.run_config_properties?.prompt_id, - )} +
+ {task_run_config.name} +
+
+ {model_name( + task_run_config?.run_config_properties?.model_name, + $model_info, + )} +
+
+ {provider_name_from_id( + task_run_config?.run_config_properties + ?.model_provider_name, + )} +
+
+ {prompt_name_from_id( + task_run_config?.run_config_properties?.prompt_id, + )} +
+ {#if percent_complete} +
+ Eval {(percent_complete * 100.0).toFixed(1)}% complete +
+ {:else if score_summary} + +
Eval 0% complete
+ {/if}
+ {score != null ? score.toFixed(2) : "unknown"}
- - + + + @@ -100,6 +102,7 @@ > + {/each} From e0a55327339111f9321ee7936a4046abc9faa469 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 23 Feb 2025 15:17:53 -0500 Subject: [PATCH 051/102] Much better prompt system for evals - Freeze non-frozen prompts into the task_run, so the evals are consisten - Expose frozen prompts via prompt UI --- app/desktop/studio_server/eval_api.py | 45 +++++-- app/desktop/studio_server/test_eval_api.py | 96 +++++++++++---- app/web_ui/src/lib/api_schema.d.ts | 50 +++++++- app/web_ui/src/lib/stores.ts | 2 +- .../[task_id]/[eval_id]/+page.svelte | 15 +-- .../[task_id]/saved/[prompt_id]/+page.svelte | 27 ++-- .../(app)/run/prompt_type_selector.svelte | 2 +- libs/core/kiln_ai/adapters/eval/g_eval.py | 18 ++- .../kiln_ai/adapters/eval/test_eval_runner.py | 10 +- .../core/kiln_ai/adapters/eval/test_g_eval.py | 10 +- libs/core/kiln_ai/adapters/prompt_builders.py | 64 +++++----- .../kiln_ai/adapters/test_prompt_builders.py | 116 ++++++------------ libs/core/kiln_ai/datamodel/eval.py | 4 - libs/core/kiln_ai/datamodel/prompt.py | 4 + libs/core/kiln_ai/datamodel/prompt_id.py | 21 +++- libs/core/kiln_ai/datamodel/task.py | 9 +- .../core/kiln_ai/datamodel/test_eval_model.py | 13 -- libs/core/kiln_ai/datamodel/test_prompt_id.py | 27 +++- libs/server/kiln_server/prompt_api.py | 32 ++++- 19 files changed, 348 insertions(+), 217 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index 7b4a99fe..e8fc3a68 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -22,6 +22,7 @@ EvalOutputScore, EvalTemplate, ) +from kiln_ai.datamodel.prompt_id import is_frozen_prompt from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig from kiln_ai.utils.name_generator import generate_memorable_name from kiln_server.task_api import task_from_id @@ -168,6 +169,33 @@ async def create_task_run_config( ) -> TaskRunConfig: task = task_from_id(project_id, task_id) name = request.name or generate_memorable_name() + + parent_project = task.parent_project() + if parent_project is None: + raise HTTPException( + status_code=400, + detail="Task must have a parent project.", + ) + + froze_prompt = False + prompt: BasePrompt | None = None + if not is_frozen_prompt(request.prompt_id): + # For dynamic prompts, we "freeze" a copy of this prompt into the task run config so we don't accidentially invalidate evals if the user changes something that impacts the prompt (example: chanding data for multi-shot, or chanding task for basic-prompt) + # We then point the task_run_config.run_properties.prompt_id to this new frozen prompt + froze_prompt = True + prompt_builder = prompt_builder_from_id(request.prompt_id, task) + prompt_name = generate_memorable_name() + prompt = BasePrompt( + name=prompt_name, + long_name=prompt_name + + " (frozen prompt from '" + + request.prompt_id + + "')", + generator_id=request.prompt_id, + prompt=prompt_builder.build_base_prompt(), + chain_of_thought_instructions=prompt_builder.chain_of_thought_prompt(), + ) + task_run_config = TaskRunConfig( parent=task, name=name, @@ -177,7 +205,13 @@ async def create_task_run_config( model_provider_name=request.model_provider_name, prompt_id=request.prompt_id, ), + prompt=prompt, ) + if froze_prompt: + # Set after, because the ID isn't known until the TaskRunConfig is created + task_run_config.run_config_properties.prompt_id = ( + f"task_run_config::{parent_project.id}::{task.id}::{task_run_config.id}" + ) task_run_config.save_to_file() return 
task_run_config @@ -190,19 +224,9 @@ async def create_eval_config( eval_id: str, request: CreateEvalConfigRequest, ) -> EvalConfig: - task = task_from_id(project_id, task_id) eval = eval_from_id(project_id, task_id, eval_id) name = request.name or generate_memorable_name() - # Create a prompt instance to save to the eval config - prompt_builder = prompt_builder_from_id(request.prompt_id, task) - prompt = BasePrompt( - name=request.prompt_id, - generator_id=request.prompt_id, - prompt=prompt_builder.build_base_prompt(), - chain_of_thought_instructions=prompt_builder.chain_of_thought_prompt(), - ) - eval_config = EvalConfig( name=name, config_type=request.type, @@ -215,7 +239,6 @@ async def create_eval_config( "adapter_name": "kiln_eval", }, ), - prompt=prompt, parent=eval, ) eval_config.save_to_file() diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index 009671e5..adbf3690 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -10,6 +10,7 @@ BasePrompt, DataSource, DataSourceType, + Project, PromptId, Task, ) @@ -47,12 +48,19 @@ def client(app): @pytest.fixture def mock_task(tmp_path): + project = Project( + id="project1", + name="Test Project", + path=tmp_path / "project.kiln", + ) + project.save_to_file() task = Task( id="task1", name="Test Task", description="Test Description", instruction="Test Instructions", path=tmp_path / "task.kiln", + parent=project, ) task.save_to_file() return task @@ -210,16 +218,23 @@ async def test_create_evaluator( async def test_create_task_run_config(client, mock_task_from_id, mock_task): mock_task_from_id.return_value = mock_task - response = client.post( - "/api/projects/project1/tasks/task1/task_run_config", - json={ - "name": "Test Task Run Config", - "description": "Test Description", - "model_name": "gpt-4o", - "model_provider_name": "openai", - "prompt_id": "simple_chain_of_thought_prompt_builder", - }, - ) + with ( + patch( + "app.desktop.studio_server.eval_api.generate_memorable_name" + ) as mock_generate_memorable_name, + ): + mock_generate_memorable_name.return_value = "Custom Name" + + response = client.post( + "/api/projects/project1/tasks/task1/task_run_config", + json={ + "name": "Test Task Run Config", + "description": "Test Description", + "model_name": "gpt-4o", + "model_provider_name": "openai", + "prompt_id": "simple_chain_of_thought_prompt_builder", + }, + ) assert response.status_code == 200 result = response.json() @@ -229,9 +244,13 @@ async def test_create_task_run_config(client, mock_task_from_id, mock_task): assert result["run_config_properties"]["model_provider_name"] == "openai" assert ( result["run_config_properties"]["prompt_id"] - == "simple_chain_of_thought_prompt_builder" + == "task_run_config::project1::task1::" + result["id"] + ) + assert result["prompt"]["name"] == "Custom Name" + assert ( + result["prompt"]["long_name"] + == "Custom Name (frozen prompt from 'simple_chain_of_thought_prompt_builder')" ) - # Fetch it from API fetch_response = client.get("/api/projects/project1/tasks/task1/task_run_configs") assert fetch_response.status_code == 200 @@ -239,6 +258,47 @@ async def test_create_task_run_config(client, mock_task_from_id, mock_task): assert len(configs) == 1 assert configs[0]["id"] == result["id"] assert configs[0]["name"] == result["name"] + assert configs[0]["prompt"]["name"] == "Custom Name" + assert configs[0]["prompt"]["long_name"] == ( + "Custom Name (frozen prompt from 
'simple_chain_of_thought_prompt_builder')" + ) + assert configs[0]["run_config_properties"]["prompt_id"] == ( + "task_run_config::project1::task1::" + result["id"] + ) + + +@pytest.mark.asyncio +async def test_create_task_run_config_without_freezing( + client, mock_task_from_id, mock_task +): + mock_task_from_id.return_value = mock_task + + with ( + patch( + "app.desktop.studio_server.eval_api.generate_memorable_name" + ) as mock_generate_memorable_name, + ): + mock_generate_memorable_name.return_value = "Custom Name" + + response = client.post( + "/api/projects/project1/tasks/task1/task_run_config", + json={ + "name": "Test Task Run Config", + "description": "Test Description", + "model_name": "gpt-4o", + "model_provider_name": "openai", + "prompt_id": "id::prompt_123", + }, + ) + + assert response.status_code == 200 + result = response.json() + assert result["name"] == "Test Task Run Config" + assert result["description"] == "Test Description" + assert result["run_config_properties"]["model_name"] == "gpt-4o" + assert result["run_config_properties"]["model_provider_name"] == "openai" + assert result["run_config_properties"]["prompt_id"] == "id::prompt_123" + assert result["prompt"] is None @pytest.mark.asyncio @@ -249,15 +309,8 @@ async def test_create_eval_config( with ( patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id, - patch( - "app.desktop.studio_server.eval_api.prompt_builder_from_id" - ) as mock_prompt_builder, ): mock_eval_from_id.return_value = mock_eval - mock_prompt_builder.return_value.build_base_prompt.return_value = "base prompt" - mock_prompt_builder.return_value.chain_of_thought_prompt.return_value = ( - "cot prompt" - ) response = client.post( "/api/projects/project1/tasks/task1/eval/eval1/create_eval_config", @@ -278,8 +331,6 @@ async def test_create_eval_config( result["model"]["properties"]["model_provider"] == valid_eval_config_request.provider ) - assert isinstance(result["prompt"], dict) - # mock_save.assert_called_once() # Fetch disk assert len(mock_eval.configs()) == 1 @@ -291,8 +342,6 @@ async def test_create_eval_config( assert ( config.model.properties["model_provider"] == valid_eval_config_request.provider ) - assert config.prompt.prompt == "base prompt" - assert config.prompt.chain_of_thought_instructions == "cot prompt" assert config.properties["eval_steps"][0] == "step1" assert config.properties["eval_steps"][1] == "step2" @@ -317,7 +366,6 @@ def test_get_eval_configs( assert config["config_type"] == mock_eval_config.config_type assert config["properties"] == mock_eval_config.properties assert config["model"]["type"] == mock_eval_config.model.type - assert isinstance(config["prompt"], dict) mock_eval_from_id.assert_called_once_with("project1", "task1", "eval1") diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index 2e44b7d3..3eb9417b 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -814,6 +814,40 @@ export interface paths { export type webhooks = Record; export interface components { schemas: { + /** ApiPrompt */ + ApiPrompt: { + /** + * Name + * @description A name for this entity. + */ + name: string; + /** + * Long Name + * @description A more detailed name for the prompt, usually incorporating the source of the prompt. + */ + long_name?: string | null; + /** + * Generator Id + * @description The id of the generator that created this prompt. + */ + generator_id?: string | null; + /** + * Prompt + * @description The prompt for the task. 
+ */ + prompt: string; + /** + * Chain Of Thought Instructions + * @description Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided. + */ + chain_of_thought_instructions?: string | null; + /** Id */ + id: string; + /** Created At */ + created_at?: string | null; + /** Created By */ + created_by?: string | null; + }; /** AvailableModels */ AvailableModels: { /** Provider Name */ @@ -835,6 +869,11 @@ export interface components { * @description A name for this entity. */ name: string; + /** + * Long Name + * @description A more detailed name for the prompt, usually incorporating the source of the prompt. + */ + long_name?: string | null; /** * Generator Id * @description The id of the generator that created this prompt. @@ -1256,8 +1295,6 @@ export interface components { * @default {} */ properties: Record; - /** @description The prompt to use for this eval config. Both when running the task to generate outputs to evaluate and when explaining to the eval model what the goal of the task was. This is a frozen prompt, so this eval config is consistent over time (for example, if the user selects multi-shot prompting, this saves that dynamic prompt at the point the eval config is created). Freezing the prompt ensures consistent evals. */ - prompt: components["schemas"]["BasePrompt"]; /** Model Type */ readonly model_type: string; }; @@ -1658,6 +1695,11 @@ export interface components { * @description A name for this entity. */ name: string; + /** + * Long Name + * @description A more detailed name for the prompt, usually incorporating the source of the prompt. + */ + long_name?: string | null; /** * Generator Id * @description The id of the generator that created this prompt. @@ -1726,7 +1768,7 @@ export interface components { /** Generators */ generators: components["schemas"]["PromptGenerator"][]; /** Prompts */ - prompts: components["schemas"]["Prompt"][]; + prompts: components["schemas"]["ApiPrompt"][]; }; /** ProviderModel */ ProviderModel: { @@ -2255,6 +2297,8 @@ export interface components { description?: string | null; /** @description The run config properties to use for this task run. */ run_config_properties: components["schemas"]["RunConfigProperties"]; + /** @description A prompt to use for run config. 
*/ + prompt?: components["schemas"]["BasePrompt"] | null; /** Model Type */ readonly model_type: string; }; diff --git a/app/web_ui/src/lib/stores.ts b/app/web_ui/src/lib/stores.ts index 5aefc889..a86dbe25 100644 --- a/app/web_ui/src/lib/stores.ts +++ b/app/web_ui/src/lib/stores.ts @@ -238,7 +238,7 @@ export function prompt_name_from_id(prompt_id: string): string { } if (!prompt_name) { prompt_name = get(current_task_prompts)?.prompts.find( - (prompt) => "id::" + prompt.id === prompt_id, + (prompt) => prompt.id === prompt_id, )?.name } if (!prompt_name) { diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index e8ede737..69d8746e 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -237,7 +237,6 @@ parts.push( model_name(eval_config.model.properties["model_name"], model_info), ) - parts.push(prompt_name_from_id(eval_config.prompt.name)) return eval_config.name + " — " + parts.join(", ") } @@ -317,11 +316,6 @@ eval_config.model.properties["model_provider"] + "", ), }) - // TODO remove this once we consolidate prompts - properties.push({ - name: "Prompt", - value: prompt_name_from_id(eval_config.prompt.name + ""), - }) return properties } @@ -658,9 +652,12 @@ )}
- {prompt_name_from_id( - task_run_config?.run_config_properties?.prompt_id, - )} + Prompt: + {task_run_config.prompt?.long_name || + task_run_config.prompt?.name || + prompt_name_from_id( + task_run_config?.run_config_properties?.prompt_id, + )}
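The prompt name rendered here comes from the prompt-freezing step added to create_task_run_config earlier in this patch. As a reading aid, here is a condensed, illustrative restatement of that flow; the helper name freeze_prompt_into_run_config is invented, and request parsing, validation, and HTTP error handling are omitted.

from kiln_ai.adapters.prompt_builders import prompt_builder_from_id
from kiln_ai.datamodel import BasePrompt
from kiln_ai.datamodel.prompt_id import is_frozen_prompt
from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig
from kiln_ai.utils.name_generator import generate_memorable_name

def freeze_prompt_into_run_config(project, task, model_name, provider, prompt_id):
    frozen_prompt = None
    if not is_frozen_prompt(prompt_id):
        # Dynamic prompts (multi-shot, basic, etc.) can change as the task or its
        # dataset changes, so snapshot the rendered prompt into the run config.
        builder = prompt_builder_from_id(prompt_id, task)
        name = generate_memorable_name()
        frozen_prompt = BasePrompt(
            name=name,
            long_name=f"{name} (frozen prompt from '{prompt_id}')",
            generator_id=prompt_id,
            prompt=builder.build_base_prompt(),
            chain_of_thought_instructions=builder.chain_of_thought_prompt(),
        )

    run_config = TaskRunConfig(
        parent=task,
        name=generate_memorable_name(),
        run_config_properties=RunConfigProperties(
            model_name=model_name,
            model_provider_name=provider,
            prompt_id=prompt_id,
        ),
        prompt=frozen_prompt,
    )
    if frozen_prompt is not None:
        # Repoint after construction: the run config's ID is only known once the
        # object exists, and the frozen copy lives on that run config.
        run_config.run_config_properties.prompt_id = (
            f"task_run_config::{project.id}::{task.id}::{run_config.id}"
        )
    run_config.save_to_file()
    return run_config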
{#if percent_complete}
import { page } from "$app/stores" - import { current_task, current_task_prompts } from "$lib/stores" + import { + current_task, + current_task_prompts, + prompt_name_from_id, + } from "$lib/stores" import AppPage from "../../../../../app_page.svelte" import Output from "../../../../../run/output.svelte" import { formatDate } from "$lib/utils/formatters" @@ -11,17 +15,22 @@ $: prompt_model = $current_task_prompts?.prompts.find( (prompt) => prompt.id === prompt_id, ) - let prompt_props = {} + let prompt_props: Record = {} $: { prompt_props = Object.fromEntries( Object.entries({ ID: prompt_model?.id, + Name: prompt_model?.name, + "Long Name": prompt_model?.long_name, "Created By": prompt_model?.created_by, - "Created At": formatDate(prompt_model?.created_at), + "Created At": formatDate(prompt_model?.created_at || undefined), "Chain of Thought": prompt_model?.chain_of_thought_instructions ? "Yes" : "No", - }).filter(([_, value]) => value !== undefined), + "Source Generator": prompt_model?.generator_id + ? prompt_name_from_id(prompt_model?.generator_id) + : undefined, + }).filter(([_, value]) => value !== undefined && value !== null), ) } @@ -29,9 +38,7 @@
{#if !$current_task_prompts}
@@ -55,14 +62,16 @@ {/if}
-
+
Details
{#each Object.entries(prompt_props) as [key, value]}
{key}
-
+
{value}
{/each} diff --git a/app/web_ui/src/routes/(app)/run/prompt_type_selector.svelte b/app/web_ui/src/routes/(app)/run/prompt_type_selector.svelte index 1c222d3f..3b310ccd 100644 --- a/app/web_ui/src/routes/(app)/run/prompt_type_selector.svelte +++ b/app/web_ui/src/routes/(app)/run/prompt_type_selector.svelte @@ -49,7 +49,7 @@ if (prompt.chain_of_thought_instructions && exclude_cot) { continue } - static_prompts.push(["id::" + prompt.id, prompt.name]) + static_prompts.push([prompt.id, prompt.name]) } if (static_prompts.length > 0) { grouped_options.push(["Saved Prompts", static_prompts]) diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py index f0a12d02..75ffed12 100644 --- a/libs/core/kiln_ai/adapters/eval/g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -4,7 +4,7 @@ from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.eval.base_eval import BaseEval from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput -from kiln_ai.adapters.prompt_builders import PromptGenerators +from kiln_ai.adapters.prompt_builders import PromptGenerators, prompt_builder_from_id from kiln_ai.datamodel import Project, Task, TaskRun from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType, EvalScores from kiln_ai.datamodel.task import RunConfig @@ -30,15 +30,25 @@ class GEvalTask(Task, parent_of={}): Note G-Eval implements both G-Eval and LLM as Judge as they are very similar. """ - def __init__(self, eval_config: EvalConfig): + def __init__(self, eval_config: EvalConfig, run_config: RunConfig): tmp_project = Project(name="GEval") + eval = eval_config.parent_eval() + if not eval: + raise ValueError("Eval config must have a parent eval") + task = eval.parent_task() + if not task: + raise ValueError("Eval must have a parent task") + + prompt_builder = prompt_builder_from_id(run_config.prompt_id, task) + base_prompt = prompt_builder.build_base_prompt() + system_instruction = f""" Your job to evaluate a model's performance on a task. Blocks will be marked with tags. 
The task the model was given is as follows: -{eval_config.prompt.prompt} +{base_prompt} """ @@ -88,7 +98,7 @@ def __init__(self, eval_config: EvalConfig, run_config: RunConfig): super().__init__(eval_config, run_config) - self.geval_task = GEvalTask(eval_config) + self.geval_task = GEvalTask(eval_config, run_config) async def run_eval(self, task_run: TaskRun) -> EvalScores: """ diff --git a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py index 8aa47ec2..62dc57a2 100644 --- a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py +++ b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py @@ -89,7 +89,10 @@ def mock_run_config( run_config_properties=RunConfigProperties( model_name="gpt-4", model_provider_name="openai", - prompt_id="simple_prompt_builder", + prompt=BasePrompt( + name="test", + prompt="test", + ), ), parent=mock_task, ) @@ -234,7 +237,10 @@ def test_collect_tasks_multiple_run_configs( run_config_properties=RunConfigProperties( model_name="gpt-3.5", model_provider_name="openai", - prompt_id="simple_prompt_builder", + prompt=BasePrompt( + name="test", + prompt="test", + ), ), parent=mock_task, ) diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval.py b/libs/core/kiln_ai/adapters/eval/test_g_eval.py index e24fcb8b..815e9457 100644 --- a/libs/core/kiln_ai/adapters/eval/test_g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval.py @@ -82,11 +82,6 @@ def test_eval_config(test_task): "adapter_name": "openai_compatible", }, ), - prompt=BasePrompt( - # TODO ensure it's called with the frozen prompt - name="Joke Generator Frozen Prompt", - prompt=test_task.instruction, - ), properties={ "eval_steps": [ "Is the joke funny?", @@ -106,8 +101,11 @@ def test_run_config(test_task): return RunConfig( model_name="llama_3_1_8b", model_provider_name="groq", - prompt_id="simple_prompt_builder", task=test_task, + prompt=BasePrompt( + name="test", + prompt="test", + ), ) diff --git a/libs/core/kiln_ai/adapters/prompt_builders.py b/libs/core/kiln_ai/adapters/prompt_builders.py index 68f58c94..b54d4832 100644 --- a/libs/core/kiln_ai/adapters/prompt_builders.py +++ b/libs/core/kiln_ai/adapters/prompt_builders.py @@ -1,9 +1,6 @@ import json from abc import ABCMeta, abstractmethod -from enum import Enum -from typing import Annotated, Dict - -from pydantic import AfterValidator +from typing import Dict from kiln_ai.datamodel import PromptGenerators, PromptId, Task, TaskRun from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error @@ -292,48 +289,44 @@ def chain_of_thought_prompt(self) -> str | None: return self.prompt_model.chain_of_thought_instructions -class EvalPromptBuilder(BasePromptBuilder): - """A prompt builder that looks up a static prompt in an eval config.""" +class TaskRunConfigPromptBuilder(BasePromptBuilder): + """A prompt builder that looks up a static prompt in a task run config.""" - def __init__(self, task: Task, eval_config_prompt_id: str): - parts = eval_config_prompt_id.split("::") - if len(parts) != 5: + def __init__(self, task: Task, run_config_prompt_id: str): + parts = run_config_prompt_id.split("::") + if len(parts) != 4: raise ValueError( - f"Invalid eval prompt ID: {eval_config_prompt_id}. Expected format: 'eval_prompt::[project_id]::[task_id]::[eval_id]::[eval_config_id]'." + f"Invalid task run config prompt ID: {run_config_prompt_id}. Expected format: 'task_run_config::[project_id]::[task_id]::[run_config_id]'." 
) task_id = parts[2] if task_id != task.id: raise ValueError( - f"Eval prompt ID: {eval_config_prompt_id}. Task ID mismatch. Expected: {task.id}, got: {task_id}." - ) - - eval_id = parts[3] - eval = next( - (eval for eval in task.evals(readonly=True) if eval.id == eval_id), - None, - ) - if not eval: - raise ValueError( - f"Eval ID not found: {eval_id} for prompt id {eval_config_prompt_id}" + f"Task run config prompt ID: {run_config_prompt_id}. Task ID mismatch. Expected: {task.id}, got: {task_id}." ) - eval_config_id = parts[4] - eval_config = next( + run_config_id = parts[3] + run_config = next( ( - eval_config - for eval_config in eval.configs(readonly=True) - if eval_config.id == eval_config_id + run_config + for run_config in task.run_configs(readonly=True) + if run_config.id == run_config_id ), None, ) - if not eval_config: + if not run_config: + raise ValueError( + f"Task run config ID not found: {run_config_id} for prompt id {run_config_prompt_id}" + ) + if run_config.prompt is None: raise ValueError( - f"Eval config ID not found: {eval_config_id} for prompt id {eval_config_prompt_id}" + f"Task run config ID {run_config_id} does not have a stored prompt. Used as prompt id {run_config_prompt_id}" ) - self.prompt_model = eval_config.prompt - self.id = eval_config_prompt_id + # Load the prompt from the model + self.prompt = run_config.prompt.prompt + self.cot_prompt = run_config.prompt.chain_of_thought_instructions + self.id = run_config_prompt_id super().__init__(task) @@ -341,10 +334,10 @@ def prompt_id(self) -> str | None: return self.id def build_base_prompt(self) -> str: - return self.prompt_model.prompt + return self.prompt def chain_of_thought_prompt(self) -> str | None: - return self.prompt_model.chain_of_thought_instructions + return self.cot_prompt class FineTunePromptBuilder(BasePromptBuilder): @@ -403,9 +396,10 @@ def prompt_builder_from_id(prompt_id: PromptId, task: Task) -> BasePromptBuilder prompt_id = prompt_id[4:] return SavedPromptBuilder(task, prompt_id) - # Eval prompts are prefixed with "eval_prompt::" - if prompt_id.startswith("eval_prompt::"): - return EvalPromptBuilder(task, prompt_id) + # Task run config prompts are prefixed with "task_run_config::" + # task_run_config::[project_id]::[task_id]::[run_config_id] + if prompt_id.startswith("task_run_config::"): + return TaskRunConfigPromptBuilder(task, prompt_id) # Fine-tune prompts are prefixed with "fine_tune_prompt::" if prompt_id.startswith("fine_tune_prompt::"): diff --git a/libs/core/kiln_ai/adapters/test_prompt_builders.py b/libs/core/kiln_ai/adapters/test_prompt_builders.py index bad1d1e4..43674375 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_builders.py +++ b/libs/core/kiln_ai/adapters/test_prompt_builders.py @@ -1,14 +1,12 @@ import json import pytest -from pydantic import BaseModel, ValidationError from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter from kiln_ai.adapters.model_adapters.test_structured_output import ( build_structured_output_test_task, ) from kiln_ai.adapters.prompt_builders import ( - EvalPromptBuilder, FewShotChainOfThoughtPromptBuilder, FewShotPromptBuilder, FineTunePromptBuilder, @@ -18,6 +16,7 @@ SavedPromptBuilder, SimpleChainOfThoughtPromptBuilder, SimplePromptBuilder, + TaskRunConfigPromptBuilder, chain_of_thought_prompt, prompt_builder_from_id, ) @@ -36,7 +35,7 @@ TaskOutputRating, TaskRun, ) -from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType, EvalOutputScore +from kiln_ai.datamodel.task import RunConfigProperties, Task, TaskRunConfig 
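To make the new prompt ID scheme concrete, here is a small illustrative snippet (not additional library code), assuming the kiln_ai package at this point in the series; is_frozen_prompt is added in the prompt_id.py change further below.

from kiln_ai.datamodel.prompt_id import is_frozen_prompt

# Format: task_run_config::[project_id]::[task_id]::[run_config_id]
prompt_id = "task_run_config::project_123::task_456::config_789"
_, project_id, task_id, run_config_id = prompt_id.split("::")
print(project_id, task_id, run_config_id)  # project_123 task_456 config_789

# IDs that already point at stored prompt text are treated as frozen; only
# generator IDs (like simple_prompt_builder) get a frozen copy made for them.
print(is_frozen_prompt("simple_prompt_builder"))  # False
print(is_frozen_prompt(prompt_id))                # True

Given a real Task, prompt_builder_from_id resolves such an ID to a TaskRunConfigPromptBuilder that serves the stored prompt and chain-of-thought text, as exercised by the tests below.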
def test_simple_prompt_builder(tmp_path): @@ -589,107 +588,62 @@ def test_build_prompt_with_json_instructions(tmp_path): assert requirement.instruction in prompt_with_json -@pytest.fixture -def valid_eval_config_datasource(): - return DataSource( - type=DataSourceType.synthetic, - properties={ - "model_name": "gpt-4", - "model_provider": "openai", - "adapter_name": "openai_compatible", - }, - ) - - -def test_eval_prompt_builder(tmp_path, valid_eval_config_datasource): +def test_task_run_config_prompt_builder(tmp_path): task = build_test_task(tmp_path) - # Create an eval and eval config - eval = Eval( - name="test_eval", + run_config = TaskRunConfig( + name="test_run_config", parent=task, - eval_set_filter_id="tag::tag1", - eval_configs_filter_id="tag::tag2", - output_scores=[ - EvalOutputScore( - name="accuracy", - type="five_star", - ), - ], - ) - eval.save_to_file() - - eval_config = EvalConfig( - name="test_eval_config", - parent=eval, - config_type=EvalConfigType.g_eval, - model=valid_eval_config_datasource, + run_config_properties=RunConfigProperties( + model_name="gpt-4", + model_provider_name="openai", + prompt_id="simple_prompt_builder", + ), prompt=Prompt( - name="test_prompt", - prompt="test_eval_prompt", - chain_of_thought_instructions="Think carefully", + name="test prompt name", + prompt="test prompt content", + chain_of_thought_instructions="test step by step", ), - properties={"eval_steps": ["step1", "step2"]}, ) - eval_config.save_to_file() + run_config.save_to_file() # Construct the eval prompt ID - eval_prompt_id = ( - f"eval_prompt::{task.parent.id}::{task.id}::{eval.id}::{eval_config.id}" + run_config_prompt_id = ( + f"task_run_config::{task.parent.id}::{task.id}::{run_config.id}" ) - # Test successful creation, constructor and ID creation + # Test successful creation 2 ways: constructor and ID creation builders = [ - EvalPromptBuilder(task=task, eval_config_prompt_id=eval_prompt_id), - prompt_builder_from_id(eval_prompt_id, task), + TaskRunConfigPromptBuilder( + task=task, run_config_prompt_id=run_config_prompt_id + ), + prompt_builder_from_id(run_config_prompt_id, task), ] for builder in builders: assert ( - builder.build_prompt(include_json_instructions=False) == "test_eval_prompt" + builder.build_prompt(include_json_instructions=False) + == "test prompt content" ) - assert builder.chain_of_thought_prompt() == "Think carefully" - assert builder.prompt_id() == eval_prompt_id + assert builder.chain_of_thought_prompt() == "test step by step" + assert builder.prompt_id() == run_config_prompt_id - # test accessor - -def test_eval_prompt_builder_validation_errors(tmp_path): +def test_task_run_config_prompt_builder_validation_errors(tmp_path): task = build_test_task(tmp_path) # Test invalid format - with pytest.raises(ValueError, match="Invalid eval prompt ID"): - EvalPromptBuilder(task=task, eval_config_prompt_id="eval_prompt::wrong::format") + with pytest.raises(ValueError, match="Invalid task run config prompt ID"): + TaskRunConfigPromptBuilder( + task=task, run_config_prompt_id="task_run_config::wrong::format" + ) # Test task ID mismatch - wrong_task_id = f"eval_prompt::{task.parent.id}::wrong_task_id::eval_id::config_id" + wrong_task_id = f"task_run_config::{task.parent.id}::wrong_task_id::config_id" with pytest.raises(ValueError, match="Task ID mismatch"): - EvalPromptBuilder(task=task, eval_config_prompt_id=wrong_task_id) + TaskRunConfigPromptBuilder(task=task, run_config_prompt_id=wrong_task_id) # Test eval not found - nonexistent_eval = ( - 
f"eval_prompt::{task.parent.id}::{task.id}::nonexistent_eval::config_id" - ) - with pytest.raises(ValueError, match="Eval ID not found"): - EvalPromptBuilder(task=task, eval_config_prompt_id=nonexistent_eval) - - # Create eval but test config not found - eval = Eval( - name="test_eval", - parent=task, - eval_set_filter_id="tag::tag1", - eval_configs_filter_id="tag::tag2", - output_scores=[ - EvalOutputScore( - name="accuracy", - type="five_star", - ), - ], - ) - eval.save_to_file() - - nonexistent_config = ( - f"eval_prompt::{task.parent.id}::{task.id}::{eval.id}::nonexistent_config" - ) - with pytest.raises(ValueError, match="Eval config ID not found"): - EvalPromptBuilder(task=task, eval_config_prompt_id=nonexistent_config) + nonexistent_eval = f"task_run_config::{task.parent.id}::{task.id}::nonexistent_id" + with pytest.raises(ValueError, match="Task run config ID not found"): + TaskRunConfigPromptBuilder(task=task, run_config_prompt_id=nonexistent_eval) diff --git a/libs/core/kiln_ai/datamodel/eval.py b/libs/core/kiln_ai/datamodel/eval.py index 0bad43c2..6cfcc612 100644 --- a/libs/core/kiln_ai/datamodel/eval.py +++ b/libs/core/kiln_ai/datamodel/eval.py @@ -14,7 +14,6 @@ from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType from kiln_ai.datamodel.dataset_filters import DatasetFilterId from kiln_ai.datamodel.json_schema import string_to_json_key -from kiln_ai.datamodel.prompt import BasePrompt from kiln_ai.datamodel.task_output import DataSource, DataSourceType from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error @@ -182,9 +181,6 @@ class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun} default={}, description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.", ) - prompt: BasePrompt = Field( - description="The prompt to use for this eval config. Both when running the task to generate outputs to evaluate and when explaining to the eval model what the goal of the task was. This is a frozen prompt, so this eval config is consistent over time (for example, if the user selects multi-shot prompting, this saves that dynamic prompt at the point the eval config is created). Freezing the prompt ensures consistent evals." 
- ) def parent_eval(self) -> Union["Eval", None]: if self.parent is not None and self.parent.__class__.__name__ != "Eval": diff --git a/libs/core/kiln_ai/datamodel/prompt.py b/libs/core/kiln_ai/datamodel/prompt.py index 650712d9..3bcd44e6 100644 --- a/libs/core/kiln_ai/datamodel/prompt.py +++ b/libs/core/kiln_ai/datamodel/prompt.py @@ -11,6 +11,10 @@ class BasePrompt(BaseModel): """ name: str = NAME_FIELD + long_name: str | None = Field( + default=None, + description="A more detailed name for the prompt, usually incorporating the source of the prompt.", + ) generator_id: str | None = Field( default=None, description="The id of the generator that created this prompt.", diff --git a/libs/core/kiln_ai/datamodel/prompt_id.py b/libs/core/kiln_ai/datamodel/prompt_id.py index 4285aa00..2d2c5f02 100644 --- a/libs/core/kiln_ai/datamodel/prompt_id.py +++ b/libs/core/kiln_ai/datamodel/prompt_id.py @@ -48,12 +48,12 @@ def _check_prompt_id(id: str) -> str: ) return id - if id.startswith("eval_prompt::"): - # check it had a eval_id after the :: -- 'project_id::task_id::eval_id::eval_config_id' + if id.startswith("task_run_config::"): + # check it had a eval_id after the :: -- 'project_id::task_id::task_run_config_id' parts = id.split("::") - if len(parts) != 5: + if len(parts) != 4: raise ValueError( - f"Invalid eval prompt ID: {id}. Expected format: 'eval_prompt::[project_id]::[task_id]::[eval_id]'." + f"Invalid task run config prompt ID: {id}. Expected format: 'task_run_config::[project_id]::[task_id]::[task_run_config_id]'." ) return id @@ -67,3 +67,16 @@ def _check_prompt_id(id: str) -> str: return id raise ValueError(f"Invalid prompt ID: {id}") + + +def is_frozen_prompt(id: PromptId) -> bool: + """ + Check if the prompt ID is a frozen prompt. + """ + if id.startswith("id::"): + return True + if id.startswith("task_run_config::"): + return True + if id.startswith("fine_tune_prompt::"): + return True + return False diff --git a/libs/core/kiln_ai/datamodel/task.py b/libs/core/kiln_ai/datamodel/task.py index 52368868..af0bfb6d 100644 --- a/libs/core/kiln_ai/datamodel/task.py +++ b/libs/core/kiln_ai/datamodel/task.py @@ -16,7 +16,7 @@ from kiln_ai.datamodel.dataset_split import DatasetSplit from kiln_ai.datamodel.eval import Eval from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str -from kiln_ai.datamodel.prompt import Prompt +from kiln_ai.datamodel.prompt import BasePrompt, Prompt from kiln_ai.datamodel.prompt_id import PromptGenerators, PromptId from kiln_ai.datamodel.task_run import TaskRun @@ -85,6 +85,13 @@ class TaskRunConfig(KilnParentedModel): run_config_properties: RunConfigProperties = Field( description="The run config properties to use for this task run." ) + # We usually want to persist the exact prompt, not just a prompt ID. + # We want the prompt to be perfectly consistent, and some prompt_ids are dynamic. + # The prompt ID in the run_config_properties likely points to this (although it's not required). 
+ prompt: BasePrompt | None = Field( + default=None, + description="A prompt to use for run config.", + ) # Workaround to return typed parent without importing Task def parent_task(self) -> Union["Task", None]: diff --git a/libs/core/kiln_ai/datamodel/test_eval_model.py b/libs/core/kiln_ai/datamodel/test_eval_model.py index 44623539..911e9272 100644 --- a/libs/core/kiln_ai/datamodel/test_eval_model.py +++ b/libs/core/kiln_ai/datamodel/test_eval_model.py @@ -44,10 +44,6 @@ def valid_eval_config_data(): "adapter_name": "openai_compatible", }, ), - "prompt": BasePrompt( - name="Test Prompt", - prompt="Test prompt", - ), } @@ -64,15 +60,6 @@ def test_eval_config_valid(valid_eval_config): assert valid_eval_config.model.properties["model_name"] == "gpt-4" assert valid_eval_config.model.properties["model_provider"] == "openai" assert valid_eval_config.model.properties["adapter_name"] == "openai_compatible" - assert valid_eval_config.prompt.name == "Test Prompt" - assert valid_eval_config.prompt.prompt == "Test prompt" - - -def test_eval_config_missing_prompt(valid_eval_config): - with pytest.raises( - ValueError, match="Input should be a valid dictionary or instance of BasePromp" - ): - valid_eval_config.prompt = None def test_eval_config_missing_eval_steps(valid_eval_config): diff --git a/libs/core/kiln_ai/datamodel/test_prompt_id.py b/libs/core/kiln_ai/datamodel/test_prompt_id.py index 23cd1d3a..cf5d2326 100644 --- a/libs/core/kiln_ai/datamodel/test_prompt_id.py +++ b/libs/core/kiln_ai/datamodel/test_prompt_id.py @@ -5,6 +5,7 @@ PromptGenerators, PromptId, ) +from kiln_ai.datamodel.prompt_id import is_frozen_prompt # Test model to validate the PromptId type @@ -90,10 +91,10 @@ def test_prompt_generator_case_sensitivity(): @pytest.mark.parametrize( "valid_id", [ - "eval_prompt::project_123::task_456::eval_789::config_012", # Valid eval prompt ID + "task_run_config::project_123::task_456::config_123", # Valid task run config prompt ID ], ) -def test_valid_eval_prompt_id(valid_id): +def test_valid_task_run_config_prompt_id(valid_id): """Test that valid eval prompt IDs are accepted""" model = ModelTester(prompt_id=valid_id) assert model.prompt_id == valid_id @@ -102,13 +103,27 @@ def test_valid_eval_prompt_id(valid_id): @pytest.mark.parametrize( "invalid_id,expected_error", [ - ("eval_prompt::", "Invalid eval prompt ID"), - ("eval_prompt::p1::t1", "Invalid eval prompt ID"), - ("eval_prompt::p1::t1::e1", "Invalid eval prompt ID"), - ("eval_prompt::p1::t1::e1::c1::extra", "Invalid eval prompt ID"), + ("task_run_config::", "Invalid task run config prompt ID"), + ("task_run_config::p1", "Invalid task run config prompt ID"), + ("task_run_config::p1::t1", "Invalid task run config prompt ID"), + ("task_run_config::p1::t1::c1::extra", "Invalid task run config prompt ID"), ], ) def test_invalid_eval_prompt_id_format(invalid_id, expected_error): """Test that invalid eval prompt ID formats are rejected""" with pytest.raises(ValidationError, match=expected_error): ModelTester(prompt_id=invalid_id) + + +@pytest.mark.parametrize( + "id,should_be_frozen", + [ + ("simple_prompt_builder", False), + ("id::prompt_123", True), + ("task_run_config::p1::t1", True), + ("fine_tune_prompt::ft_123", True), + ], +) +def test_is_frozen_prompt(id, should_be_frozen): + """Test that the is_frozen_prompt function works""" + assert is_frozen_prompt(id) == should_be_frozen diff --git a/libs/server/kiln_server/prompt_api.py b/libs/server/kiln_server/prompt_api.py index 0b17cbb1..40c5a56c 100644 --- 
a/libs/server/kiln_server/prompt_api.py +++ b/libs/server/kiln_server/prompt_api.py @@ -1,10 +1,19 @@ +from datetime import datetime + from fastapi import FastAPI -from kiln_ai.datamodel import Prompt +from kiln_ai.datamodel import BasePrompt, Prompt, PromptId from pydantic import BaseModel from kiln_server.task_api import task_from_id +# This is a wrapper around the Prompt datamodel that adds an id field which represents the PromptID and not the data model ID. +class ApiPrompt(BasePrompt): + id: PromptId + created_at: datetime | None = None + created_by: str | None = None + + class PromptCreateRequest(BaseModel): name: str prompt: str @@ -21,7 +30,7 @@ class PromptGenerator(BaseModel): class PromptResponse(BaseModel): generators: list[PromptGenerator] - prompts: list[Prompt] + prompts: list[ApiPrompt] def connect_prompt_api(app: FastAPI): @@ -43,9 +52,26 @@ async def create_prompt( async def get_prompts(project_id: str, task_id: str) -> PromptResponse: parent_task = task_from_id(project_id, task_id) + prompts: list[ApiPrompt] = [] + for prompt in parent_task.prompts(): + properties = prompt.model_dump(exclude={"id"}) + prompts.append(ApiPrompt(id=f"id::{prompt.id}", **properties)) + + # Add any task run config prompts to the list + task_run_configs = parent_task.run_configs() + for task_run_config in task_run_configs: + if task_run_config.prompt: + properties = task_run_config.prompt.model_dump(exclude={"id"}) + prompts.append( + ApiPrompt( + id=f"task_run_config::{project_id}::{task_id}::{task_run_config.id}", + **properties, + ) + ) + return PromptResponse( generators=_prompt_generators, - prompts=parent_task.prompts(), + prompts=prompts, ) From e3a6a27a96825d2f508b1dd307acfa4838e51e34 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 23 Feb 2025 16:00:15 -0500 Subject: [PATCH 052/102] CR feedback --- app/desktop/studio_server/eval_api.py | 10 ++++------ app/desktop/studio_server/test_eval_api.py | 4 +++- .../kiln_ai/adapters/eval/test_eval_runner.py | 14 ++------------ libs/core/kiln_ai/adapters/eval/test_g_eval.py | 5 +---- .../model_adapters/langchain_adapters.py | 4 +--- .../model_adapters/openai_model_adapter.py | 6 ++---- .../model_adapters/test_base_adapter.py | 11 +++++++++-- .../kiln_ai/adapters/test_prompt_builders.py | 4 +--- libs/core/kiln_ai/datamodel/task.py | 1 - libs/core/kiln_ai/datamodel/test_basemodel.py | 1 + libs/core/kiln_ai/datamodel/test_task.py | 18 +++++++++++++++--- 11 files changed, 39 insertions(+), 39 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index e8fc3a68..dce33a6f 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -177,15 +177,13 @@ async def create_task_run_config( detail="Task must have a parent project.", ) - froze_prompt = False - prompt: BasePrompt | None = None + frozen_prompt: BasePrompt | None = None if not is_frozen_prompt(request.prompt_id): # For dynamic prompts, we "freeze" a copy of this prompt into the task run config so we don't accidentially invalidate evals if the user changes something that impacts the prompt (example: chanding data for multi-shot, or chanding task for basic-prompt) # We then point the task_run_config.run_properties.prompt_id to this new frozen prompt - froze_prompt = True prompt_builder = prompt_builder_from_id(request.prompt_id, task) prompt_name = generate_memorable_name() - prompt = BasePrompt( + frozen_prompt = BasePrompt( name=prompt_name, long_name=prompt_name + " (frozen prompt from '" @@ -205,9 +203,9 @@ 
async def create_task_run_config( model_provider_name=request.model_provider_name, prompt_id=request.prompt_id, ), - prompt=prompt, + prompt=frozen_prompt, ) - if froze_prompt: + if frozen_prompt is not None: # Set after, because the ID isn't known until the TaskRunConfig is created task_run_config.run_config_properties.prompt_id = ( f"task_run_config::{parent_project.id}::{task.id}::{task_run_config.id}" diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index adbf3690..d6b53df5 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -215,7 +215,9 @@ async def test_create_evaluator( @pytest.mark.asyncio -async def test_create_task_run_config(client, mock_task_from_id, mock_task): +async def test_create_task_run_config_with_freezing( + client, mock_task_from_id, mock_task +): mock_task_from_id.return_value = mock_task with ( diff --git a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py index 62dc57a2..8c333f22 100644 --- a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py +++ b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py @@ -67,10 +67,6 @@ def mock_eval_config(mock_eval, data_source): name="test", model=data_source, parent=mock_eval, - prompt=BasePrompt( - name="test", - prompt="test", - ), properties={ "eval_steps": ["step1", "step2", "step3"], }, @@ -89,10 +85,7 @@ def mock_run_config( run_config_properties=RunConfigProperties( model_name="gpt-4", model_provider_name="openai", - prompt=BasePrompt( - name="test", - prompt="test", - ), + prompt_id="simple_prompt_builder", ), parent=mock_task, ) @@ -237,10 +230,7 @@ def test_collect_tasks_multiple_run_configs( run_config_properties=RunConfigProperties( model_name="gpt-3.5", model_provider_name="openai", - prompt=BasePrompt( - name="test", - prompt="test", - ), + prompt_id="simple_prompt_builder", ), parent=mock_task, ) diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval.py b/libs/core/kiln_ai/adapters/eval/test_g_eval.py index 815e9457..3e21fda4 100644 --- a/libs/core/kiln_ai/adapters/eval/test_g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval.py @@ -102,10 +102,7 @@ def test_run_config(test_task): model_name="llama_3_1_8b", model_provider_name="groq", task=test_task, - prompt=BasePrompt( - name="test", - prompt="test", - ), + prompt_id="simple_prompt_builder", ) diff --git a/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py b/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py index e9896c69..79d9906e 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py +++ b/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py @@ -84,11 +84,9 @@ def __init__( task=kiln_task, model_name=model_name, model_provider_name=provider, + prompt_id=prompt_id or datamodel.PromptGenerators.SIMPLE, ) - if prompt_id is not None: - run_config.prompt_id = prompt_id - super().__init__( run_config=run_config, tags=tags, diff --git a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py index d5edcba5..94ec18d5 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py @@ -20,7 +20,7 @@ OpenAICompatibleConfig, ) from kiln_ai.adapters.parsers.json_parser import parse_json_string -from kiln_ai.datamodel import PromptId +from kiln_ai.datamodel import PromptGenerators, 
PromptId from kiln_ai.datamodel.task import RunConfig from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error @@ -45,11 +45,9 @@ def __init__( task=kiln_task, model_name=config.model_name, model_provider_name=config.provider_name, + prompt_id=prompt_id or PromptGenerators.SIMPLE, ) - if prompt_id is not None: - run_config.prompt_id = prompt_id - super().__init__( run_config=run_config, tags=tags, diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py index 3628fc72..8160294b 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py @@ -37,6 +37,7 @@ def adapter(base_task): task=base_task, model_name="test_model", model_provider_name="test_provider", + prompt_id="simple_prompt_builder", ), ) @@ -84,7 +85,10 @@ async def test_model_provider_missing_names(base_task): # Test with missing model name adapter = MockAdapter( run_config=RunConfig( - task=base_task, model_name="", model_provider_name="test_provider" + task=base_task, + model_name="", + model_provider_name="", + prompt_id="simple_prompt_builder", ), ) with pytest.raises( @@ -95,7 +99,10 @@ async def test_model_provider_missing_names(base_task): # Test with missing provider name adapter = MockAdapter( run_config=RunConfig( - task=base_task, model_name="test_model", model_provider_name="" + task=base_task, + model_name="test_model", + model_provider_name="", + prompt_id="simple_prompt_builder", ), ) with pytest.raises( diff --git a/libs/core/kiln_ai/adapters/test_prompt_builders.py b/libs/core/kiln_ai/adapters/test_prompt_builders.py index 43674375..d95bc7d8 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_builders.py +++ b/libs/core/kiln_ai/adapters/test_prompt_builders.py @@ -28,14 +28,12 @@ FinetuneDataStrategy, Project, Prompt, - PromptGenerators, - PromptId, Task, TaskOutput, TaskOutputRating, TaskRun, ) -from kiln_ai.datamodel.task import RunConfigProperties, Task, TaskRunConfig +from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig def test_simple_prompt_builder(tmp_path): diff --git a/libs/core/kiln_ai/datamodel/task.py b/libs/core/kiln_ai/datamodel/task.py index af0bfb6d..87c63b8c 100644 --- a/libs/core/kiln_ai/datamodel/task.py +++ b/libs/core/kiln_ai/datamodel/task.py @@ -53,7 +53,6 @@ class RunConfigProperties(BaseModel): ) prompt_id: PromptId = Field( description="The prompt to use for this run config. 
Defaults to building a simple prompt from the task if not provided.", - default=PromptGenerators.SIMPLE, ) diff --git a/libs/core/kiln_ai/datamodel/test_basemodel.py b/libs/core/kiln_ai/datamodel/test_basemodel.py index d93de053..de33f2df 100644 --- a/libs/core/kiln_ai/datamodel/test_basemodel.py +++ b/libs/core/kiln_ai/datamodel/test_basemodel.py @@ -501,6 +501,7 @@ def adapter(base_task): task=base_task, model_name="test_model", model_provider_name="test_provider", + prompt_id="simple_prompt_builder", ), ) diff --git a/libs/core/kiln_ai/datamodel/test_task.py b/libs/core/kiln_ai/datamodel/test_task.py index 333ef733..b60bd51e 100644 --- a/libs/core/kiln_ai/datamodel/test_task.py +++ b/libs/core/kiln_ai/datamodel/test_task.py @@ -8,7 +8,12 @@ def test_runconfig_valid_creation(): task = Task(id="task1", name="Test Task", instruction="Do something") - config = RunConfig(task=task, model_name="gpt-4", model_provider_name="openai") + config = RunConfig( + task=task, + model_name="gpt-4", + model_provider_name="openai", + prompt_id=PromptGenerators.SIMPLE, + ) assert config.task == task assert config.model_name == "gpt-4" @@ -21,10 +26,13 @@ def test_runconfig_missing_required_fields(): RunConfig() errors = exc_info.value.errors() - assert len(errors) == 3 # task, model_name, and model_provider_name are required + assert ( + len(errors) == 4 + ) # task, model_name, model_provider_name, and prompt_id are required assert any(error["loc"][0] == "task" for error in errors) assert any(error["loc"][0] == "model_name" for error in errors) assert any(error["loc"][0] == "model_provider_name" for error in errors) + assert any(error["loc"][0] == "prompt_id" for error in errors) def test_runconfig_custom_prompt_id(): @@ -47,7 +55,11 @@ def sample_task(): @pytest.fixture def sample_run_config_props(sample_task): - return RunConfigProperties(model_name="gpt-4", model_provider_name="openai") + return RunConfigProperties( + model_name="gpt-4", + model_provider_name="openai", + prompt_id=PromptGenerators.SIMPLE, + ) def test_task_run_config_valid_creation(sample_task, sample_run_config_props): From a46b94224c957688762a55d15aac13339ec7eedd Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 23 Feb 2025 16:24:22 -0500 Subject: [PATCH 053/102] improve comment --- libs/core/kiln_ai/datamodel/task.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/core/kiln_ai/datamodel/task.py b/libs/core/kiln_ai/datamodel/task.py index 87c63b8c..29f72e4e 100644 --- a/libs/core/kiln_ai/datamodel/task.py +++ b/libs/core/kiln_ai/datamodel/task.py @@ -84,9 +84,9 @@ class TaskRunConfig(KilnParentedModel): run_config_properties: RunConfigProperties = Field( description="The run config properties to use for this task run." ) - # We usually want to persist the exact prompt, not just a prompt ID. - # We want the prompt to be perfectly consistent, and some prompt_ids are dynamic. - # The prompt ID in the run_config_properties likely points to this (although it's not required). + # The prompt_id in the run_config_properties is the prompt ID to use for this task run. + # However, we want the prompt to be perfectly consistent, and some prompt_ids are dynamic. + # If we need to "freeze" a prompt, we can do so here (then point the prompt_id to this frozen prompt). 
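    # Editorial note, not part of this patch: a condensed sketch of how the eval
    # API (patch 052 above) freezes a dynamic prompt into a TaskRunConfig, using
    # only names that appear in that patch; details are elided with "...".
    #
    #   frozen_prompt: BasePrompt | None = None
    #   if not is_frozen_prompt(request.prompt_id):
    #       prompt_builder = prompt_builder_from_id(request.prompt_id, task)
    #       frozen_prompt = BasePrompt(name=generate_memorable_name(), ...)
    #   task_run_config = TaskRunConfig(..., prompt=frozen_prompt)
    #   if frozen_prompt is not None:
    #       task_run_config.run_config_properties.prompt_id = (
    #           f"task_run_config::{parent_project.id}::{task.id}::{task_run_config.id}"
    #       )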
prompt: BasePrompt | None = Field( default=None, description="A prompt to use for run config.", From 3f21c3610409e718c3300313b9fafd110177181c Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 23 Feb 2025 17:03:17 -0500 Subject: [PATCH 054/102] Okay: fix the root of my 2 prompt issue - Want the evaluator to have some context on what the goal is. - Don't want to give it the prompt, as we're testing prompts, so it's biasing the evaluator - Instead, give a short task-desription, which is locked across the eval_config, so no bias for a given prompt. --- app/desktop/studio_server/eval_api.py | 1 - .../[task_id]/[eval_id]/+page.svelte | 7 +++ .../[eval_id]/create_eval_config/+page.svelte | 43 ++++++++++++------ libs/core/kiln_ai/adapters/eval/g_eval.py | 30 ++++--------- .../core/kiln_ai/adapters/eval/test_g_eval.py | 45 ++++++++++++++++++- libs/core/kiln_ai/datamodel/eval.py | 6 +++ .../core/kiln_ai/datamodel/test_eval_model.py | 8 ++++ 7 files changed, 103 insertions(+), 37 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index dce33a6f..932c489f 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -84,7 +84,6 @@ class CreateEvalConfigRequest(BaseModel): properties: dict[str, Any] model_name: str provider: ModelProviderName - prompt_id: PromptId class CreateTaskRunConfigRequest(BaseModel): diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index 69d8746e..577bd06f 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -316,6 +316,13 @@ eval_config.model.properties["model_provider"] + "", ), }) + const task_description = eval_config.properties["task_description"] + if (task_description) { + properties.push({ + name: "Task Description", + value: task_description, + }) + } return properties } diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte index 7cd38d1f..0e2fc742 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte @@ -8,15 +8,14 @@ import { KilnError, createKilnError } from "$lib/utils/error_handlers" import { onMount } from "svelte" import Warning from "$lib/ui/warning.svelte" - import PromptTypeSelector from "../../../../../run/prompt_type_selector.svelte" import AvailableModelsDropdown from "../../../../../run/available_models_dropdown.svelte" import type { Eval, EvalTemplate, Task, EvalConfigType } from "$lib/types" import { tick } from "svelte" import { load_task } from "$lib/stores" import { goto } from "$app/navigation" - let prompt_method = "simple_prompt_builder" let model: string | undefined = undefined + let task_description: string = "" let eval_steps: string[] = [] type EvalTemplateWithoutKiln = Exclude @@ -175,9 +174,6 @@ if (!model_name || !provider) { throw new Error("No model selected") } - if (!prompt_method) { - throw new Error("No prompt method selected") - } create_evaluator_loading = true const { data, error } = await client.POST( @@ -195,10 +191,11 @@ model_name: model_name, // @ts-expect-error provider is 
not typed, but server will validate provider: provider, - prompt_id: prompt_method, properties: { - // @ts-expect-error eval_steps is not typed, but server will validate + // @ts-expect-error properties are not typed, but server will validate eval_steps: eval_steps, + // @ts-expect-error properties are not typed, but server will validate + task_description: task_description, }, }, }, @@ -238,7 +235,7 @@
{:else}
- Part 2: Select Prompt and Model + Step 2: Select Eval Model
- Specify which prompt and model will be used to run the eval. + Specify which model will be used to evaluate the results. This is + not necessarily the model that will be used to run the task.
- - {/if} - {#if selected_algo && model && prompt_method} + {#if selected_algo && model} +
+
+ Step 3: Task Description +
+
+
+ Include a short description of what this task does for the + evaluator to use as context. +
+
+
+ +
- Part 3: Evaluation Instructions + Step 4: Evaluation Instructions
This is a list of instructions to be used by the evaluator's diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py index 75ffed12..eaa34b67 100644 --- a/libs/core/kiln_ai/adapters/eval/g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -30,31 +30,19 @@ class GEvalTask(Task, parent_of={}): Note G-Eval implements both G-Eval and LLM as Judge as they are very similar. """ - def __init__(self, eval_config: EvalConfig, run_config: RunConfig): + def __init__(self, eval_config: EvalConfig): tmp_project = Project(name="GEval") - eval = eval_config.parent_eval() - if not eval: - raise ValueError("Eval config must have a parent eval") - task = eval.parent_task() - if not task: - raise ValueError("Eval must have a parent task") - - prompt_builder = prompt_builder_from_id(run_config.prompt_id, task) - base_prompt = prompt_builder.build_base_prompt() - - system_instruction = f""" -Your job to evaluate a model's performance on a task. Blocks will be marked with tags. - -The task the model was given is as follows: - -{base_prompt} - -""" + # Build a simple LLM as Judge system instruction + system_instruction = f"Your job to evaluate a model's performance on a task. Blocks will be marked with tags.\n" + # Optionally add a short task description + task_description = eval_config.properties.get("task_description", None) + if task_description: + system_instruction += f"\nThe task the model was given is as follows:\n\n{task_description}\n\n" # Build the COT eval instructions cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n" - steps = eval_config.properties["eval_steps"] + steps = eval_config.properties.get("eval_steps", None) if not steps or not isinstance(steps, list): raise ValueError("eval_steps must be a list") for i, step in enumerate(steps): @@ -98,7 +86,7 @@ def __init__(self, eval_config: EvalConfig, run_config: RunConfig): super().__init__(eval_config, run_config) - self.geval_task = GEvalTask(eval_config, run_config) + self.geval_task = GEvalTask(eval_config) async def run_eval(self, task_run: TaskRun) -> EvalScores: """ diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval.py b/libs/core/kiln_ai/adapters/eval/test_g_eval.py index 3e21fda4..a0003f53 100644 --- a/libs/core/kiln_ai/adapters/eval/test_g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval.py @@ -2,7 +2,7 @@ import pickle import pytest -from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP, GEval +from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP, GEval, GEvalTask from kiln_ai.adapters.eval.test_g_eval_data import serialized_run_output from kiln_ai.adapters.model_adapters.base_adapter import RunOutput from kiln_ai.datamodel import ( @@ -402,3 +402,46 @@ def __init__(self, token, top_logprobs): token_logprob = MockTokenLogprob("5", []) with pytest.raises(RuntimeError, match="No valid scoring tokens found"): g_eval.rating_token_to_score(token_logprob) + + +def test_g_eval_system_instruction(): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore(name="overall_rating", type=TaskOutputRatingType.five_star), + ], + ) + eval_config = EvalConfig( + parent=eval, + name="Test Eval", + model=DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt_4o_mini", + "model_provider": "openai", + "adapter_name": "openai_compatible", + }, + ), + config_type=EvalConfigType.g_eval, + properties={ + 
"task_description": "Test task description", + "eval_steps": ["Step 1", "Step 2"], + }, + ) + g_eval_task = GEvalTask(eval_config) + assert g_eval_task.instruction == ( + "Your job to evaluate a model's performance on a task. Blocks will be marked with tags.\n\n" + "The task the model was given is as follows:\n\n" + "Test task description\n" + "\n" + ) + + # Test without task description + eval_config.properties = {"eval_steps": ["Step 1", "Step 2"]} + g_eval_task = GEvalTask(eval_config) + assert ( + g_eval_task.instruction + == "Your job to evaluate a model's performance on a task. Blocks will be marked with tags.\n" + ) diff --git a/libs/core/kiln_ai/datamodel/eval.py b/libs/core/kiln_ai/datamodel/eval.py index 6cfcc612..84540324 100644 --- a/libs/core/kiln_ai/datamodel/eval.py +++ b/libs/core/kiln_ai/datamodel/eval.py @@ -200,6 +200,12 @@ def validate_properties(self) -> Self: self.properties["eval_steps"], list ): raise ValueError("eval_steps is required and must be a list for g_eval") + if "task_description" in self.properties and not isinstance( + self.properties["task_description"], str + ): + raise ValueError( + "task_description is optional, but if provided must be a string" + ) return self else: raise ValueError(f"Invalid eval config type: {self.config_type}") diff --git a/libs/core/kiln_ai/datamodel/test_eval_model.py b/libs/core/kiln_ai/datamodel/test_eval_model.py index 911e9272..c75ac1a1 100644 --- a/libs/core/kiln_ai/datamodel/test_eval_model.py +++ b/libs/core/kiln_ai/datamodel/test_eval_model.py @@ -69,6 +69,14 @@ def test_eval_config_missing_eval_steps(valid_eval_config): valid_eval_config.properties = {} +def test_eval_config_missing_task_description(valid_eval_config): + with pytest.raises( + ValueError, + match="task_description is optional, but if provided must be a string", + ): + valid_eval_config.properties = {"task_description": 123, "eval_steps": []} + + def test_eval_config_invalid_json(valid_eval_config): class InvalidClass: pass From 0af1cdfbd6cf90630f9f84d7eeaa860e7109971c Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 23 Feb 2025 17:18:17 -0500 Subject: [PATCH 055/102] UI improvements --- .../[task_id]/[eval_id]/+page.svelte | 34 ++++++++++++++----- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index 577bd06f..62a1a647 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -28,6 +28,7 @@ import PromptTypeSelector from "../../../../run/prompt_type_selector.svelte" import Warning from "$lib/ui/warning.svelte" import { string_to_json_key } from "$lib/utils/json_schema_editor/json_schema_templates" + import InfoTooltip from "$lib/ui/info_tooltip.svelte" $: project_id = $page.params.project_id $: task_id = $page.params.task_id @@ -617,15 +618,30 @@ {#each evaluator.output_scores as output_score}
{/each} From 7d3cccb4ef6709da23aaba0f66ccc79281eb3748 Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 23 Feb 2025 19:30:47 -0500 Subject: [PATCH 056/102] Add R1 and Llama 3.1 70B to g_eval support. Add tests. Note: ran a lot more, these are the only ones that work. Fireworks only returns 5 logprobs (not enough). Ollama doesn't support logprobs. Amazon could work, but can do that later. Note: slightly ugly provider specific code leaking into the OAI compaible adapter. Okay for now but should limit this. --- .../core/kiln_ai/adapters/eval/test_g_eval.py | 69 +++++++++++++++++-- libs/core/kiln_ai/adapters/ml_model_list.py | 2 + .../model_adapters/openai_model_adapter.py | 22 ++++-- .../test_openai_model_adapter.py | 29 +++++++- 4 files changed, 108 insertions(+), 14 deletions(-) diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval.py b/libs/core/kiln_ai/adapters/eval/test_g_eval.py index a0003f53..e5e81abe 100644 --- a/libs/core/kiln_ai/adapters/eval/test_g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval.py @@ -4,7 +4,9 @@ import pytest from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP, GEval, GEvalTask from kiln_ai.adapters.eval.test_g_eval_data import serialized_run_output +from kiln_ai.adapters.ml_model_list import built_in_models from kiln_ai.adapters.model_adapters.base_adapter import RunOutput +from kiln_ai.adapters.test_prompt_adaptors import get_all_models_and_providers from kiln_ai.datamodel import ( BasePrompt, DataSource, @@ -130,15 +132,20 @@ def test_task_run(test_task): return task_run -@pytest.mark.parametrize( - "config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge] -) -@pytest.mark.paid -async def test_run_g_eval( - test_task, test_eval_config, test_task_run, config_type, test_run_config +async def run_g_eval_test( + test_task, + test_eval_config, + test_task_run, + config_type, + test_run_config, + model_name: str | None = None, + provider_name: str | None = None, ): # Create G-Eval instance test_eval_config.config_type = config_type + if model_name is not None and provider_name is not None: + test_eval_config.model.properties["model_name"] = model_name + test_eval_config.model.properties["model_provider"] = provider_name g_eval = GEval(test_eval_config, test_run_config) # Run the evaluation @@ -160,6 +167,18 @@ async def test_run_g_eval( assert 1.0 <= overall <= 5.0 +@pytest.mark.parametrize( + "config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge] +) +@pytest.mark.paid +async def test_run_g_eval( + test_task, test_eval_config, test_task_run, config_type, test_run_config +): + await run_g_eval_test( + test_task, test_eval_config, test_task_run, config_type, test_run_config + ) + + @pytest.mark.parametrize( "config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge] ) @@ -445,3 +464,41 @@ def test_g_eval_system_instruction(): g_eval_task.instruction == "Your job to evaluate a model's performance on a task. 
Blocks will be marked with tags.\n" ) + + +def check_supports_logprobs(model_name: str, provider_name: str): + for model in built_in_models: + if model.name != model_name: + continue + for provider in model.providers: + if provider.name != provider_name: + continue + if not provider.supports_logprobs: + pytest.skip( + f"Skipping {model.name} {provider.name} because it does not support logprobs" + ) + return + raise RuntimeError(f"No model {model_name} {provider_name} found") + + +@pytest.mark.paid +@pytest.mark.ollama +@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers()) +async def test_all_built_in_models_logprobs_geval( + model_name, + provider_name, + test_task, + test_eval_config, + test_task_run, + test_run_config, +): + check_supports_logprobs(model_name, provider_name) + await run_g_eval_test( + test_task, + test_eval_config, + test_task_run, + EvalConfigType.g_eval, + test_run_config, + model_name, + provider_name, + ) diff --git a/libs/core/kiln_ai/adapters/ml_model_list.py b/libs/core/kiln_ai/adapters/ml_model_list.py index 97682cad..3e256e4a 100644 --- a/libs/core/kiln_ai/adapters/ml_model_list.py +++ b/libs/core/kiln_ai/adapters/ml_model_list.py @@ -245,6 +245,7 @@ class KilnModel(BaseModel): # No custom parser -- openrouter implemented it themselves structured_output_mode=StructuredOutputMode.json_instructions, reasoning_capable=True, + supports_logprobs=True, ), KilnModelProvider( name=ModelProviderName.fireworks_ai, @@ -393,6 +394,7 @@ class KilnModel(BaseModel): supports_data_gen=False, structured_output_mode=StructuredOutputMode.function_calling, provider_options={"model": "meta-llama/llama-3.1-70b-instruct"}, + supports_logprobs=True, ), KilnModelProvider( name=ModelProviderName.ollama, diff --git a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py index 94ec18d5..909146c9 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py @@ -9,7 +9,7 @@ ) import kiln_ai.datamodel as datamodel -from kiln_ai.adapters.ml_model_list import StructuredOutputMode +from kiln_ai.adapters.ml_model_list import ModelProviderName, StructuredOutputMode from kiln_ai.adapters.model_adapters.base_adapter import ( COT_FINAL_ANSWER_PROMPT, AdapterConfig, @@ -115,6 +115,12 @@ async def _run(self, input: Dict | str) -> RunOutput: # fp8 quants are awful "ignore": ["DeepInfra"], } + elif self.model_provider().name == ModelProviderName.openrouter: + # OpenRouter specific options. Bit of a hack but really does improve usability. 
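            # Editorial note, not part of this patch: G-Eval scoring needs token
            # logprobs, so this asks OpenRouter to route only to providers that
            # support every request parameter (the commit notes that Fireworks
            # returns too few logprobs and Ollama returns none). Assuming
            # TOKEN_TO_SCORE_MAP maps rating tokens to numeric scores, the final
            # rating is roughly a probability-weighted mean over the top
            # logprobs of the rating token:
            #
            #   weights = {t: math.exp(lp) for t, lp in top_logprobs if t in TOKEN_TO_SCORE_MAP}
            #   score = sum(TOKEN_TO_SCORE_MAP[t] * w for t, w in weights.items()) / sum(weights.values())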
+ extra_body["provider"] = { + "require_parameters": True, + "ignore": ["DeepInfra"], + } # Main completion call response_format_options = await self.response_format_options() @@ -235,15 +241,19 @@ def tool_call_params(self) -> dict[str, Any]: ) output_schema["additionalProperties"] = False + function_params = { + "name": "task_response", + "parameters": output_schema, + } + # This parameter is only reliable for OpenAI + if self.model_provider().name == ModelProviderName.openai: + function_params["strict"] = True + return { "tools": [ { "type": "function", - "function": { - "name": "task_response", - "parameters": output_schema, - "strict": True, - }, + "function": function_params, } ], "tool_choice": { diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py index b481f807..3232da2b 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py @@ -43,7 +43,7 @@ def config(): api_key="test_key", base_url="https://api.test.com", model_name="test-model", - provider_name="test-provider", + provider_name="openrouter", default_headers={"X-Test": "test"}, ) @@ -166,7 +166,32 @@ async def test_response_format_options_json_schema(config, mock_task): } -def test_tool_call_params(config, mock_task): +def test_tool_call_params_non_openai(config, mock_task): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) + + params = adapter.tool_call_params() + expected_schema = mock_task.output_schema() + expected_schema["additionalProperties"] = False + + assert params == { + "tools": [ + { + "type": "function", + "function": { + "name": "task_response", + "parameters": expected_schema, + }, + } + ], + "tool_choice": { + "type": "function", + "function": {"name": "task_response"}, + }, + } + + +def test_tool_call_params_openai(config, mock_task): + config.provider_name = "openai" adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) params = adapter.tool_call_params() From f0d41444431a9d9da1a3b34b7c5872cbafbc07ff Mon Sep 17 00:00:00 2001 From: scosman Date: Sun, 23 Feb 2025 21:32:09 -0500 Subject: [PATCH 057/102] New UI: detailed results screen --- app/desktop/studio_server/eval_api.py | 35 ++- app/desktop/studio_server/test_eval_api.py | 64 ++++++ app/web_ui/src/lib/api_schema.d.ts | 149 ++++++++++++- app/web_ui/src/lib/types.ts | 1 + .../[task_id]/[eval_id]/+page.svelte | 34 ++- .../[run_config_id]/run_result/+page.svelte | 201 ++++++++++++++++++ .../[run_config_id]/run_result/+page.ts | 1 + 7 files changed, 469 insertions(+), 16 deletions(-) create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.svelte create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.ts diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index 932c489f..97226ba4 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -1,5 +1,5 @@ import json -from typing import Any, Dict, Set +from typing import Any, Dict, List, Set from fastapi import FastAPI, HTTPException, Query from fastapi.responses import StreamingResponse @@ -20,6 +20,7 @@ EvalConfig, EvalConfigType, EvalOutputScore, + EvalRun, EvalTemplate, ) from kiln_ai.datamodel.prompt_id import is_frozen_prompt @@ -102,6 
+103,13 @@ class ScoreSummary(BaseModel): mean_score: float +class EvalRunResult(BaseModel): + results: List[EvalRun] + eval: Eval + eval_config: EvalConfig + run_config: TaskRunConfig + + class EvalResultSummary(BaseModel): # run_config_id -> output_score_id -> ScoreSummary results: Dict[str, Dict[str, ScoreSummary]] @@ -293,6 +301,31 @@ async def event_generator(): media_type="text/event-stream", ) + @app.get( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_config/{run_config_id}/results" + ) + async def get_eval_run_results( + project_id: str, + task_id: str, + eval_id: str, + eval_config_id: str, + run_config_id: str, + ) -> EvalRunResult: + eval = eval_from_id(project_id, task_id, eval_id) + eval_config = eval_config_from_id(project_id, task_id, eval_id, eval_config_id) + run_config = task_run_config_from_id(project_id, task_id, run_config_id) + results = [ + run_result + for run_result in eval_config.runs(readonly=True) + if run_result.task_run_config_id == run_config_id + ] + return EvalRunResult( + results=results, + eval=eval, + eval_config=eval_config, + run_config=run_config, + ) + @app.get( "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/score_summary" ) diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index d6b53df5..93eda512 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -614,3 +614,67 @@ async def test_get_eval_config_score_summary( ) mock_eval_config_for_score_summary.runs.assert_called_once_with(readonly=True) mock_dataset_ids_in_filter.assert_called_once_with(mock_task, "tag::eval_set") + + +@pytest.mark.asyncio +async def test_get_eval_run_results( + client, + mock_task_from_id, + mock_task, + mock_eval, + mock_eval_config, + mock_run_config, +): + mock_task_from_id.return_value = mock_task + + eval_run = EvalRun( + task_run_config_id="run_config1", + scores={"score1": 3.0}, + input="input", + output="output", + dataset_id="dataset_id1", + parent=mock_eval_config, + ) + eval_run.save_to_file() + + # Test successful retrieval + response = client.get( + f"/api/projects/project1/tasks/task1/eval/eval1" + f"/eval_config/eval_config1/run_config/run_config1/results" + ) + + assert response.status_code == 200 + data = response.json() + + # Verify response structure + assert "results" in data + assert "eval" in data + assert "eval_config" in data + assert "run_config" in data + + # Verify results content + assert len(data["results"]) == 1 + assert data["results"][0]["id"] == eval_run.id + assert data["results"][0]["task_run_config_id"] == mock_run_config.id + assert data["results"][0]["scores"] == {"score1": 3.0} + + # Test with invalid eval ID + response = client.get( + f"/api/projects/project1/tasks/task1/eval/invalid_eval" + f"/eval_config/eval_config1/run_config/run_config1/results" + ) + assert response.status_code == 404 + + # Test with invalid eval config ID + response = client.get( + f"/api/projects/project1/tasks/task1/eval/eval1" + f"/eval_config/invalid_config/run_config/run_config1/results" + ) + assert response.status_code == 404 + + # Test with invalid run config ID + response = client.get( + f"/api/projects/project1/tasks/task1/eval/eval1" + f"/eval_config/eval_config1/run_config/invalid_run_config/results" + ) + assert response.status_code == 404 diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index 3eb9417b..c97cd519 100644 --- 
a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -793,6 +793,23 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_config/{run_config_id}/results": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Eval Run Results */ + get: operations["get_eval_run_results_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_config__run_config_id__results_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/score_summary": { parameters: { query?: never; @@ -922,8 +939,6 @@ export interface components { /** Model Name */ model_name: string; provider: components["schemas"]["ModelProviderName"]; - /** Prompt Id */ - prompt_id: string; }; /** CreateEvaluatorRequest */ CreateEvaluatorRequest: { @@ -1338,6 +1353,65 @@ export interface components { /** Dataset Size */ dataset_size: number; }; + /** + * EvalRun + * @description The results of running an eval on a single dataset item, with a specific TaskRunConfig and EvalConfig. + */ + EvalRun: { + /** + * V + * @default 1 + */ + v: number; + /** Id */ + id?: string | null; + /** Path */ + path?: string | null; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** Created By */ + created_by?: string; + /** + * Dataset Id + * @description The ID of the dataset item that was used for this run (we only use it's input). Must belong to the same Task as this eval. + */ + dataset_id: string | null; + /** + * Task Run Config Id + * @description The ID of the TaskRunConfig that was run. Must belong to the same Task as this eval. + */ + task_run_config_id: string | null; + /** + * Input + * @description The input to the task. JSON formatted for structured input, plaintext for unstructured input. + */ + input: string; + /** + * Output + * @description The output of the task. JSON formatted for structured output, plaintext for unstructured output. + */ + output: string; + /** + * Scores + * @description The scores of the evaluator (specifically the EvalConfig this object is a child of). + */ + scores: { + [key: string]: number; + }; + /** Model Type */ + readonly model_type: string; + }; + /** EvalRunResult */ + EvalRunResult: { + /** Results */ + results: components["schemas"]["EvalRun"][]; + eval: components["schemas"]["Eval"]; + eval_config: components["schemas"]["EvalConfig"]; + run_config: components["schemas"]["TaskRunConfig"]; + }; /** * EvalState * @enum {string} @@ -1547,7 +1621,36 @@ export interface components { * created_at (datetime): Timestamp when the model was created * created_by (str): User ID of the creator */ - KilnBaseModel: { + "KilnBaseModel-Input": { + /** + * V + * @default 1 + */ + v: number; + /** Id */ + id?: string | null; + /** Path */ + path?: string | null; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** Created By */ + created_by?: string; + }; + /** + * KilnBaseModel + * @description Base model for all Kiln data models with common functionality for persistence and versioning. 
+ * + * Attributes: + * v (int): Schema version number for migration support + * id (str): Unique identifier for the model instance + * path (Path): File system path where the model is stored + * created_at (datetime): Timestamp when the model was created + * created_by (str): User ID of the creator + */ + "KilnBaseModel-Output": { /** * V * @default 1 @@ -1564,6 +1667,8 @@ export interface components { created_at?: string; /** Created By */ created_by?: string; + /** Model Type */ + readonly model_type: string; }; /** ModelDetails */ ModelDetails: { @@ -1841,7 +1946,6 @@ export interface components { /** * Prompt Id * @description The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided. - * @default simple_prompt_builder */ prompt_id: string; }; @@ -2173,7 +2277,7 @@ export interface components { created_at?: string; /** Created By */ created_by?: string; - parent?: components["schemas"]["KilnBaseModel"] | null; + parent?: components["schemas"]["KilnBaseModel-Input"] | null; /** * Input * @description The inputs to the task. JSON formatted for structured input, plaintext for unstructured input. @@ -4037,6 +4141,41 @@ export interface operations { }; }; }; + get_eval_run_results_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_config__run_config_id__results_get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + eval_id: string; + eval_config_id: string; + run_config_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["EvalRunResult"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; get_eval_config_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__score_summary_get: { parameters: { query?: never; diff --git a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts index 7da878dd..2739bb6b 100644 --- a/app/web_ui/src/lib/types.ts +++ b/app/web_ui/src/lib/types.ts @@ -26,3 +26,4 @@ export type EvalConfigType = components["schemas"]["EvalConfigType"] export type EvalConfig = components["schemas"]["EvalConfig"] export type TaskRunConfig = components["schemas"]["TaskRunConfig"] export type EvalResultSummary = components["schemas"]["EvalResultSummary"] +export type EvalRunResult = components["schemas"]["EvalRunResult"] diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index 62a1a647..7d7858c3 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -282,6 +282,10 @@ return properties } + $: current_eval_config = eval_configs?.find( + (config) => config.id === current_eval_config_id, + ) + function get_eval_config_properties( eval_config_id: string | null, model_info: ProviderModels | null, @@ -500,7 +504,7 @@ {:else if evaluator}
-
Properties
+
Evaluator Properties
@@ -514,7 +518,10 @@
-
Config
+
Evaluator Config
+
+ How the task outputs will be evaluated. +
{/each} -
Config Quality
+
Quality
@@ -547,10 +554,12 @@ {#if task_run_configs?.length}
-
Results
+
Results Summary
- Filtered by the selected eval config. Rows are grouped by task run - config. + Overview of how various task run configs perform on the selected + evaluator{current_eval_config + ? ` (${current_eval_config.name})` + : ""}.
{#if score_summary_error}
@@ -614,11 +623,14 @@
Name Description Eval NameDescriptionCreated
{evaluator.name} {evaluator.description} {formatDate(evaluator.created_at)}
{output_score.name} - {#if output_score.type === "five_star"} - (1 to 5) - {:else if output_score.type === "pass_fail"} - (0 to 1) - {:else if output_score.type === "pass_fail_critical"} - (-1 to 1) - {:else} - ({output_score.type}) - {/if} +
+ {#if output_score.type === "five_star"} + 1 to 5 + + + + {:else if output_score.type === "pass_fail"} + pass/fail + + + + {:else if output_score.type === "pass_fail_critical"} + pass/fail/critical + + + + {:else} + ({output_score.type}) + {/if} +
- + {#each evaluator.output_scores as output_score} @@ -655,7 +667,9 @@ { - console.log("TODO: link") + goto( + `/evals/${project_id}/${task_id}/${eval_id}/${current_eval_config_id}/${task_run_config.id}/run_result`, + ) }} > {#each evaluator.output_scores as output_score} - {@const score = null} - {/each} @@ -459,77 +427,40 @@ -
- {#if eval_state === "complete"} -
Eval Complete 🎉
- {#if eval_total_count == 0} -
- No evals were run, because everything was already up to date! -
- {/if} - {:else if eval_state === "complete_with_errors"} -
Eval Complete with Errors
- {:else if eval_state === "running"} -
-
Running...
- {/if} -
- {#if eval_total_count > 0} -
- {eval_complete_count + eval_error_count} of {eval_total_count} -
- {/if} - {#if eval_error_count > 0} -
- {eval_error_count} error{eval_error_count === 1 ? "" : "s"} -
- {/if} - {#if eval_run_error} -
- {eval_run_error.getMessage() || "An unknown error occurred"} -
- {/if} -
-
+
-
-
Run this eval with the selected configuration?
-
Don't close this page if you want to monitor progress.
- +
MAE: Mean Absolute Error
+
Lower is better
+
+ Example: If the eval scores an item a 3, and the eval scores it a 5, the + absolute error would be 2 [abs(3-5)]. The overall score is the mean of all + absolute errors. +
+
MSE: Mean squared error
+
Lower is better
+
+ Example: If the eval scores an item a 3, and the eval scores it a 5, the + squared error would be 4 [(3-5)^2]. The overall score is the mean of all + squared errors.
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/eval_config_instruction.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/eval_config_instruction.svelte new file mode 100644 index 00000000..d1be4213 --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/eval_config_instruction.svelte @@ -0,0 +1,38 @@ + + +{#if eval_config} + {@const eval_steps = get_eval_steps(eval_config)} + {#if eval_config.properties?.["task_description"]} +
+
Task Description:
+ {eval_config.properties["task_description"]} +
+ {/if} + {#if eval_steps} +
+
Evaluation Steps:
+
    + {#each eval_steps as step} +
  1. + + {step} + +
  2. + {/each} +
+
+ {/if} +{/if} diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/run_eval.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/run_eval.svelte new file mode 100644 index 00000000..d0f9918c --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/run_eval.svelte @@ -0,0 +1,183 @@ + + +{#if eval_state === "not_started"} + +{:else} + +{/if} + + +
+ {#if eval_state === "complete"} +
Eval Complete 🎉
+ {#if eval_total_count == 0} +
+ No evals were run, because everything was already up to date! +
+ {/if} + {:else if eval_state === "complete_with_errors"} +
Eval Complete with Errors
+ {:else if eval_state === "running"} +
+
Running...
+ {/if} +
+ {#if eval_total_count > 0} +
+ {eval_complete_count + eval_error_count} of {eval_total_count} +
+ {/if} + {#if eval_error_count > 0} +
+ {eval_error_count} error{eval_error_count === 1 ? "" : "s"} +
+ {/if} + {#if eval_run_error} +
+ {eval_run_error.getMessage() || "An unknown error occurred"} +
+ {/if} +
+
+
+ + +
+
Run this eval with the selected configuration?
+
Don't close this page if you want to monitor progress.
+ +
+
From 50811b1e8139da8f3ce5c2818a8212676f81419c Mon Sep 17 00:00:00 2001 From: scosman Date: Wed, 26 Feb 2025 00:27:23 -0500 Subject: [PATCH 064/102] - All setting current eval config for an eval through UI - Improve strings/messaging - Allow creating eval configs from /eval_configs with correct redirect - Fix a bug where eval runs without task_run_configs were causing lookup errors. --- app/desktop/studio_server/eval_api.py | 18 +++++ app/desktop/studio_server/test_eval_api.py | 34 ++++++++ app/web_ui/src/lib/api_schema.d.ts | 51 ++++++++++++ .../[eval_id]/create_eval_config/+page.svelte | 13 ++- .../[eval_id]/eval_configs/+page.svelte | 79 ++++++++++++++++--- 5 files changed, 183 insertions(+), 12 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index c0578197..e8423aa7 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -389,6 +389,21 @@ async def run_eval_config( return await run_eval_runner_with_status(eval_runner) + @app.post( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}" + ) + async def set_default_eval_config( + project_id: str, + task_id: str, + eval_id: str, + eval_config_id: str, + ) -> Eval: + eval = eval_from_id(project_id, task_id, eval_id) + eval.current_config_id = eval_config_id + eval.save_to_file() + + return eval + @app.get( "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval" ) @@ -470,6 +485,9 @@ async def get_eval_config_score_summary( # important: readonly makes this much faster for eval_run in eval_config.runs(readonly=True): + if eval_run.task_run_config_id is None: + # This eval_run is not associated with a run_config, so we can't count it + continue run_config_id = str(eval_run.task_run_config_id) # Check if we should count this eval_run. 
Not every eval_run has to go into the stats: diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index d982cdf7..88ceca2d 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -1000,3 +1000,37 @@ async def test_run_eval_config_eval( assert eval_runner.eval_configs[0].id == mock_eval_config.id assert eval_runner.run_configs is None assert eval_runner.eval_run_type == "eval_config_eval" + + +@pytest.mark.asyncio +async def test_set_current_eval_config( + client, mock_task_from_id, mock_task, mock_eval, mock_eval_config +): + """Test setting the current eval config for an evaluation.""" + mock_task_from_id.return_value = mock_task + + # Get the eval before updating to verify the change + response = client.get("/api/projects/project1/tasks/task1/eval/eval1") + assert response.status_code == 200 + eval_before = response.json() + + # The current_config_id might be None or different initially + initial_config_id = eval_before.get("current_config_id") + assert initial_config_id is None + + # Set the current eval config + with patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id: + mock_eval_from_id.return_value = mock_eval + response = client.post( + "/api/projects/project1/tasks/task1/eval/eval1/set_current_eval_config/eval_config1" + ) + assert response.status_code == 200 + updated_eval = response.json() + + # Verify the current_config_id was updated + assert updated_eval["current_config_id"] == "eval_config1" + assert updated_eval["id"] == "eval1" + + # Verify the change persists by fetching the eval again + eval_from_disk = mock_task.evals()[0] + assert eval_from_disk.current_config_id == "eval_config1" diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index 14c403fc..a969bf12 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -810,6 +810,23 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Set Default Eval Config */ + post: operations["set_default_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__set_current_eval_config__eval_config_id__post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval": { parameters: { query?: never; @@ -4260,6 +4277,40 @@ export interface operations { }; }; }; + set_default_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__set_current_eval_config__eval_config_id__post: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + eval_id: string; + eval_config_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["Eval"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; run_eval_config_eval_api_projects__project_id__tasks__task_id__eval__eval_id__run_eval_config_eval_get: { parameters: { query?: never; 
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte index 0e2fc742..1efd78a9 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte @@ -204,9 +204,16 @@ throw error } complete = true - goto( - `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}?selected_eval_config=${data.id}`, - ) + const next_page = $page.url.searchParams.get("next_page") + if (next_page === "eval_configs") { + goto( + `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}/eval_configs`, + ) + } else { + goto( + `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}?selected_eval_config=${data.id}`, + ) + } } catch (e) { create_evaluator_error = createKilnError(e) } finally { diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte index 8af9a2f2..c0c182ad 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte @@ -51,9 +51,10 @@ load_model_info(), load_available_prompts(), load_available_models(), + // Get this first, as we want to know "current" for sorting + get_eval(), ]) // These can be parallel - get_eval() get_eval_config() get_score_summary() }) @@ -102,7 +103,12 @@ if (error) { throw error } - eval_configs = data + // sort with current on top + eval_configs = data.sort((a, b) => { + if (evaluator && a.id === evaluator.current_config_id) return -1 + if (evaluator && b.id === evaluator.current_config_id) return 1 + return 0 + }) } catch (error) { eval_configs_error = createKilnError(error) } finally { @@ -180,17 +186,17 @@ const warnings: string[] = [] if (score_summary.dataset_size === 0) { warnings.push( - "No items in your eval-config dataset. Generate some runs in your dataset tab, and tag them to add them to your eval-config dataset.", + "There are zero items in your config eval dataset. Generate some runs in your dataset tab, and tag them to add them to your eval-config dataset.", ) } if (score_summary.not_rated_count > 0) { warnings.push( - `${score_summary.not_rated_count} item(s) in your eval-config dataset are not rated at all. Add human ratings to these items in the dataset tab.`, + `${score_summary.not_rated_count} item(s) in your config eval dataset are not rated at all. Add human ratings to these items in the dataset tab.`, ) } if (score_summary.partially_rated_count > 0) { warnings.push( - `${score_summary.partially_rated_count} item(s) in your eval-config dataset are only partially rated. Add human ratings to these items in the dataset tab for each score.`, + `${score_summary.partially_rated_count} item(s) in your config eval dataset are only partially rated. 
Add human ratings to these items for every score.`, ) } @@ -209,11 +215,47 @@ return warnings } + + async function set_current_eval_config( + eval_config_id: string | null | undefined, + ) { + if (!eval_config_id) { + return + } + try { + const { data, error } = await client.POST( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}", + { + params: { + path: { + project_id: $page.params.project_id, + task_id: $page.params.task_id, + eval_id: $page.params.eval_id, + eval_config_id: eval_config_id, + }, + }, + }, + ) + if (error) { + throw error + } + // Update the evaluator with the latest + evaluator = data + } catch (error) { + eval_error = createKilnError(error) + } + } {#if loading}
@@ -242,16 +284,22 @@
{/each} + {#if score_summary && score_summary.dataset_size > 0 && score_summary.dataset_size < 25} + + {/if}
{#if eval_configs?.length}
-
Correlation to Human Scores
+
Correlation to Human Ratings
- How each eval config correlates to human scores (ratings from the - dataset tab). + How each eval config correlates to human ratings.
{#if score_summary_error}
@@ -279,13 +327,14 @@
+ {#if incomplete_warning(score_summary).length}
-
    +
      {#each incomplete_warning(score_summary) as warning}
    • {warning}
    • {/each} @@ -370,6 +419,18 @@
      0% complete
      {/if} + {#if eval_config.id == evaluator.current_config_id} +
      Default
      + {:else} + + {/if}
{/each} @@ -464,12 +493,15 @@ ]} {#each evaluator.output_scores as output_score} {/each} @@ -648,9 +646,9 @@
-
Create a Run Config
+
Create a Run Method
- A task run config defines how the task is run, such as which model + A task run method defines how the task is run, such as which model and prompt to use. Create one to run this evaluator.
{#each results.eval.output_scores as score} - + {/each} diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte index 1efd78a9..7a7496fb 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte @@ -224,8 +224,9 @@
{#if loading}
@@ -311,8 +312,9 @@
- Include a short description of what this task does for the - evaluator to use as context. + Include a short description of what this task does. The + evaluator will use this for context. Keep it short, ideally one + sentence. Include more detailed requirements in steps below.
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte index a012e4c5..c347809b 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte @@ -159,7 +159,7 @@ const properties: UiProperty[] = [] properties.push({ - name: "Eval Name", + name: "Name", value: evaluator.name, }) if (evaluator.description) { @@ -174,7 +174,7 @@ eval_configs_set_size = " (" + score_summary.dataset_size + " items)" } properties.push({ - name: "Config Eval Set", + name: "Eval Method Dataset", value: evaluator.eval_configs_filter_id + eval_configs_set_size, }) return properties @@ -190,17 +190,17 @@ const warnings: string[] = [] if (score_summary.dataset_size === 0) { warnings.push( - "There are zero items in your config eval dataset. Generate some runs in your dataset tab, and tag them to add them to your eval-config dataset.", + "There are zero items in your eval method dataset. Generate some runs in your dataset tab, and tag them to add them to your eval method dataset.", ) } if (score_summary.not_rated_count > 0) { warnings.push( - `${score_summary.not_rated_count} item(s) in your config eval dataset are not rated at all. Add human ratings to these items in the dataset tab.`, + `${score_summary.not_rated_count} item(s) in your eval method dataset are not rated at all. Add human ratings to these items in the dataset tab.`, ) } if (score_summary.partially_rated_count > 0) { warnings.push( - `${score_summary.partially_rated_count} item(s) in your config eval dataset are only partially rated. Add human ratings to these items for every score.`, + `${score_summary.partially_rated_count} item(s) in your eval method dataset are only partially rated. Add human ratings for each score in the dataset tab.`, ) } @@ -277,11 +277,11 @@ {#if score_summary && score_summary.dataset_size > 0 && score_summary.dataset_size < 25} - +
+ +
{/if} @@ -328,7 +330,7 @@
Correlation to Human Ratings
- How each eval config correlates to human ratings. + How each eval method correlates to human ratings.
@@ -521,7 +523,7 @@ -
Task Description:
- {eval_config.properties["task_description"]} - - {/if} +
+
Task Description:
+ {eval_config.properties["task_description"] || "No description provided."} +
{#if eval_steps}
Evaluation Steps:
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/output_type_table_preview.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/output_type_table_preview.svelte new file mode 100644 index 00000000..a6dd2500 --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/output_type_table_preview.svelte @@ -0,0 +1,29 @@ + + +
+ {#if output_score_type === "five_star"} + 1 to 5 + + + + {:else if output_score_type === "pass_fail"} + pass/fail + + + + {:else if output_score_type === "pass_fail_critical"} + pass/fail/critical + + + + {:else} + {output_score_type} + {/if} +
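The preview component above boils each output score type down to the range label shown in the comparison tables. The same mapping, sketched in Python for reference (illustrative only; the UI logic is the Svelte block above):

SCORE_TYPE_LABELS = {
    "five_star": "1 to 5",
    "pass_fail": "pass/fail",
    "pass_fail_critical": "pass/fail/critical",
}

def score_type_label(output_score_type: str) -> str:
    # Fall back to the raw type name for unknown types, matching the {:else} branch
    return SCORE_TYPE_LABELS.get(output_score_type, output_score_type)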
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte index 345e6d37..87688a4a 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte @@ -125,11 +125,11 @@ eval_set_default_tags[selected_template ?? "none"] || "eval_set" const config_set_default_tags: Record = { kiln_requirements: "golden", - toxicity: "toxicity_config_evals", - bias: "bias_config_evals", - maliciousness: "maliciousness_config_evals", - factual_correctness: "factual_config_evals", - jailbreak: "jailbreak_config_evals", + toxicity: "toxicity_golden", + bias: "bias_golden", + maliciousness: "maliciousness_golden", + factual_correctness: "factual_golden", + jailbreak: "jailbreak_golden", none: "golden", } $: suggested_config_set_tag = @@ -253,11 +253,12 @@
- Part 3: Evaluation Dataset + Part 3: Task Evaluation Dataset
- Specify which which part of your dataset this evaluator should run - on. + Specify which part of your dataset is used when evaluating + different methods of running your task (various prompts, models, + fine-tunes, etc).
- Part 3: Dataset to Evaluate Evaluation Configs + Part 4: Dataset to Compare Evaluation Methods
- Specify which which part of your dataset this evaluator should run - on when attemping to find the ideal evaluation config (prompt, - model, etc). + Specify which part of your dataset is used when trying to find + the best evaluation method for this task. You'll rate these dataset + items, so we can compare the evaluator's ratings to your human + preferences.
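These two dataset selections map to the eval's two filter ids (eval_set_filter_id and eval_configs_filter_id), which in this series default to tag filters such as "tag::eval_set" and "tag::golden". A rough sketch of how the filters partition the task's runs, using the same helpers the eval runner uses (a hypothetical helper for illustration, not part of the patch):

from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id

def split_eval_datasets(task, kiln_eval):
    # Items used to score different ways of running the task (prompts, models, fine-tunes)
    eval_set_filter = dataset_filter_from_id(kiln_eval.eval_set_filter_id)  # e.g. "tag::eval_set"
    # Human-rated items used to compare eval methods against human ratings
    golden_filter = dataset_filter_from_id(kiln_eval.eval_configs_filter_id)  # e.g. "tag::golden"

    runs = task.runs(readonly=True)
    return (
        [run for run in runs if eval_set_filter(run)],
        [run for run in runs if golden_filter(run)],
    )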
Date: Wed, 26 Feb 2025 14:05:05 -0500 Subject: [PATCH 068/102] Fix issue where the run_eval progress disappeared. We triggered loading, which took the whole Svelte component out of the DOM --- .../[project_id]/[task_id]/[eval_id]/+page.svelte | 15 ++++----------- .../[task_id]/[eval_id]/eval_configs/+page.svelte | 6 +----- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index bf388f67..d1320fc4 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -50,13 +50,8 @@ let score_summary: EvalResultSummary | null = null let score_summary_error: KilnError | null = null - let score_summary_loading = false - $: loading = - eval_loading || - eval_configs_loading || - task_run_configs_loading || - score_summary_loading + $: loading = eval_loading || eval_configs_loading || task_run_configs_loading $: error = eval_error || eval_configs_error || task_run_configs_error // Note: not including score_summary_error, because it's not a critical error we should block the UI for @@ -174,7 +169,7 @@ return } try { - score_summary_loading = true + score_summary = null const { data, error } = await client.GET( "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/score_summary", { @@ -194,8 +189,6 @@ score_summary = data } catch (error) { score_summary_error = createKilnError(error) - } finally { - score_summary_loading = false } } @@ -620,11 +613,11 @@ ? 'text-error' : 'text-gray-500'}" > - Eval {(percent_complete * 100.0).toFixed(1)}% complete + {(percent_complete * 100.0).toFixed(1)}% complete {:else if score_summary} -
Eval 0% complete
+
0% complete
{/if} {#each evaluator.output_scores as output_score} diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte index c347809b..2b736b25 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte @@ -39,11 +39,10 @@ let score_summary: EvalConfigCompareSummary | null = null let score_summary_error: KilnError | null = null - let score_summary_loading = false let score_type: "mse" | "mae" | "norm_mse" | "norm_mae" = "norm_mse" - $: loading = eval_loading || eval_configs_loading || score_summary_loading + $: loading = eval_loading || eval_configs_loading // Score summary not blocking whole UI $: error = eval_error || eval_configs_error || score_summary_error $: run_eval_url = `${base_url}/api/projects/${$page.params.project_id}/tasks/${$page.params.task_id}/eval/${$page.params.eval_id}/run_eval_config_eval` @@ -123,7 +122,6 @@ async function get_score_summary() { score_summary = null try { - score_summary_loading = true const { data, error } = await client.GET( "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_configs_score_summary", { @@ -142,8 +140,6 @@ score_summary = data } catch (error) { score_summary_error = createKilnError(error) - } finally { - score_summary_loading = false } } From ee30223921d9f9f451c9f7c6bb832a0a6f6db17f Mon Sep 17 00:00:00 2001 From: scosman Date: Wed, 26 Feb 2025 14:54:29 -0500 Subject: [PATCH 069/102] String changes, final CR feedback --- .../(app)/evals/[project_id]/[task_id]/+page.svelte | 2 +- .../create_evaluator/select_eval_template.svelte | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte index 83654fcf..11bdb687 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte @@ -52,7 +52,7 @@ Date: Thu, 27 Feb 2025 10:20:50 -0500 Subject: [PATCH 070/102] Add a peek warning --- app/web_ui/src/lib/ui/dialog.svelte | 10 ++++- .../[run_config_id]/run_result/+page.svelte | 41 +++++++++++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/app/web_ui/src/lib/ui/dialog.svelte b/app/web_ui/src/lib/ui/dialog.svelte index ffd23807..bda972e9 100644 --- a/app/web_ui/src/lib/ui/dialog.svelte +++ b/app/web_ui/src/lib/ui/dialog.svelte @@ -2,6 +2,7 @@ import { KilnError, createKilnError } from "$lib/utils/error_handlers" export let title: string + export let blur_background: boolean = false const id: string = "dialog-" + Math.random().toString(36) type ActionButton = { label: string @@ -10,6 +11,7 @@ action?: () => boolean isCancel?: boolean isPrimary?: boolean + isError?: boolean disabled?: boolean } export let action_buttons: ActionButton[] = [] @@ -94,7 +96,8 @@
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.svelte index c39f3306..1fe9e206 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.svelte @@ -1,5 +1,7 @@ -
Run Config +
Run Config
+
How task output is generated
+
{output_score.name} -
+
{#if output_score.type === "five_star"} 1 to 5 @@ -639,7 +651,7 @@ /> {:else} - ({output_score.type}) + {output_score.type} {/if}
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.svelte new file mode 100644 index 00000000..c16e7bc0 --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.svelte @@ -0,0 +1,201 @@ + + + + {#if results_loading} +
+
+
+ {:else if results_error} +
+
Error Loading Eval Results
+
+ {results_error.getMessage() || "An unknown error occurred"} +
+
+ {:else if results && results.results.length === 0} +
+
Eval Results Empty
+
+ No results found for this run config. +
+
+ {:else if results} +
+
+
Task Run Config
+
+ How the outputs were generated. +
+
+ {#each Object.entries(get_run_config_properties(results.run_config, results.eval)) as [prop_name, prop_value]} +
{prop_name}
+
+ {prop_value} +
+ {/each} +
+
+
+
Evaluator
+
+ How the outputs were evaluated. +
+
+ {#each Object.entries(get_eval_properties(results.eval, results.eval_config)) as [prop_name, prop_value]} +
{prop_name}
+
+ {prop_value} +
+ {/each} +
+
+
+
+ + + + + + {#each results.eval.output_scores as score} + + {/each} + + + + {#each results.results as result} + + + + {#each results.eval.output_scores as score} + {@const score_value = + result.scores[string_to_json_key(score.name)]} + + {/each} + + {/each} + +
InputOutput{score.name}
{result.input} {result.output} + {score_value ? score_value.toFixed(2) : "N/A"} +
+
+ {/if} +
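The score lookup in the table above relies on string_to_json_key(score.name) matching the keys stored in EvalRun.scores (for example, a score named "Overall Rating" would be stored under "overall_rating"). A stand-in for that normalization, written as an assumption for illustration; the real helper lives in kiln_ai.datamodel.json_schema:

import re

def string_to_json_key_sketch(name: str) -> str:
    # Assumed behavior: lower-case, collapse whitespace to underscores, strip other punctuation
    key = re.sub(r"\s+", "_", name.strip().lower())
    return re.sub(r"[^a-z0-9_]", "", key)

assert string_to_json_key_sketch("Overall Rating") == "overall_rating"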
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.ts b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.ts new file mode 100644 index 00000000..9786e09d --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.ts @@ -0,0 +1 @@ +export const prerender = false From 7e51c3e46cdd6182cac36562056bdc4554b3598f Mon Sep 17 00:00:00 2001 From: scosman Date: Tue, 25 Feb 2025 06:12:56 -0500 Subject: [PATCH 058/102] Add eval config comparison summary API --- app/desktop/studio_server/eval_api.py | 224 +++++++++++++++- app/desktop/studio_server/test_eval_api.py | 291 ++++++++++++++++++++- 2 files changed, 510 insertions(+), 5 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index 97226ba4..7947f40e 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -1,5 +1,5 @@ import json -from typing import Any, Dict, List, Set +from typing import Any, Dict, List, Set, Tuple from fastapi import FastAPI, HTTPException, Query from fastapi.responses import StreamingResponse @@ -12,6 +12,7 @@ DataSourceType, PromptId, Task, + TaskRun, ) from kiln_ai.datamodel.basemodel import ID_TYPE from kiln_ai.datamodel.dataset_filters import DatasetFilterId, dataset_filter_from_id @@ -23,6 +24,7 @@ EvalRun, EvalTemplate, ) +from kiln_ai.datamodel.json_schema import string_to_json_key from kiln_ai.datamodel.prompt_id import is_frozen_prompt from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig from kiln_ai.utils.name_generator import generate_memorable_name @@ -119,12 +121,84 @@ class EvalResultSummary(BaseModel): dataset_size: int +class EvalConfigScoreSummary(BaseModel): + mean_absolute_error: float + mean_squared_error: float + + +class EvalConfigCompareSummary(BaseModel): + # Summary of results. eval_config_id -> output_score_id -> ScoreSummary + results: Dict[str, Dict[str, EvalConfigScoreSummary]] + # eval_config_id -> percent of the dataset that has been processed (run with eval scores) + eval_config_percent_complete: Dict[str, float] + # The total size of the dataset used for the eval config comparisons (eval.eval_configs_filter_id set size) + dataset_size: int + # The number of dataset items which are fully rated, partially rated, or not rated at all. 
+ fully_rated_count: int + partially_rated_count: int + not_rated_count: int + + def dataset_ids_in_filter(task: Task, filter_id: DatasetFilterId) -> Set[ID_TYPE]: # Fetch all the dataset items IDs in a filter filter = dataset_filter_from_id(filter_id) return {run.id for run in task.runs() if filter(run)} +def human_score_from_task_run( + task_run: TaskRun, + score_key: str, + score_key_to_task_requirement_id: Dict[str, ID_TYPE], +) -> float | None: + if not task_run.output.rating: + return None + + human_score: float | None = None + if score_key == "overall_rating": + human_score = task_run.output.rating.value + else: + req_rating = task_run.output.rating.requirement_ratings.get( + score_key_to_task_requirement_id[score_key], None + ) + if req_rating is not None: + human_score = req_rating.value + + return human_score + + +def count_human_evals( + items: Set[TaskRun], + eval: Eval, + score_key_to_task_requirement_id: Dict[str, ID_TYPE], +) -> Tuple[int, int, int]: + # Track how often we are missing human evals in dataset items + fully_rated_count: int = 0 + partially_rated_count: int = 0 + not_rated_count: int = 0 + for dataset_item in items: + # Check it has all scores + has_all_scores = True + has_any_scores = False + for output_score in eval.output_scores: + score_key = output_score.json_key() + score = human_score_from_task_run( + dataset_item, score_key, score_key_to_task_requirement_id + ) + if score is None: + has_all_scores = False + else: + has_any_scores = True + + if not has_any_scores: + not_rated_count += 1 + elif has_all_scores: + fully_rated_count += 1 + else: + partially_rated_count += 1 + + return fully_rated_count, partially_rated_count, not_rated_count + + def connect_evals_api(app: FastAPI): @app.post("/api/projects/{project_id}/tasks/{task_id}/create_evaluator") async def create_evaluator( @@ -168,6 +242,15 @@ async def get_eval_configs( eval = eval_from_id(project_id, task_id, eval_id) return eval.configs() + @app.get( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}" + ) + async def get_eval_config( + project_id: str, task_id: str, eval_id: str, eval_config_id: str + ) -> EvalConfig: + eval_config = eval_config_from_id(project_id, task_id, eval_id, eval_config_id) + return eval_config + @app.post("/api/projects/{project_id}/tasks/{task_id}/task_run_config") async def create_task_run_config( project_id: str, @@ -368,7 +451,7 @@ async def get_eval_config_score_summary( # Check if we should count this eval_run. 
Not every eval_run has to go into the stats: # - a dataset_id can be removed from the dataset filter (removed a tag) - # - this dataset_id was already counted (okay there are dupes, but shouldn't be double counted) + # - this dataset_id was already counted (not great there are dupes, but really shouldn't be double counted) if eval_run.dataset_id not in remaining_expected_dataset_ids[run_config_id]: continue else: @@ -421,3 +504,140 @@ async def get_eval_config_score_summary( run_config_percent_complete=run_config_percent_complete, dataset_size=len(expected_dataset_ids), ) + + # Compared to above, this is comparing all eval configs to each other, not looking at a single eval config + @app.get( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_configs_score_summary" + ) + async def get_eval_configs_score_summary( + project_id: str, + task_id: str, + eval_id: str, + ) -> EvalConfigCompareSummary: + task = task_from_id(project_id, task_id) + eval = eval_from_id(project_id, task_id, eval_id) + eval_configs = eval.configs(readonly=True) + + # Create a map of score_key -> Task requirement ID + score_key_to_task_requirement_id: Dict[str, ID_TYPE] = {} + for task_requirement in task.requirements: + score_key = string_to_json_key(task_requirement.name) + score_key_to_task_requirement_id[score_key] = task_requirement.id + + # Build a set of all the dataset items IDs we expect to have scores for + # Fetch all the dataset items in a filter, and return a map of dataset_id -> TaskRun + filter = dataset_filter_from_id(eval.eval_configs_filter_id) + expected_dataset_items = {run.id: run for run in task.runs() if filter(run)} + expected_dataset_ids = set(expected_dataset_items.keys()) + if len(expected_dataset_ids) == 0: + return EvalConfigCompareSummary( + results={}, + eval_config_percent_complete={}, + dataset_size=0, + fully_rated_count=0, + partially_rated_count=0, + not_rated_count=0, + ) + + # save a copy of the expected dataset ids for each eval config, we'll update each as we process each eval run + remaining_expected_dataset_ids: Dict[str, Set[ID_TYPE]] = { + str(eval_config.id): set(expected_dataset_ids) + for eval_config in eval_configs + } + + # eval_config_id -> output_score_id -> scores/total + total_squared_error: Dict[str, Dict[str, float]] = {} + total_absolute_error: Dict[str, Dict[str, float]] = {} + total_count: Dict[str, Dict[str, int]] = {} + + # important: readonly makes this much faster + for eval_config in eval_configs: + eval_config_id = str(eval_config.id) + for eval_run in eval_config.runs(readonly=True): + dataset_item = expected_dataset_items.get(eval_run.dataset_id, None) + if dataset_item is None: + # A dataset_id can be removed from the dataset filter (ran previously, then removed the tag to remove it from the eval config set filter) + # A dataset_id could be for an run_config, not for comparing eval at all + continue + + # Check if we should count this eval_run. 
Not every eval_run has to go into the stats: + # Example: this dataset_id was already counted (not great there are dupes, but really shouldn't be double counted) + if ( + eval_run.dataset_id + not in remaining_expected_dataset_ids[eval_config_id] + ): + continue + else: + remaining_expected_dataset_ids[eval_config_id].remove( + eval_run.dataset_id + ) + + for output_score in eval.output_scores: + score_key = output_score.json_key() + eval_score: float | None = eval_run.scores.get(score_key, None) + + # Fetch the human eval score from the dataset item + human_score = human_score_from_task_run( + dataset_item, score_key, score_key_to_task_requirement_id + ) + + if human_score is None or eval_score is None: + # This score doesn't have both a human eval and eval score, so we can't compare + continue + + if eval_config_id not in total_squared_error: + total_squared_error[eval_config_id] = {} + total_absolute_error[eval_config_id] = {} + total_count[eval_config_id] = {} + if score_key not in total_squared_error[eval_config_id]: + total_squared_error[eval_config_id][score_key] = 0 + total_absolute_error[eval_config_id][score_key] = 0 + total_count[eval_config_id][score_key] = 0 + + # TODO normalize MSE? + total_squared_error[eval_config_id][score_key] += ( + eval_score - human_score + ) ** 2 + total_absolute_error[eval_config_id][score_key] += abs( + eval_score - human_score + ) + total_count[eval_config_id][score_key] += 1 + + # Convert to score summaries + results: Dict[str, Dict[str, EvalConfigScoreSummary]] = {} + for eval_config_id in total_count.keys(): + results[eval_config_id] = {} + for score_key in total_count[eval_config_id].keys(): + count = total_count[eval_config_id][score_key] + if count > 0: + results[eval_config_id][score_key] = EvalConfigScoreSummary( + mean_squared_error=( + total_squared_error[eval_config_id][score_key] / count + ), + mean_absolute_error=( + total_absolute_error[eval_config_id][score_key] / count + ), + ) + + # Calculate the percent of the dataset that has been processed + eval_config_percent_complete: Dict[str, float] = {} + for eval_config in eval_configs: + eval_config_id = str(eval_config.id) + # Partial incomplete (missing scores), and fully incomplete (no eval_run) + incomplete_count = len(remaining_expected_dataset_ids[eval_config_id]) + percent_incomplete = incomplete_count / len(expected_dataset_ids) + eval_config_percent_complete[str(eval_config.id)] = 1 - percent_incomplete + + # Count how many dataset items have human evals + fully_rated_count, partially_rated_count, not_rated_count = count_human_evals( + expected_dataset_items.values(), eval, score_key_to_task_requirement_id + ) + + return EvalConfigCompareSummary( + results=results, + eval_config_percent_complete=eval_config_percent_complete, + dataset_size=len(expected_dataset_ids), + fully_rated_count=fully_rated_count, + partially_rated_count=partially_rated_count, + not_rated_count=not_rated_count, + ) diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index 93eda512..f7ae1fcb 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -1,5 +1,6 @@ import json -from typing import Dict, Tuple +from dataclasses import dataclass +from typing import Dict, List, Tuple from unittest.mock import Mock, patch import pytest @@ -10,9 +11,15 @@ BasePrompt, DataSource, DataSourceType, + Priority, Project, PromptId, + RequirementRating, Task, + TaskOutput, + TaskOutputRating, + TaskRequirement, + TaskRun, ) 
from kiln_ai.datamodel.dataset_filters import DatasetFilterId from kiln_ai.datamodel.eval import ( @@ -60,6 +67,15 @@ def mock_task(tmp_path): description="Test Description", instruction="Test Instructions", path=tmp_path / "task.kiln", + requirements=[ + TaskRequirement( + name="score1", + description="desc1", + instruction="inst1", + priority=Priority.p1, + type="five_star", + ), + ], parent=project, ) task.save_to_file() @@ -75,6 +91,9 @@ def mock_eval(mock_task): template=EvalTemplate.bias, output_scores=[ EvalOutputScore(name="score1", description="desc1", type="five_star"), + EvalOutputScore( + name="overall_rating", description="desc2", type="five_star" + ), ], eval_set_filter_id="tag::eval_set", eval_configs_filter_id="tag::golden", @@ -348,6 +367,28 @@ async def test_create_eval_config( assert config.properties["eval_steps"][1] == "step2" +def test_get_eval_config( + client, mock_task_from_id, mock_eval, mock_task, mock_eval_config +): + mock_task_from_id.return_value = mock_task + + with patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id: + mock_eval_from_id.return_value = mock_eval + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1" + ) + + assert response.status_code == 200 + config = response.json() + assert isinstance(config, dict) + + assert config["config_type"] == mock_eval_config.config_type + assert config["properties"] == mock_eval_config.properties + assert config["model"]["type"] == mock_eval_config.model.type + + mock_eval_from_id.assert_called_once_with("project1", "task1", "eval1") + + def test_get_eval_configs( client, mock_task_from_id, mock_eval, mock_task, mock_eval_config ): @@ -629,7 +670,7 @@ async def test_get_eval_run_results( eval_run = EvalRun( task_run_config_id="run_config1", - scores={"score1": 3.0}, + scores={"score1": 3.0, "overall_rating": 1.0}, input="input", output="output", dataset_id="dataset_id1", @@ -656,7 +697,7 @@ async def test_get_eval_run_results( assert len(data["results"]) == 1 assert data["results"][0]["id"] == eval_run.id assert data["results"][0]["task_run_config_id"] == mock_run_config.id - assert data["results"][0]["scores"] == {"score1": 3.0} + assert data["results"][0]["scores"] == {"score1": 3.0, "overall_rating": 1.0} # Test with invalid eval ID response = client.get( @@ -678,3 +719,247 @@ async def test_get_eval_run_results( f"/eval_config/eval_config1/run_config/invalid_run_config/results" ) assert response.status_code == 404 + + +@pytest.mark.asyncio +async def test_get_eval_config_compare_summary( + client, + mock_task_from_id, + mock_task, + mock_eval, + mock_eval_config, + mock_run_config, +): + mock_task_from_id.return_value = mock_task + + # structed data to make it easier to generate test cases. + @dataclass + class EvalCondigSummaryTestData: + human_overall_rating: float | None + score1_overall_rating: float | None + eval_overall_rating: float + eval__score1_rating: float + eval_config_id: str + skip_eval_run: bool = False + skip_golden_tag: bool = False + + test_data: List[EvalCondigSummaryTestData] = [ + # Test 1: ec1 + # Normal run, with some data to check calulations on a sinlgle run + EvalCondigSummaryTestData( + human_overall_rating=5.0, + score1_overall_rating=2.0, + eval_overall_rating=1.0, + eval__score1_rating=3.5, + eval_config_id="ec1", + ), + # Should be ignored as it's not in the eval set filter (golden tag). 
Would mess up the scores of eval_config1 if included + EvalCondigSummaryTestData( + human_overall_rating=5.0, + score1_overall_rating=5.0, + eval_overall_rating=4.0, + eval__score1_rating=4.0, + eval_config_id="ec2", + skip_golden_tag=True, + ), + # Test 2: ec2 - Test multiple, and correct averaging + EvalCondigSummaryTestData( + human_overall_rating=5.0, + score1_overall_rating=5.0, + eval_overall_rating=4.0, + eval__score1_rating=4.0, + eval_config_id="ec2", + ), + EvalCondigSummaryTestData( + human_overall_rating=5.0, + score1_overall_rating=1.0, + eval_overall_rating=3.0, + eval__score1_rating=3.0, + eval_config_id="ec2", + ), + # Test 3: Dataset item that has partial human rating + EvalCondigSummaryTestData( + human_overall_rating=5.0, + score1_overall_rating=None, + eval_overall_rating=3.0, + eval__score1_rating=3.0, + eval_config_id="ec3", + ), + # Test 4: Dataset item that has no human rating + EvalCondigSummaryTestData( + human_overall_rating=None, + score1_overall_rating=None, + eval_overall_rating=3.0, + eval__score1_rating=3.0, + eval_config_id="ec4", + ), + # Test 5: skipping eval run should lower the percent complete + EvalCondigSummaryTestData( + human_overall_rating=5.0, + score1_overall_rating=5.0, + eval_overall_rating=4.0, + eval__score1_rating=4.0, + eval_config_id="ec5", + skip_eval_run=True, + ), + ] + + # Count items that don't have skip_golden_tag set to True + total_in_dataset = sum(1 for x in test_data if not x.skip_golden_tag) + + eval_configs_by_id: Dict[str, EvalConfig] = {} + + assert len(mock_task.requirements) == 1 + assert mock_task.requirements[0].name == "score1" + score1_requirement_id = mock_task.requirements[0].id + for test_case in test_data: + # create eval config if it doesn't exist + eval_config = eval_configs_by_id.get(test_case.eval_config_id) + if eval_config is None: + eval_config = EvalConfig( + id=test_case.eval_config_id, + name="Test Eval Config", + config_type=EvalConfigType.g_eval, + properties={"eval_steps": ["step1", "step2"]}, + parent=mock_eval, + model=DataSource( + id="model1", + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt-4", + "model_provider": "openai", + "adapter_name": "TODO", + }, + ), + prompt=BasePrompt( + name="test", + prompt="base prompt", + chain_of_thought_instructions="cot prompt", + ), + ) + eval_config.save_to_file() + eval_configs_by_id[test_case.eval_config_id] = eval_config + + tags = ["golden"] + if test_case.skip_golden_tag: + tags = [] + + ratings = {} + if test_case.score1_overall_rating is not None: + ratings[score1_requirement_id] = RequirementRating( + value=test_case.score1_overall_rating, + type="five_star", + ) + + task_run = TaskRun( + output=TaskOutput( + output="Test Output", + source=DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt-4", + "model_provider": "openai", + "adapter_name": "langchain_adapter", + }, + ), + rating=TaskOutputRating( + value=test_case.human_overall_rating, + requirement_ratings=ratings, + ), + ), + input="Test Input", + input_source=DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt-4", + "model_provider": "openai", + "adapter_name": "langchain_adapter", + }, + ), + tags=tags, + parent=mock_task, + ) + task_run.save_to_file() + + if test_case.skip_eval_run: + continue + + eval_run = EvalRun( + task_run_config_id="run_config1", + scores={ + "score1": test_case.eval__score1_rating, + "overall_rating": test_case.eval_overall_rating, + }, + input="input", + output="output", + 
dataset_id=task_run.id, + parent=eval_config, + ) + eval_run.save_to_file() + + # Test successful retrieval + response = client.get( + f"/api/projects/project1/tasks/task1/eval/eval1/eval_configs_score_summary" + ) + + assert response.status_code == 200 + data = response.json() + + assert "results" in data + results = data["results"] + assert isinstance(results, dict) + + assert "eval_config_percent_complete" in data + eval_config_percent_complete = data["eval_config_percent_complete"] + assert isinstance(eval_config_percent_complete, dict) + + # check the counts + assert data["fully_rated_count"] == 4 + assert data["partially_rated_count"] == 1 + assert data["not_rated_count"] == 1 + assert data["dataset_size"] == total_in_dataset + + # Test case 1: 1 item should be included, manually calculated scores, should exclude a second item that isn't in the eval config set filter + assert results["ec1"] == { + "overall_rating": { + "mean_squared_error": 16.0, # error 4.0^2 + "mean_absolute_error": 4.0, # error 4.0 + }, + "score1": { + "mean_squared_error": 2.25, # error (3.5-5.0)^2 + "mean_absolute_error": 1.5, # error 1.5 + }, + } + # 1 of total_in_dataset eval configs are are in ec1 test + assert eval_config_percent_complete["ec1"] == pytest.approx(1 / total_in_dataset) + + # Test case 2: check proper averaging + assert results["ec2"] == { + "overall_rating": { + "mean_squared_error": 2.5, # error (1^2 + 2^2) / 2 + "mean_absolute_error": 1.5, # (1+2)/2 + }, + "score1": { + "mean_squared_error": 2.5, # (1^2+2^2)/2 + "mean_absolute_error": 1.5, # (1+2)/2 + }, + } + # 2 of total_in_dataset eval configs are are in ec2 test + assert eval_config_percent_complete["ec2"] == pytest.approx(2 / total_in_dataset) + + # Test case 3: Check partials still calulate available scores + assert results["ec3"] == { + "overall_rating": { + "mean_squared_error": 4, + "mean_absolute_error": 2, + }, + } + # 2 of total_in_dataset eval configs are are in ec2 test + assert eval_config_percent_complete["ec3"] == pytest.approx(1 / total_in_dataset) + + # Test case 4: Check no rating is empty results + assert results.get("ec4", {}) == {} + assert eval_config_percent_complete["ec4"] == pytest.approx(1 / total_in_dataset) + + # Test case 5: Check skipping eval run lowers the percent complete + assert eval_config_percent_complete["ec5"] == pytest.approx(0 / total_in_dataset) From 113475c15557009d696757d4069d1bcf42a2cb12 Mon Sep 17 00:00:00 2001 From: scosman Date: Tue, 25 Feb 2025 06:51:38 -0500 Subject: [PATCH 059/102] WIP UI for evaluating eval configs --- app/web_ui/src/lib/api_schema.d.ts | 129 +++++ app/web_ui/src/lib/types.ts | 2 + .../[task_id]/[eval_id]/+page.svelte | 11 +- .../[eval_id]/eval_configs/+page.svelte | 535 ++++++++++++++++++ .../[task_id]/[eval_id]/eval_configs/+page.ts | 1 + 5 files changed, 675 insertions(+), 3 deletions(-) create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.ts diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index c97cd519..fb43195b 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -742,6 +742,23 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get 
Eval Config */ + get: operations["get_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/projects/{project_id}/tasks/{task_id}/task_run_config": { parameters: { query?: never; @@ -827,6 +844,23 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_configs_score_summary": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Eval Configs Score Summary */ + get: operations["get_eval_configs_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_configs_score_summary_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; } export type webhooks = Record; export interface components { @@ -1313,6 +1347,34 @@ export interface components { /** Model Type */ readonly model_type: string; }; + /** EvalConfigCompareSummary */ + EvalConfigCompareSummary: { + /** Results */ + results: { + [key: string]: { + [key: string]: components["schemas"]["EvalConfigScoreSummary"]; + }; + }; + /** Eval Config Percent Complete */ + eval_config_percent_complete: { + [key: string]: number; + }; + /** Dataset Size */ + dataset_size: number; + /** Fully Rated Count */ + fully_rated_count: number; + /** Partially Rated Count */ + partially_rated_count: number; + /** Not Rated Count */ + not_rated_count: number; + }; + /** EvalConfigScoreSummary */ + EvalConfigScoreSummary: { + /** Mean Absolute Error */ + mean_absolute_error: number; + /** Mean Squared Error */ + mean_squared_error: number; + }; /** * EvalConfigType * @enum {string} @@ -4031,6 +4093,40 @@ export interface operations { }; }; }; + get_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + eval_id: string; + eval_config_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["EvalConfig"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; create_task_run_config_api_projects__project_id__tasks__task_id__task_run_config_post: { parameters: { query?: never; @@ -4210,4 +4306,37 @@ export interface operations { }; }; }; + get_eval_configs_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_configs_score_summary_get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + eval_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["EvalConfigCompareSummary"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; } diff --git a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts index 
2739bb6b..e191de7e 100644 --- a/app/web_ui/src/lib/types.ts +++ b/app/web_ui/src/lib/types.ts @@ -27,3 +27,5 @@ export type EvalConfig = components["schemas"]["EvalConfig"] export type TaskRunConfig = components["schemas"]["TaskRunConfig"] export type EvalResultSummary = components["schemas"]["EvalResultSummary"] export type EvalRunResult = components["schemas"]["EvalRunResult"] +export type EvalConfigCompareSummary = + components["schemas"]["EvalConfigCompareSummary"] diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index 7d7858c3..b2696a8d 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -483,8 +483,8 @@ subtitle={evaluator?.name} action_buttons={[ { - label: "Evaluate Eval Quality", - href: `/evals/${project_id}/${task_id}/${eval_id}/TODO`, + label: "Compare Eval Configs", + href: `/evals/${project_id}/${task_id}/${eval_id}/eval_configs`, }, ]} > @@ -545,7 +545,12 @@ {/each}
Quality
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte new file mode 100644 index 00000000..8415f289 --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte @@ -0,0 +1,535 @@ + + + + {#if loading} +
+
+
+ {:else if error} +
+
Error Loading
+
+ {error.getMessage() || "An unknown error occurred"} +
+
+ {:else if evaluator} +
+
+
Evaluator Properties
+
+ {#each get_eval_properties(evaluator, score_summary) as property} +
{property.name}
+
+ {property.value} +
+ {/each} +
+
+
+
+ {#if eval_configs?.length} +
+
+
Correlation to Human Scores
+
+ Overview of how each eval config correlates to human scores + (ratings from the dataset tab). +
+ {#if score_summary_error} +
+ {score_summary_error.getMessage() || + "An unknown error occurred fetching scores."} +
+ {/if} +
+
+ {#if eval_state === "not_started"} + + {:else} + + {/if} +
+
+ + + + {#if show_incomplete_warning(score_summary)} +
+ +
+ {/if} + +
+ + + + + + {#each evaluator.output_scores as output_score} + + {/each} + + + + {#each eval_configs || [] as eval_config} + {@const percent_complete = + score_summary?.eval_config_percent_complete?.[ + "" + eval_config.id + ]} + + + + {#each evaluator.output_scores as output_score} + {@const score = null} + + {/each} + + {/each} + +
+
Eval Config
+
How task output is evaluated
+
Eval Instructions + {output_score.name} +
+ {#if output_score.type === "five_star"} + 1 to 5 + + + + {:else if output_score.type === "pass_fail"} + pass/fail + + + + {:else if output_score.type === "pass_fail_critical"} + pass/fail/critical + + + + {:else} + {output_score.type} + {/if} +
+
+
+ {eval_config.name} +
+
+ {model_name( + eval_config?.model.properties?.["model_name"], + $model_info, + )} +
+
+ {provider_name_from_id( + eval_config?.model.properties?.["model_provider_name"] + + "", + )} +
+ {#if percent_complete} +
+ Eval {(percent_complete * 100.0).toFixed(1)}% complete +
+ {:else if score_summary} + +
Eval 0% complete
+ {/if} +
+
+ {#if eval_config.properties?.["task_description"]} +
+
Task Description:
+ {eval_config.properties["task_description"]} +
+ {/if} + {#if eval_config.properties?.["eval_steps"] && Array.isArray(eval_config.properties["eval_steps"])} +
+
+ Evaluator Instructions: +
+
    + {#each eval_config.properties["eval_steps"] as step} +
  1. + + {step} + +
  2. + {/each} +
+
+ {/if} +
+
+ {score != null ? score.toFixed(2) : "unknown"} +
+
+ {:else} + + {/if} +
+ {/if} +
+ + +
+ {#if eval_state === "complete"} +
Eval Complete 🎉
+ {#if eval_total_count == 0} +
+ No evals were run, because everything was already up to date! +
+ {/if} + {:else if eval_state === "complete_with_errors"} +
Eval Complete with Errors
+ {:else if eval_state === "running"} +
+
Running...
+ {/if} +
+ {#if eval_total_count > 0} +
+ {eval_complete_count + eval_error_count} of {eval_total_count} +
+ {/if} + {#if eval_error_count > 0} +
+ {eval_error_count} error{eval_error_count === 1 ? "" : "s"} +
+ {/if} + {#if eval_run_error} +
+ {eval_run_error.getMessage() || "An unknown error occurred"} +
+ {/if} +
+
+
+ + +
+
Run this eval with the selected configuration?
+
Don't close this page if you want to monitor progress.
+ +
+
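The mean squared error and mean absolute error surfaced on this page come from the eval_configs_score_summary endpoint added earlier in this series: for each eval config and score, it averages (eval_score - human_score)^2 and |eval_score - human_score| over the golden items that have both an eval score and a human rating. A worked example matching the ec2 case in test_eval_api.py (human ratings of 5 and 5, judge scores of 4 and 3):

pairs = [(4.0, 5.0), (3.0, 5.0)]  # (eval_score, human_score)

mse = sum((e - h) ** 2 for e, h in pairs) / len(pairs)  # (1 + 4) / 2 = 2.5
mae = sum(abs(e - h) for e, h in pairs) / len(pairs)    # (1 + 2) / 2 = 1.5

assert (mse, mae) == (2.5, 1.5)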
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.ts b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.ts new file mode 100644 index 00000000..9786e09d --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.ts @@ -0,0 +1 @@ +export const prerender = false From a8bb4db362bf26f43474d942bd91207af9c57ed6 Mon Sep 17 00:00:00 2001 From: scosman Date: Tue, 25 Feb 2025 20:58:28 -0500 Subject: [PATCH 060/102] Eval runner updated to be more powerful. Run a eval_config analysis, as well as x product of eval_configs and task runs. --- app/desktop/studio_server/eval_api.py | 9 +- app/desktop/studio_server/test_eval_api.py | 22 +- libs/core/kiln_ai/adapters/eval/base_eval.py | 7 +- .../core/kiln_ai/adapters/eval/eval_runner.py | 158 ++++++--- libs/core/kiln_ai/adapters/eval/g_eval.py | 8 +- .../kiln_ai/adapters/eval/test_eval_runner.py | 308 ++++++++++++++++-- libs/core/kiln_ai/datamodel/eval.py | 23 +- .../core/kiln_ai/datamodel/test_eval_model.py | 54 +++ 8 files changed, 502 insertions(+), 87 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index 7947f40e..0b834d89 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -167,7 +167,7 @@ def human_score_from_task_run( def count_human_evals( - items: Set[TaskRun], + items: List[TaskRun], eval: Eval, score_key_to_task_requirement_id: Dict[str, ID_TYPE], ) -> Tuple[int, int, int]: @@ -362,8 +362,9 @@ async def run_eval_config( ] eval_runner = EvalRunner( - eval_config=eval_config, + eval_configs=[eval_config], run_configs=run_configs, + eval_run_type="task_run_eval", ) # Async messages via server side events (SSE) @@ -630,7 +631,9 @@ async def get_eval_configs_score_summary( # Count how many dataset items have human evals fully_rated_count, partially_rated_count, not_rated_count = count_human_evals( - expected_dataset_items.values(), eval, score_key_to_task_requirement_id + list(expected_dataset_items.values()), + eval, + score_key_to_task_requirement_id, ) return EvalConfigCompareSummary( diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index f7ae1fcb..539c0c9e 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -13,7 +13,6 @@ DataSourceType, Priority, Project, - PromptId, RequirementRating, Task, TaskOutput, @@ -21,7 +20,6 @@ TaskRequirement, TaskRun, ) -from kiln_ai.datamodel.dataset_filters import DatasetFilterId from kiln_ai.datamodel.eval import ( Eval, EvalConfig, @@ -680,8 +678,8 @@ async def test_get_eval_run_results( # Test successful retrieval response = client.get( - f"/api/projects/project1/tasks/task1/eval/eval1" - f"/eval_config/eval_config1/run_config/run_config1/results" + "/api/projects/project1/tasks/task1/eval/eval1" + "/eval_config/eval_config1/run_config/run_config1/results" ) assert response.status_code == 200 @@ -701,22 +699,22 @@ async def test_get_eval_run_results( # Test with invalid eval ID response = client.get( - f"/api/projects/project1/tasks/task1/eval/invalid_eval" - f"/eval_config/eval_config1/run_config/run_config1/results" + "/api/projects/project1/tasks/task1/eval/invalid_eval" + "/eval_config/eval_config1/run_config/run_config1/results" ) assert response.status_code == 404 # Test with invalid eval config ID response = client.get( - f"/api/projects/project1/tasks/task1/eval/eval1" 
- f"/eval_config/invalid_config/run_config/run_config1/results" + "/api/projects/project1/tasks/task1/eval/eval1" + "/eval_config/invalid_config/run_config/run_config1/results" ) assert response.status_code == 404 # Test with invalid run config ID response = client.get( - f"/api/projects/project1/tasks/task1/eval/eval1" - f"/eval_config/eval_config1/run_config/invalid_run_config/results" + "/api/projects/project1/tasks/task1/eval/eval1" + "/eval_config/eval_config1/run_config/invalid_run_config/results" ) assert response.status_code == 404 @@ -899,7 +897,7 @@ class EvalCondigSummaryTestData: # Test successful retrieval response = client.get( - f"/api/projects/project1/tasks/task1/eval/eval1/eval_configs_score_summary" + "/api/projects/project1/tasks/task1/eval/eval1/eval_configs_score_summary" ) assert response.status_code == 200 @@ -947,7 +945,7 @@ class EvalCondigSummaryTestData: # 2 of total_in_dataset eval configs are are in ec2 test assert eval_config_percent_complete["ec2"] == pytest.approx(2 / total_in_dataset) - # Test case 3: Check partials still calulate available scores + # Test case 3: Check partials still calculate available scores assert results["ec3"] == { "overall_rating": { "mean_squared_error": 4, diff --git a/libs/core/kiln_ai/adapters/eval/base_eval.py b/libs/core/kiln_ai/adapters/eval/base_eval.py index c8a2dd7f..47e85d32 100644 --- a/libs/core/kiln_ai/adapters/eval/base_eval.py +++ b/libs/core/kiln_ai/adapters/eval/base_eval.py @@ -11,7 +11,7 @@ class BaseEval: - def __init__(self, eval_config: EvalConfig, run_config: RunConfig): + def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None): self.eval_config = eval_config eval = eval_config.parent_eval() if not eval: @@ -40,7 +40,10 @@ def model_and_provider(self) -> tuple[str, ModelProviderName]: return model_name, ModelProviderName(provider) - async def run(self, input: str) -> tuple[TaskRun, EvalScores]: + async def run_task_and_eval(self, input: str) -> tuple[TaskRun, EvalScores]: + if self.run_config is None: + raise ValueError("Run config is required for run_task_and_eval") + run_adapter = adapter_for_task( self.target_task, self.run_config.model_name, diff --git a/libs/core/kiln_ai/adapters/eval/eval_runner.py b/libs/core/kiln_ai/adapters/eval/eval_runner.py index fd4eceb7..3b4f0a6f 100644 --- a/libs/core/kiln_ai/adapters/eval/eval_runner.py +++ b/libs/core/kiln_ai/adapters/eval/eval_runner.py @@ -1,11 +1,12 @@ import asyncio from dataclasses import dataclass -from typing import AsyncGenerator, List +from typing import AsyncGenerator, Dict, List, Literal, Set from kiln_ai.adapters.eval.base_eval import BaseEval from kiln_ai.adapters.eval.registry import eval_adapter_from_type +from kiln_ai.datamodel.basemodel import ID_TYPE from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id -from kiln_ai.datamodel.eval import EvalConfig, EvalRun +from kiln_ai.datamodel.eval import EvalConfig, EvalRun, EvalScores from kiln_ai.datamodel.task import TaskRunConfig from kiln_ai.datamodel.task_run import TaskRun @@ -13,7 +14,10 @@ @dataclass class EvalJob: item: TaskRun - task_run_config: TaskRunConfig + type: Literal["task_run_eval", "eval_config_eval"] + # If type == "task_run_eval", both of these should be set. If type == "eval_config_eval", only eval_config should be set. 
+ eval_config: EvalConfig + task_run_config: TaskRunConfig | None = None @dataclass @@ -32,55 +36,118 @@ class EvalRunner: def __init__( self, - eval_config: EvalConfig, - run_configs: List[TaskRunConfig], + eval_configs: List[EvalConfig], + run_configs: List[TaskRunConfig] | None, + eval_run_type: Literal["eval_config_eval", "task_run_eval"], ): - # confirm these are compatible - target_eval = eval_config.parent_eval() + if len(eval_configs) == 0: + raise ValueError("Eval runner requires at least one eval config") + target_eval = eval_configs[0].parent_eval() if target_eval is None: raise ValueError("Eval config requires a parent eval") + for eval_config in eval_configs: + parent_eval = eval_config.parent_eval() + if parent_eval is None: + raise ValueError("Eval config requires a parent eval") + if parent_eval.id != target_eval.id: + raise ValueError("All eval configs must have the same parent eval") + target_task = target_eval.parent_task() if target_task is None: raise ValueError("Eval config requires a (grand)parent task") - if len(run_configs) == 0: - raise ValueError("Eval config requires at least one run config") - - # confirm the run configs are for the target task - for run_config in run_configs: - parent_task = run_config.parent_task() - if parent_task is None: - raise ValueError("Each run config requires a parent task") - if parent_task.id != target_task.id: - raise ValueError( - "Run config is not for the same task as the eval config" - ) - self.eval_config = eval_config + # Check that run_configs is compatible + if eval_run_type == "task_run_eval": + if run_configs is None or len(run_configs) == 0: + raise ValueError("Task run eval requires run configs") + for run_config in run_configs: + parent_task = run_config.parent_task() + if parent_task is None: + raise ValueError("All run configs must have a parent task") + if parent_task.id != target_task.id: + raise ValueError( + "Run config is not for the same task as the eval configs" + ) + else: + if run_configs is not None: + raise ValueError("Mode 'eval_config_eval' does not support run configs") + + self.eval_run_type = eval_run_type + self.eval_configs = eval_configs self.run_configs = run_configs self.task = target_task self.eval = target_eval def collect_tasks(self) -> List[EvalJob]: + if self.eval_run_type == "eval_config_eval": + return self.collect_tasks_for_eval_config_eval() + else: + return self.collect_tasks_for_task_run_eval() + + def collect_tasks_for_eval_config_eval(self) -> List[EvalJob]: + """ + Collect all jobs for this run, excluding any that have already been run. + + This variant is used when evaluating an eval config, using existing dataset run. 
+ + The tasks: + - should be in the eval config set filter + - should not have already been run for this eval config + dataset item pair + """ + filter = dataset_filter_from_id(self.eval.eval_configs_filter_id) + + # already_run[eval_config_id][dataset_id] + already_run: Dict[ID_TYPE, Set[ID_TYPE]] = {} + for eval_config in self.eval_configs: + already_run[eval_config.id] = set() + for run in eval_config.runs(readonly=True): + already_run[eval_config.id].add(run.dataset_id) + + return [ + EvalJob( + item=task_run, + eval_config=eval_config, + type="eval_config_eval", + ) + for task_run in self.task.runs(readonly=True) + if filter(task_run) + for eval_config in self.eval_configs + if task_run.id not in already_run[eval_config.id] + ] + + def collect_tasks_for_task_run_eval(self) -> List[EvalJob]: """ Collect all jobs for this run, excluding any that have already been run. + This variant is used when evaluating a range of task run configs on an eval config. + The tasks: - - should be in one of the eval filters: the eval filter (what's being evaluated) or the eval config filter (what's being evaluated to compare eval configs). - - should not have already been run for this eval config + - should be in the eval set filter + - should not have already been run for this eval config + run config pair """ - config_filter = dataset_filter_from_id(self.eval.eval_configs_filter_id) - eval_filter = dataset_filter_from_id(self.eval.eval_set_filter_id) + filter = dataset_filter_from_id(self.eval.eval_set_filter_id) + + # already_run[eval_config_id][run_config_id][dataset_id] + already_run: Dict[ID_TYPE, Dict[ID_TYPE, Set[ID_TYPE]]] = {} + for eval_config in self.eval_configs: + already_run[eval_config.id] = {} + for run_config in self.run_configs or []: + already_run[eval_config.id][run_config.id] = set() + for run in eval_config.runs(readonly=True): + already_run[eval_config.id][run_config.id].add(run.dataset_id) - already_run = { - f"{run.dataset_id}::{run.task_run_config_id}" - for run in self.eval_config.runs(readonly=True) - } return [ - EvalJob(item=task_run, task_run_config=run_config) + EvalJob( + item=task_run, + task_run_config=run_config, + type="task_run_eval", + eval_config=eval_config, + ) for task_run in self.task.runs(readonly=True) - if config_filter(task_run) or eval_filter(task_run) - for run_config in self.run_configs - if f"{task_run.id}::{run_config.id}" not in already_run + if filter(task_run) + for eval_config in self.eval_configs + for run_config in self.run_configs or [] + if task_run.id not in already_run[eval_config.id][run_config.id] ] async def run(self, concurrency: int = 25) -> AsyncGenerator[EvalProgress, None]: @@ -148,22 +215,37 @@ async def run_worker( async def run_job(self, job: EvalJob) -> bool: try: # Create the evaluator for this eval config/run config pair - evaluator = eval_adapter_from_type(self.eval_config.config_type)( - self.eval_config, job.task_run_config.run_config() + evaluator = eval_adapter_from_type(job.eval_config.config_type)( + job.eval_config, + job.task_run_config.run_config() if job.task_run_config else None, ) if not isinstance(evaluator, BaseEval): raise ValueError("Not able to create evaluator from eval config") - result_task_run, scores = await evaluator.run(job.item.input) + task_output: str | None = None + scores: EvalScores | None = None + if job.type == "eval_config_eval": + # Eval config eval, we use the saved input from the task run, not invoking the task again + scores = await evaluator.run_eval(job.item) + task_output = 
job.item.output.output + else: + # Task run eval, we invoke the task again to get a fresh output + result_task_run, scores = await evaluator.run_task_and_eval( + job.item.input + ) + task_output = result_task_run.output.output # Save the job result eval_run = EvalRun( - parent=self.eval_config, - task_run_config_id=job.task_run_config.id, + parent=job.eval_config, + task_run_config_id=job.task_run_config.id + if job.task_run_config + else None, dataset_id=job.item.id, + eval_config_eval=job.type == "eval_config_eval", scores=scores, input=job.item.input, - output=result_task_run.output.output, + output=task_output, ) eval_run.save_to_file() diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py index eaa34b67..4ee6a9a4 100644 --- a/libs/core/kiln_ai/adapters/eval/g_eval.py +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -4,9 +4,9 @@ from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.eval.base_eval import BaseEval from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput -from kiln_ai.adapters.prompt_builders import PromptGenerators, prompt_builder_from_id +from kiln_ai.adapters.prompt_builders import PromptGenerators from kiln_ai.datamodel import Project, Task, TaskRun -from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType, EvalScores +from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores from kiln_ai.datamodel.task import RunConfig from openai.types.chat import ChatCompletionTokenLogprob @@ -34,7 +34,7 @@ def __init__(self, eval_config: EvalConfig): tmp_project = Project(name="GEval") # Build a simple LLM as Judge system instruction - system_instruction = f"Your job to evaluate a model's performance on a task. Blocks will be marked with tags.\n" + system_instruction = "Your job to evaluate a model's performance on a task. Blocks will be marked with tags.\n" # Optionally add a short task description task_description = eval_config.properties.get("task_description", None) if task_description: @@ -75,7 +75,7 @@ class GEval(BaseEval): LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation. 
""" - def __init__(self, eval_config: EvalConfig, run_config: RunConfig): + def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None): if ( eval_config.config_type != EvalConfigType.g_eval and eval_config.config_type != EvalConfigType.llm_as_judge diff --git a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py index 8c333f22..16411ccd 100644 --- a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py +++ b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py @@ -12,7 +12,13 @@ TaskOutputRatingType, TaskRun, ) -from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore, EvalRun +from kiln_ai.datamodel.eval import ( + Eval, + EvalConfig, + EvalOutputScore, + EvalRun, + EvalScores, +) from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig @@ -98,8 +104,9 @@ def mock_eval_runner( mock_eval, data_source, mock_task, mock_eval_config, mock_run_config ): return EvalRunner( - eval_config=mock_eval_config, + eval_configs=[mock_eval_config], run_configs=[mock_run_config], + eval_run_type="task_run_eval", ) @@ -135,7 +142,12 @@ async def test_async_eval_runner_status_updates(mock_eval_runner, concurrency): def test_collect_tasks_filtering( - mock_eval_runner, mock_task, mock_eval_config, data_source + mock_eval, + mock_eval_runner, + mock_task, + mock_eval_config, + data_source, + mock_run_config, ): """Test that tasks are properly filtered based on eval filters""" tags = ["tag1", "tag2", "tag3"] @@ -154,21 +166,139 @@ def test_collect_tasks_filtering( task_run.save_to_file() task_runs.append(task_run) - # Set up filters to only match tag1 - mock_eval_runner.eval.eval_set_filter_id = "tag::tag1" - mock_eval_runner.eval.eval_configs_filter_id = "tag::tag2" + mock_eval.eval_set_filter_id = "tag::tag1" + mock_eval.eval_configs_filter_id = "tag::tag2" - jobs = mock_eval_runner.collect_tasks() + # Create a new runner of type task run eval + runner = EvalRunner( + eval_configs=[mock_eval_config], + run_configs=[mock_run_config], + eval_run_type="task_run_eval", + ) + jobs = runner.collect_tasks() + + # Should only get task_run1 jobs, the one with tag1 + assert len(jobs) == 1 + job = jobs[0] + # job should be the tag1 item, and setup as a task run eval for mock_run_config + assert job.item.tags == ["tag1"] + assert job.task_run_config.id == mock_run_config.id + assert job.eval_config.id == mock_eval_config.id + + # Change to an eval config set filter + runner = EvalRunner( + eval_configs=[mock_eval_config], + run_configs=None, + eval_run_type="eval_config_eval", + ) + jobs = runner.collect_tasks() + + # Should only get eval_config1 jobs + assert len(jobs) == 1 + job = jobs[0] + # job should be the tag2 item, and setup as a eval config eval for mock_eval_config + assert job.item.tags == ["tag2"] + assert job.eval_config.id == mock_eval_config.id + assert job.task_run_config is None + + # Add a second task run config, and call a new runner with multiple run configs + rc = TaskRunConfig( + name="test2", + description="test2", + run_config_properties=RunConfigProperties( + model_name="gpt-4", + model_provider_name="openai", + prompt_id="simple_prompt_builder", + ), + parent=mock_task, + ) + rc.save_to_file() + runner = EvalRunner( + eval_configs=[mock_eval_config], + run_configs=[mock_run_config, rc], + eval_run_type="task_run_eval", + ) + jobs = runner.collect_tasks() + assert len(jobs) == 2 + for job in jobs: + assert job.item.tags == ["tag1"] + assert job.task_run_config.id in [mock_run_config.id, rc.id] + assert 
job.eval_config.id == mock_eval_config.id + assert jobs[0].task_run_config.id != jobs[1].task_run_config.id - # Should only get task_run1 jobs + # add a second eval config, and call a new runner with multiple eval configs + eval_config = EvalConfig( + name="test2", + model=data_source, + parent=mock_eval, + properties={ + "eval_steps": ["step1", "step2", "step3"], + }, + ) + eval_config.save_to_file() + runner = EvalRunner( + eval_configs=[mock_eval_config, eval_config], + run_configs=None, + eval_run_type="eval_config_eval", + ) + jobs = runner.collect_tasks() + # Check we get 2 jobs, one for each eval config assert len(jobs) == 2 - ids = [job.item.id for job in jobs] - assert task_runs[0].id in ids - assert task_runs[1].id in ids - assert task_runs[2].id not in ids + for job in jobs: + assert job.item.tags == ["tag2"] + assert job.eval_config.id in [mock_eval_config.id, eval_config.id] + assert job.task_run_config is None + assert jobs[0].eval_config.id != jobs[1].eval_config.id + + +def test_validate_same_task( + mock_eval_runner, + mock_task, + data_source, + tmp_path, + mock_eval_config, + mock_run_config, +): + # second eval config has a different task + eval_config = EvalConfig( + name="test2", + model=data_source, + properties={ + "eval_steps": ["step1", "step2", "step3"], + }, + parent=Eval( + name="test", + description="test", + eval_set_filter_id="all", + eval_configs_filter_id="all", + output_scores=[ + EvalOutputScore( + name="Accuracy", + instruction="Check if the output is accurate", + type=TaskOutputRatingType.pass_fail, + ), + ], + parent=Task( + name="test", + description="test", + instruction="do the thing", + ), + ), + ) + + with pytest.raises( + ValueError, match="All eval configs must have the same parent eval" + ): + EvalRunner( + eval_configs=[mock_eval_config, eval_config], + run_configs=[mock_run_config], + eval_run_type="eval_config_eval", + ) -def test_collect_tasks_excludes_already_run(mock_eval_runner, mock_task, data_source): +def test_collect_tasks_excludes_already_run_task_run_eval( + mock_eval_runner, mock_task, data_source, mock_eval_config, mock_run_config +): """Test that already run tasks are excluded""" # Create a task run task_run = TaskRun( @@ -186,12 +316,14 @@ def test_collect_tasks_excludes_already_run(mock_eval_runner, mock_task, data_so jobs = mock_eval_runner.collect_tasks() assert len(jobs) == 1 assert jobs[0].item.id == task_run.id + assert jobs[0].task_run_config.id == mock_run_config.id + assert jobs[0].eval_config.id == mock_eval_config.id # Create an eval run for this task EvalRun( - parent=mock_eval_runner.eval_config, + parent=mock_eval_config, dataset_id=task_run.id, - task_run_config_id=mock_eval_runner.run_configs[0].id, + task_run_config_id=mock_run_config.id, input="test", output="test", scores={"accuracy": 1.0}, @@ -207,6 +339,57 @@ def test_collect_tasks_excludes_already_run(mock_eval_runner, mock_task, data_so assert len(jobs) == 0 +def test_collect_tasks_excludes_already_run_eval_config_eval( + mock_task, data_source, mock_eval_config, mock_eval, mock_run_config +): + """Test that already run tasks are excluded""" + # Create a task run + task_run = TaskRun( + parent=mock_task, + input="test", + input_source=data_source, + tags=["tag1"], + output=TaskOutput( + output="test", + ), + ) + task_run.save_to_file() + + mock_eval.eval_set_filter_id = "tag::nonexistent" + mock_eval.eval_configs_filter_id = "tag::tag1" + mock_eval.save_to_file() + + # Prior to any eval runs, we should get 1 job for the eval config + runner = EvalRunner( 
+ eval_configs=[mock_eval_config], + run_configs=None, + eval_run_type="eval_config_eval", + ) + jobs = runner.collect_tasks() + assert len(jobs) == 1 + assert jobs[0].item.id == task_run.id + assert jobs[0].eval_config.id == mock_eval_config.id + assert jobs[0].task_run_config is None + + # Create an eval run for this eval config task run pair, so now we should get no jobs (already run) + EvalRun( + parent=mock_eval_config, + dataset_id=task_run.id, + task_run_config_id=None, + eval_config_eval=True, + input="test", + output="test", + scores={ + "accuracy": 1.0, + }, + ).save_to_file() + + jobs = runner.collect_tasks() + + # Should get no jobs since the task was already run + assert len(jobs) == 0 + + def test_collect_tasks_multiple_run_configs( mock_eval_runner, mock_task, data_source, mock_run_config ): @@ -276,8 +459,8 @@ def test_collect_tasks_empty_cases(mock_eval_runner, mock_task, data_source): @pytest.mark.asyncio -async def test_run_job_success( - mock_eval_runner, mock_task, data_source, mock_run_config +async def test_run_job_success_task_run_eval( + mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config ): # Create a task run to evaluate task_run = TaskRun( @@ -289,7 +472,12 @@ async def test_run_job_success( task_run.save_to_file() # Create eval job - job = EvalJob(item=task_run, task_run_config=mock_run_config) + job = EvalJob( + item=task_run, + task_run_config=mock_run_config, + type="task_run_eval", + eval_config=mock_eval_config, + ) # Mock the evaluator mock_result_run = TaskRun( @@ -300,7 +488,7 @@ async def test_run_job_success( mock_scores = {"accuracy": 0.95} class MockEvaluator(BaseEval): - async def run(self, input_text): + async def run_task_and_eval(self, input_text): return mock_result_run, mock_scores with patch( @@ -312,7 +500,7 @@ async def run(self, input_text): assert success is True # Verify eval run was saved - eval_runs = mock_eval_runner.eval_config.runs() + eval_runs = mock_eval_config.runs() assert len(eval_runs) == 1 saved_run = eval_runs[0] assert saved_run.dataset_id == task_run.id @@ -320,11 +508,69 @@ async def run(self, input_text): assert saved_run.scores == mock_scores assert saved_run.input == "test input" assert saved_run.output == "evaluated output" + assert saved_run.parent_eval_config().id == mock_eval_config.id + assert saved_run.eval_config_eval is False + + +@pytest.mark.asyncio +async def test_run_job_success_eval_config_eval( + mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config +): + # Create a task run to evaluate + task_run = TaskRun( + parent=mock_task, + input="test input", + input_source=data_source, + output=TaskOutput(output="test output"), + ) + task_run.save_to_file() + + # Create eval job + job = EvalJob( + item=task_run, + type="eval_config_eval", + eval_config=mock_eval_config, + ) + + # Mock the evaluator + mock_result_run = TaskRun( + input="test input", + input_source=data_source, + output=TaskOutput(output="evaluated output"), + ) + mock_scores: EvalScores = {"accuracy": 0.95} + + class MockEvaluator(BaseEval): + async def run_task_and_eval(self, input_text): + raise ValueError("Attempted to run task and eval for a config eval") + + async def run_eval(self, task_run: TaskRun) -> EvalScores: + return mock_scores + + with patch( + "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type", + return_value=lambda *args: MockEvaluator(*args), + ): + success = await mock_eval_runner.run_job(job) + + assert success is True + + # Verify eval run was saved + eval_runs = 
mock_eval_config.runs() + assert len(eval_runs) == 1 + saved_run = eval_runs[0] + assert saved_run.dataset_id == task_run.id + assert saved_run.task_run_config_id is None + assert saved_run.scores == mock_scores + assert saved_run.input == "test input" + assert saved_run.output == "test output" + assert saved_run.parent_eval_config().id == mock_eval_config.id + assert saved_run.eval_config_eval is True @pytest.mark.asyncio async def test_run_job_invalid_evaluator( - mock_eval_runner, mock_task, data_source, mock_run_config + mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config ): task_run = TaskRun( parent=mock_task, @@ -333,7 +579,12 @@ async def test_run_job_invalid_evaluator( output=TaskOutput(output="test output"), ) task_run.save_to_file() - job = EvalJob(item=task_run, task_run_config=mock_run_config) + job = EvalJob( + item=task_run, + task_run_config=mock_run_config, + type="task_run_eval", + eval_config=mock_eval_config, + ) # Return an invalid evaluator type with patch( @@ -343,12 +594,12 @@ async def test_run_job_invalid_evaluator( success = await mock_eval_runner.run_job(job) assert success is False - assert len(mock_eval_runner.eval_config.runs()) == 0 + assert len(mock_eval_config.runs()) == 0 @pytest.mark.asyncio async def test_run_job_evaluator_error( - mock_eval_runner, mock_task, data_source, mock_run_config + mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config ): task_run = TaskRun( parent=mock_task, @@ -357,7 +608,12 @@ async def test_run_job_evaluator_error( output=TaskOutput(output="test output"), ) task_run.save_to_file() - job = EvalJob(item=task_run, task_run_config=mock_run_config) + job = EvalJob( + item=task_run, + task_run_config=mock_run_config, + type="task_run_eval", + eval_config=mock_eval_config, + ) class ErrorEvaluator(BaseEval): async def run(self, input_text): @@ -370,4 +626,4 @@ async def run(self, input_text): success = await mock_eval_runner.run_job(job) assert success is False - assert len(mock_eval_runner.eval_config.runs()) == 0 + assert len(mock_eval_config.runs()) == 0 diff --git a/libs/core/kiln_ai/datamodel/eval.py b/libs/core/kiln_ai/datamodel/eval.py index 84540324..3d691c8b 100644 --- a/libs/core/kiln_ai/datamodel/eval.py +++ b/libs/core/kiln_ai/datamodel/eval.py @@ -84,8 +84,15 @@ class EvalRun(KilnParentedModel): dataset_id: ID_TYPE = Field( description="The ID of the dataset item that was used for this run (we only use it's input). Must belong to the same Task as this eval." ) - task_run_config_id: ID_TYPE = Field( - description="The ID of the TaskRunConfig that was run. Must belong to the same Task as this eval." + # Eval runs can be one of 2 types: + # 1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We ran the task with the task_run_config, saved the output, then ran the evaluator on the output. task_run_config_id must be set. + # 2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We used the existing dataset item input/output, and ran the evaluator on it. task_run_config_id must be None. + task_run_config_id: ID_TYPE | None = Field( + description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config." + ) + eval_config_eval: bool = Field( + description="Whether this eval run to evaluate the parent eval config (evaluating the config using an existing dataset item). 
If true, task_run_config_id must be None, as we're not running the task.", + default=False, ) # This may duplicate the dataset_id.input, but we're denormalizing intentionally. input: str = Field( @@ -103,6 +110,18 @@ def parent_eval_config(self) -> Union["EvalConfig", None]: raise ValueError("parent must be an EvalConfig") return self.parent # type: ignore + @model_validator(mode="after") + def validate_eval_run_types(self) -> Self: + if self.eval_config_eval and self.task_run_config_id is not None: + raise ValueError( + "task_run_config_id must be None if eval_config_eval is true" + ) + if not self.eval_config_eval and self.task_run_config_id is None: + raise ValueError( + "task_run_config_id must be set if eval_config_eval is false" + ) + return self + @model_validator(mode="after") def validate_scores(self) -> Self: # We're checking the scores have the expected keys from the grand-parent eval diff --git a/libs/core/kiln_ai/datamodel/test_eval_model.py b/libs/core/kiln_ai/datamodel/test_eval_model.py index c75ac1a1..cff21cc2 100644 --- a/libs/core/kiln_ai/datamodel/test_eval_model.py +++ b/libs/core/kiln_ai/datamodel/test_eval_model.py @@ -605,3 +605,57 @@ def test_eval_run_custom_scores_not_allowed(valid_eval_config, valid_eval_run_da ) ], ) + + +def test_eval_run_eval_config_eval_validation(): + """Test that eval_config_eval and task_run_config_id validation works correctly""" + + # Case 1: Valid configuration - eval_config_eval=True and task_run_config_id=None + valid_run1 = EvalRun( + dataset_id="dataset123", + eval_config_eval=True, + task_run_config_id=None, + input="test input", + output="test output", + scores={"score": 1.0}, + ) + assert valid_run1.eval_config_eval is True + assert valid_run1.task_run_config_id is None + + # Case 2: Valid configuration - eval_config_eval=False and task_run_config_id is set + valid_run2 = EvalRun( + dataset_id="dataset123", + eval_config_eval=False, + task_run_config_id="config456", + input="test input", + output="test output", + scores={"score": 1.0}, + ) + assert valid_run2.eval_config_eval is False + assert valid_run2.task_run_config_id == "config456" + + # Case 3: Invalid configuration - eval_config_eval=True but task_run_config_id is set + with pytest.raises( + ValueError, match="task_run_config_id must be None if eval_config_eval is true" + ): + EvalRun( + dataset_id="dataset123", + eval_config_eval=True, + task_run_config_id="config456", + input="test input", + output="test output", + scores={"score": 1.0}, + ) + + # Case 4: Invalid configuration - eval_config_eval=False but task_run_config_id is None + with pytest.raises( + ValueError, match="task_run_config_id must be set if eval_config_eval is false" + ): + EvalRun( + dataset_id="dataset123", + eval_config_eval=False, + task_run_config_id=None, + input="test input", + output="test output", + scores={"score": 1.0}, + ) From f6dec21b234682cea371ffbcef5f276e06bd3919 Mon Sep 17 00:00:00 2001 From: scosman Date: Tue, 25 Feb 2025 21:14:10 -0500 Subject: [PATCH 061/102] Fix bug in how we collected runs --- libs/core/kiln_ai/adapters/eval/eval_runner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libs/core/kiln_ai/adapters/eval/eval_runner.py b/libs/core/kiln_ai/adapters/eval/eval_runner.py index 3b4f0a6f..11d8b9f1 100644 --- a/libs/core/kiln_ai/adapters/eval/eval_runner.py +++ b/libs/core/kiln_ai/adapters/eval/eval_runner.py @@ -133,8 +133,11 @@ def collect_tasks_for_task_run_eval(self) -> List[EvalJob]: already_run[eval_config.id] = {} for run_config in 
self.run_configs or []: already_run[eval_config.id][run_config.id] = set() - for run in eval_config.runs(readonly=True): - already_run[eval_config.id][run_config.id].add(run.dataset_id) + for run in eval_config.runs(readonly=True): + if run.task_run_config_id is not None: + already_run[eval_config.id][run.task_run_config_id].add( + run.dataset_id + ) return [ EvalJob( From ee1318ef29304bb2cb859b94865d15cb5d9f64b0 Mon Sep 17 00:00:00 2001 From: scosman Date: Tue, 25 Feb 2025 23:11:41 -0500 Subject: [PATCH 062/102] Fix 2 issues: - Test were failing on CI from how we checked provider. Just use name now. - Don't specify extra OR parameters, unless needed for logprobs --- app/desktop/studio_server/eval_api.py | 56 ++++++++++++------ app/desktop/studio_server/test_eval_api.py | 39 +++++++++++++ app/web_ui/src/lib/api_schema.d.ts | 58 ++++++++++++++++++- .../model_adapters/openai_model_adapter.py | 12 ++-- 4 files changed, 143 insertions(+), 22 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index 0b834d89..c0578197 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -72,6 +72,27 @@ def task_run_config_from_id( ) +# JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better +async def run_eval_runner_with_status(eval_runner: EvalRunner) -> StreamingResponse: + # Async messages via server side events (SSE) + async def event_generator(): + async for progress in eval_runner.run(): + data = { + "progress": progress.complete, + "total": progress.total, + "errors": progress.errors, + } + yield f"data: {json.dumps(data)}\n\n" + + # Send the final complete message the app expects, and uses to stop listening + yield "data: complete\n\n" + + return StreamingResponse( + content=event_generator(), + media_type="text/event-stream", + ) + + class CreateEvaluatorRequest(BaseModel): name: str description: str @@ -332,7 +353,6 @@ async def create_eval_config( eval_config.save_to_file() return eval_config - # JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better @app.get( "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run" ) @@ -367,24 +387,26 @@ async def run_eval_config( eval_run_type="task_run_eval", ) - # Async messages via server side events (SSE) - async def event_generator(): - async for progress in eval_runner.run(): - data = { - "progress": progress.complete, - "total": progress.total, - "errors": progress.errors, - } - yield f"data: {json.dumps(data)}\n\n" - - # Send the final complete message the app expects, and uses to stop listening - yield "data: complete\n\n" - - return StreamingResponse( - content=event_generator(), - media_type="text/event-stream", + return await run_eval_runner_with_status(eval_runner) + + @app.get( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval" + ) + async def run_eval_config_eval( + project_id: str, + task_id: str, + eval_id: str, + ) -> StreamingResponse: + eval = eval_from_id(project_id, task_id, eval_id) + eval_configs = eval.configs() + eval_runner = EvalRunner( + eval_configs=eval_configs, + run_configs=None, + eval_run_type="eval_config_eval", ) + return await run_eval_runner_with_status(eval_runner) + @app.get( "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_config/{run_config_id}/results" ) diff --git a/app/desktop/studio_server/test_eval_api.py 
b/app/desktop/studio_server/test_eval_api.py index 539c0c9e..d982cdf7 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -5,6 +5,7 @@ import pytest from fastapi import FastAPI, HTTPException +from fastapi.responses import StreamingResponse from fastapi.testclient import TestClient from kiln_ai.adapters.ml_model_list import ModelProviderName from kiln_ai.datamodel import ( @@ -961,3 +962,41 @@ class EvalCondigSummaryTestData: # Test case 5: Check skipping eval run lowers the percent complete assert eval_config_percent_complete["ec5"] == pytest.approx(0 / total_in_dataset) + + +@pytest.mark.asyncio +async def test_run_eval_config_eval( + client, mock_task_from_id, mock_task, mock_eval, mock_eval_config +): + mock_task_from_id.return_value = mock_task + + # Create a mock response for run_eval_runner_with_status + mock_response = StreamingResponse( + content=iter([b"data: test\n\n"]), media_type="text/event-stream" + ) + + with patch( + "app.desktop.studio_server.eval_api.run_eval_runner_with_status" + ) as mock_run_eval: + # Set up the mock to return our mock response + mock_run_eval.return_value = mock_response + + # Call the endpoint + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1/run_eval_config_eval" + ) + + # Verify the response + assert response.status_code == 200 + + # Verify run_eval_runner_with_status was called with correct parameters + mock_run_eval.assert_called_once() + + # Get the EvalRunner that was passed to run_eval_runner_with_status + eval_runner = mock_run_eval.call_args[0][0] + + # Verify the EvalRunner was configured correctly + assert len(eval_runner.eval_configs) == 1 + assert eval_runner.eval_configs[0].id == mock_eval_config.id + assert eval_runner.run_configs is None + assert eval_runner.eval_run_type == "eval_config_eval" diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index fb43195b..14c403fc 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -810,6 +810,23 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Run Eval Config Eval */ + get: operations["run_eval_config_eval_api_projects__project_id__tasks__task_id__eval__eval_id__run_eval_config_eval_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_config/{run_config_id}/results": { parameters: { query?: never; @@ -1443,9 +1460,15 @@ export interface components { dataset_id: string | null; /** * Task Run Config Id - * @description The ID of the TaskRunConfig that was run. Must belong to the same Task as this eval. + * @description The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config. */ task_run_config_id: string | null; + /** + * Eval Config Eval + * @description Whether this eval run to evaluate the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task. + * @default false + */ + eval_config_eval: boolean; /** * Input * @description The input to the task. 
JSON formatted for structured input, plaintext for unstructured input. @@ -4237,6 +4260,39 @@ export interface operations { }; }; }; + run_eval_config_eval_api_projects__project_id__tasks__task_id__eval__eval_id__run_eval_config_eval_get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + eval_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": unknown; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; get_eval_run_results_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_config__run_config_id__results_get: { parameters: { query?: never; diff --git a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py index 909146c9..06881fc4 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py @@ -100,7 +100,8 @@ async def _run(self, input: Dict | str) -> RunOutput: ] ) - # OpenRouter specific options for reasoning models + # OpenRouter specific options for reasoning models and logprobs. + # TODO: this isn't a good place for this and I should refactor. But big usability improvement so keeping it here for now. extra_body = {} require_or_reasoning = ( self.config.openrouter_style_reasoning and provider.reasoning_capable @@ -115,8 +116,11 @@ async def _run(self, input: Dict | str) -> RunOutput: # fp8 quants are awful "ignore": ["DeepInfra"], } - elif self.model_provider().name == ModelProviderName.openrouter: - # OpenRouter specific options. Bit of a hack but really does improve usability. + elif ( + self.run_config.model_provider_name == ModelProviderName.openrouter + and self.base_adapter_config.top_logprobs is not None + ): + # OpenRouter specific options related to logprobs. Bit of a hack but really does improve usability. extra_body["provider"] = { "require_parameters": True, "ignore": ["DeepInfra"], @@ -246,7 +250,7 @@ def tool_call_params(self) -> dict[str, Any]: "parameters": output_schema, } # This parameter is only reliable for OpenAI - if self.model_provider().name == ModelProviderName.openai: + if self.run_config.model_provider_name == ModelProviderName.openai: function_params["strict"] = True return { From 1133e1a1dabf73df69d38b05db9d22dad8d0b6ff Mon Sep 17 00:00:00 2001 From: scosman Date: Tue, 25 Feb 2025 23:44:08 -0500 Subject: [PATCH 063/102] Fully functionaly UI for finding the eval-config which works best for your score. Includes the ability to run the eval-config-eval. 
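
As a rough illustration of the stream contract used by these run endpoints (not code from this patch): run_eval_runner_with_status emits server-sent events of the form data: {"progress": ..., "total": ..., "errors": ...} followed by a final data: complete sentinel, and the web UI reads them with EventSource. A minimal Python client sketch is below; the requests dependency and the follow_eval_progress name are assumptions for illustration only.

    import json
    import requests  # assumed dependency; the app itself uses EventSource in the browser

    def follow_eval_progress(url: str) -> None:
        # Stream the SSE messages emitted by the run / run_eval_config_eval endpoints.
        with requests.get(url, stream=True) as response:
            for line in response.iter_lines(decode_unicode=True):
                if not line or not line.startswith("data: "):
                    continue  # skip blank separators between events
                payload = line[len("data: "):]
                if payload == "complete":
                    break  # final sentinel the client waits for before closing
                progress = json.loads(payload)
                print(f"{progress['progress']}/{progress['total']} complete, {progress['errors']} errors")
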
--- app/web_ui/src/lib/utils/formatters.ts | 13 + .../[task_id]/[eval_id]/+page.svelte | 185 +--------- .../[eval_id]/eval_configs/+page.svelte | 347 +++++++----------- .../eval_config_instruction.svelte | 38 ++ .../[task_id]/[eval_id]/run_eval.svelte | 183 +++++++++ 5 files changed, 385 insertions(+), 381 deletions(-) create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/eval_config_instruction.svelte create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/run_eval.svelte diff --git a/app/web_ui/src/lib/utils/formatters.ts b/app/web_ui/src/lib/utils/formatters.ts index 46a977fd..d1563893 100644 --- a/app/web_ui/src/lib/utils/formatters.ts +++ b/app/web_ui/src/lib/utils/formatters.ts @@ -1,3 +1,5 @@ +import { type EvalConfigType } from "$lib/types" + export function formatDate(dateString: string | undefined): string { if (!dateString) { return "Unknown" @@ -40,3 +42,14 @@ export function formatDate(dateString: string | undefined): string { .replace(" PM", "pm") .replace(",", "") } + +export function eval_config_to_ui_name( + eval_config_type: EvalConfigType, +): string { + return ( + { + g_eval: "G-Eval", + llm_as_judge: "LLM as Judge", + }[eval_config_type] || eval_config_type + ) +} diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index b2696a8d..2a1c5aaf 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -8,7 +8,6 @@ import FormElement from "$lib/utils/form_element.svelte" import type { EvalConfig, - EvalConfigType, ProviderModels, TaskRunConfig, EvalResultSummary, @@ -29,6 +28,8 @@ import Warning from "$lib/ui/warning.svelte" import { string_to_json_key } from "$lib/utils/json_schema_editor/json_schema_templates" import InfoTooltip from "$lib/ui/info_tooltip.svelte" + import RunEval from "./run_eval.svelte" + import { eval_config_to_ui_name } from "$lib/utils/formatters" $: project_id = $page.params.project_id $: task_id = $page.params.task_id @@ -218,15 +219,6 @@ value: string } - function eval_config_to_ui_name(eval_config_type: EvalConfigType): string { - return ( - { - g_eval: "G-Eval", - llm_as_judge: "LLM as Judge", - }[eval_config_type] || eval_config_type - ) - } - // A name for the eval config that is human readable and helpful // Combine's it's memorable name with it's properties function get_eval_config_name( @@ -349,72 +341,12 @@ return results } - let run_dialog: Dialog | null = null - let running_progress_dialog: Dialog | null = null - - let eval_run_error: KilnError | null = null let eval_state: | "not_started" | "running" | "complete" | "complete_with_errors" = "not_started" - let eval_complete_count = 0 - let eval_total_count = 0 - let eval_error_count = 0 - - function run_eval(): boolean { - if (!current_eval_config_id) { - eval_run_error = new KilnError("No eval config selected", null) - eval_state = "complete_with_errors" - // True to close the run dialog, and then show the error in the progress dialog - running_progress_dialog?.show() - return true - } - - score_summary = null - eval_state = "running" - eval_complete_count = 0 - eval_total_count = 0 - eval_error_count = 0 - - const eventSource = new EventSource( - 
`${base_url}/api/projects/${project_id}/tasks/${task_id}/eval/${eval_id}/eval_config/${current_eval_config_id}/run?all_run_configs=true`, - ) - - eventSource.onmessage = (event) => { - try { - if (event.data === "complete") { - // Special end message - eventSource.close() - eval_state = - eval_error_count > 0 ? "complete_with_errors" : "complete" - get_score_summary() - } else { - const data = JSON.parse(event.data) - eval_complete_count = data.progress - eval_total_count = data.total - eval_error_count = data.errors - eval_state = "running" - } - } catch (error) { - eval_run_error = createKilnError(error) - eval_state = "complete_with_errors" - get_score_summary() - } - } - - // Don't restart on an error (default SSE behavior) - eventSource.onerror = (error) => { - eventSource.close() - eval_state = "complete_with_errors" - eval_run_error = createKilnError(error) - get_score_summary() - } - - // Switch over to the progress dialog, closing the run dialog - running_progress_dialog?.show() - return true - } + $: run_eval_url = `${base_url}/api/projects/${project_id}/tasks/${task_id}/eval/${eval_id}/eval_config/${current_eval_config_id}/run?all_run_configs=true` let task_run_config_model_name = "" let task_run_config_provider_name = "" @@ -561,8 +493,7 @@
Results Summary
- Overview of how various task run configs perform on the selected - evaluator{current_eval_config + How various task run configs perform on the selected evaluator{current_eval_config ? ` (${current_eval_config.name})` : ""}.
@@ -581,31 +512,15 @@ add_task_config_dialog?.show() }}>Add Run Config - - {:else} - {/if} + { + console.log("run complete") + get_score_summary() + }} + />
@@ -787,79 +702,3 @@ {/if} - - -
- {#if eval_state === "complete"} -
Eval Complete 🎉
- {#if eval_total_count == 0} -
- No evals were run, because everything was already up to date! -
- {/if} - {:else if eval_state === "complete_with_errors"} -
Eval Complete with Errors
- {:else if eval_state === "running"} -
-
Running...
- {/if} -
- {#if eval_total_count > 0} -
- {eval_complete_count + eval_error_count} of {eval_total_count} -
- {/if} - {#if eval_error_count > 0} -
- {eval_error_count} error{eval_error_count === 1 ? "" : "s"} -
- {/if} - {#if eval_run_error} -
- {eval_run_error.getMessage() || "An unknown error occurred"} -
- {/if} -
-
-
- - -
-
Run this eval with the selected configuration?
-
Don't close this page if you want to monitor progress.
- -
-
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte index 8415f289..8af9a2f2 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte @@ -5,32 +5,32 @@ import { KilnError, createKilnError } from "$lib/utils/error_handlers" import { onMount, tick } from "svelte" import { page } from "$app/stores" - import FormElement from "$lib/utils/form_element.svelte" - import type { - EvalConfig, - EvalConfigType, - ProviderModels, - EvalConfigCompareSummary, - } from "$lib/types" - import { goto } from "$app/navigation" + import RunEval from "./../run_eval.svelte" + import type { EvalConfig, EvalConfigCompareSummary } from "$lib/types" import { model_info, load_model_info, model_name, provider_name_from_id, - prompt_name_from_id, load_available_prompts, load_available_models, } from "$lib/stores" - import Dialog from "$lib/ui/dialog.svelte" import Warning from "$lib/ui/warning.svelte" - import { string_to_json_key } from "$lib/utils/json_schema_editor/json_schema_templates" import InfoTooltip from "$lib/ui/info_tooltip.svelte" + import { string_to_json_key } from "$lib/utils/json_schema_editor/json_schema_templates" + import EvalConfigInstruction from "./eval_config_instruction.svelte" + import Dialog from "$lib/ui/dialog.svelte" + import { eval_config_to_ui_name } from "$lib/utils/formatters" + + let score_legend_dialog: Dialog | null = null let evaluator: Eval | null = null let eval_error: KilnError | null = null let eval_loading = true + let eval_config_instructions_dialog: Dialog | null = null + let displayed_eval_config: EvalConfig | null = null + let eval_configs: EvalConfig[] | null = null let eval_configs_error: KilnError | null = null let eval_configs_loading = true @@ -41,6 +41,7 @@ $: loading = eval_loading || eval_configs_loading || score_summary_loading $: error = eval_error || eval_configs_error || score_summary_error + $: run_eval_url = `${base_url}/api/projects/${$page.params.project_id}/tasks/${$page.params.task_id}/eval/${$page.params.eval_id}/run_eval_config_eval` onMount(async () => { // Wait for page params to load @@ -169,84 +170,51 @@ return properties } - let run_dialog: Dialog | null = null - let running_progress_dialog: Dialog | null = null - - let eval_run_error: KilnError | null = null - let eval_state: - | "not_started" - | "running" - | "complete" - | "complete_with_errors" = "not_started" - let eval_complete_count = 0 - let eval_total_count = 0 - let eval_error_count = 0 - - function run_eval(): boolean { - score_summary = null - eval_state = "running" - eval_complete_count = 0 - eval_total_count = 0 - eval_error_count = 0 - - const eventSource = new EventSource( - `${base_url}/api/projects/${project_id}/tasks/${task_id}/eval/${eval_id}/eval_config/${current_eval_config_id}/run?all_run_configs=true`, - ) - - eventSource.onmessage = (event) => { - try { - if (event.data === "complete") { - // Special end message - eventSource.close() - eval_state = - eval_error_count > 0 ? 
"complete_with_errors" : "complete" - get_score_summary() - } else { - const data = JSON.parse(event.data) - eval_complete_count = data.progress - eval_total_count = data.total - eval_error_count = data.errors - eval_state = "running" - } - } catch (error) { - eval_run_error = createKilnError(error) - eval_state = "complete_with_errors" - get_score_summary() - } + function incomplete_warning( + score_summary: EvalConfigCompareSummary | null, + ): string[] { + if (!score_summary) { + return [] } - // Don't restart on an error (default SSE behavior) - eventSource.onerror = (error) => { - eventSource.close() - eval_state = "complete_with_errors" - eval_run_error = createKilnError(error) - get_score_summary() + const warnings: string[] = [] + if (score_summary.dataset_size === 0) { + warnings.push( + "No items in your eval-config dataset. Generate some runs in your dataset tab, and tag them to add them to your eval-config dataset.", + ) } - - // Switch over to the progress dialog, closing the run dialog - running_progress_dialog?.show() - return true - } - - // TODO P0: adapt this from other screen, to this screen. warning if len(results) == 0, no items in dataset (dataset_size == 0), and other "go fix your dataset" warnings - function show_incomplete_warning( - score_summary: EvalResultSummary | null, - ): boolean { - if (!score_summary?.run_config_percent_complete) { - return false + if (score_summary.not_rated_count > 0) { + warnings.push( + `${score_summary.not_rated_count} item(s) in your eval-config dataset are not rated at all. Add human ratings to these items in the dataset tab.`, + ) + } + if (score_summary.partially_rated_count > 0) { + warnings.push( + `${score_summary.partially_rated_count} item(s) in your eval-config dataset are only partially rated. Add human ratings to these items in the dataset tab for each score.`, + ) } - return false - const values = Object.values(score_summary.run_config_percent_complete) + const completion_values = Object.values( + score_summary.eval_config_percent_complete, + ) const minComplete = - values.length > 0 - ? values.reduce((min, val) => Math.min(min, val), 1.0) + completion_values.length > 0 + ? completion_values.reduce((min, val) => Math.min(min, val), 1.0) : 1.0 - return minComplete < 1.0 + if (minComplete < 1.0) { + warnings.push( + "You evals are incomplete. Click 'Run Evals' to generate scores for the missing items.", + ) + } + + return warnings } - + {#if loading}
@@ -282,8 +250,8 @@
Correlation to Human Scores
- Overview of how each eval config correlates to human scores - (ratings from the dataset tab). + How each eval config correlates to human scores (ratings from the + dataset tab).
{#if score_summary_error}
@@ -293,48 +261,35 @@ {/if}
- {#if eval_state === "not_started"} - - {:else} - - {/if} + + { + get_score_summary() + }} + />
- - {#if show_incomplete_warning(score_summary)} + {#if incomplete_warning(score_summary).length}
- + +
    + {#each incomplete_warning(score_summary) as warning} +
  • {warning}
  • + {/each} +
{/if} @@ -389,6 +344,9 @@
{eval_config.name}
+
+ {eval_config_to_ui_name(eval_config.config_type)} +
{model_name( eval_config?.model.properties?.["model_name"], @@ -397,8 +355,7 @@
{provider_name_from_id( - eval_config?.model.properties?.["model_provider_name"] + - "", + eval_config?.model.properties?.["model_provider"] + "", )}
{#if percent_complete} @@ -407,43 +364,54 @@ ? 'text-error' : 'text-gray-500'}" > - Eval {(percent_complete * 100.0).toFixed(1)}% complete + {(percent_complete * 100.0).toFixed(1)}% complete
{:else if score_summary} -
Eval 0% complete
+
0% complete
{/if}
-
- {#if eval_config.properties?.["task_description"]} -
-
Task Description:
- {eval_config.properties["task_description"]} -
- {/if} - {#if eval_config.properties?.["eval_steps"] && Array.isArray(eval_config.properties["eval_steps"])} -
-
- Evaluator Instructions: +
+
+ +
+
+
+
-
    - {#each eval_config.properties["eval_steps"] as step} -
  1. - - {step} - -
  2. - {/each} -
- {/if} +
- {score != null ? score.toFixed(2) : "unknown"} + {@const scores = + score_summary?.results?.["" + eval_config.id]?.[ + string_to_json_key(output_score.name) + ]} + + {#if scores} +
+ MAE: {scores.mean_absolute_error.toFixed(2)} +
+
+ MSE: {scores.mean_squared_error.toFixed(2)} +
+ {:else} + unknown + {/if}
From a493ccdaa6d847601df449cd4cb0f6c3c845c15e Mon Sep 17 00:00:00 2001 From: scosman Date: Wed, 26 Feb 2025 09:17:13 -0500 Subject: [PATCH 065/102] Add 2 new scores: normalized MSE and MAE --- app/desktop/studio_server/eval_api.py | 30 +++++++++++++++- app/desktop/studio_server/test_eval_api.py | 10 ++++++ libs/core/kiln_ai/datamodel/task_output.py | 22 ++++++++++++ libs/core/kiln_ai/datamodel/test_task.py | 41 ++++++++++++++++++++++ 4 files changed, 102 insertions(+), 1 deletion(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index e8423aa7..5dde89ae 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -27,6 +27,7 @@ from kiln_ai.datamodel.json_schema import string_to_json_key from kiln_ai.datamodel.prompt_id import is_frozen_prompt from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig +from kiln_ai.datamodel.task_output import normalize_rating from kiln_ai.utils.name_generator import generate_memorable_name from kiln_server.task_api import task_from_id from pydantic import BaseModel @@ -144,7 +145,9 @@ class EvalResultSummary(BaseModel): class EvalConfigScoreSummary(BaseModel): mean_absolute_error: float + mean_normalized_absolute_error: float mean_squared_error: float + mean_normalized_squared_error: float class EvalConfigCompareSummary(BaseModel): @@ -588,7 +591,9 @@ async def get_eval_configs_score_summary( # eval_config_id -> output_score_id -> scores/total total_squared_error: Dict[str, Dict[str, float]] = {} + total_normalized_squared_error: Dict[str, Dict[str, float]] = {} total_absolute_error: Dict[str, Dict[str, float]] = {} + total_normalized_absolute_error: Dict[str, Dict[str, float]] = {} total_count: Dict[str, Dict[str, int]] = {} # important: readonly makes this much faster @@ -630,18 +635,33 @@ async def get_eval_configs_score_summary( total_squared_error[eval_config_id] = {} total_absolute_error[eval_config_id] = {} total_count[eval_config_id] = {} + total_normalized_squared_error[eval_config_id] = {} + total_normalized_absolute_error[eval_config_id] = {} if score_key not in total_squared_error[eval_config_id]: total_squared_error[eval_config_id][score_key] = 0 total_absolute_error[eval_config_id][score_key] = 0 total_count[eval_config_id][score_key] = 0 + total_normalized_squared_error[eval_config_id][score_key] = 0 + total_normalized_absolute_error[eval_config_id][score_key] = 0 - # TODO normalize MSE? 
+ normalized_eval_score = normalize_rating( + eval_score, output_score.type + ) + normalized_human_score = normalize_rating( + human_score, output_score.type + ) total_squared_error[eval_config_id][score_key] += ( eval_score - human_score ) ** 2 + total_normalized_squared_error[eval_config_id][score_key] += ( + normalized_eval_score - normalized_human_score + ) ** 2 total_absolute_error[eval_config_id][score_key] += abs( eval_score - human_score ) + total_normalized_absolute_error[eval_config_id][score_key] += abs( + normalized_eval_score - normalized_human_score + ) total_count[eval_config_id][score_key] += 1 # Convert to score summaries @@ -658,6 +678,14 @@ async def get_eval_configs_score_summary( mean_absolute_error=( total_absolute_error[eval_config_id][score_key] / count ), + mean_normalized_squared_error=( + total_normalized_squared_error[eval_config_id][score_key] + / count + ), + mean_normalized_absolute_error=( + total_normalized_absolute_error[eval_config_id][score_key] + / count + ), ) # Calculate the percent of the dataset that has been processed diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index 88ceca2d..29d174db 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -923,10 +923,14 @@ class EvalCondigSummaryTestData: "overall_rating": { "mean_squared_error": 16.0, # error 4.0^2 "mean_absolute_error": 4.0, # error 4.0 + "mean_normalized_squared_error": 1, # max error: 1 v 5 + "mean_normalized_absolute_error": 1, # max error: 1 v 5 }, "score1": { "mean_squared_error": 2.25, # error (3.5-5.0)^2 "mean_absolute_error": 1.5, # error 1.5 + "mean_normalized_squared_error": 0.140625, # hand calc + "mean_normalized_absolute_error": 0.375, # 1.5/4 }, } # 1 of total_in_dataset eval configs are are in ec1 test @@ -937,10 +941,14 @@ class EvalCondigSummaryTestData: "overall_rating": { "mean_squared_error": 2.5, # error (1^2 + 2^2) / 2 "mean_absolute_error": 1.5, # (1+2)/2 + "mean_normalized_squared_error": 0.15625, # (0.25^2 + 0.5^2) / 2 + "mean_normalized_absolute_error": 0.375, # (0.25 + 0.5) / 2 }, "score1": { "mean_squared_error": 2.5, # (1^2+2^2)/2 "mean_absolute_error": 1.5, # (1+2)/2 + "mean_normalized_squared_error": 0.15625, # (0.25^2 + 0.5^2) / 2 + "mean_normalized_absolute_error": 0.375, # (0.25 + 0.5) / 2 }, } # 2 of total_in_dataset eval configs are are in ec2 test @@ -951,6 +959,8 @@ class EvalCondigSummaryTestData: "overall_rating": { "mean_squared_error": 4, "mean_absolute_error": 2, + "mean_normalized_squared_error": 0.25, + "mean_normalized_absolute_error": 0.5, }, } # 2 of total_in_dataset eval configs are are in ec2 test diff --git a/libs/core/kiln_ai/datamodel/task_output.py b/libs/core/kiln_ai/datamodel/task_output.py index 96463432..475bb547 100644 --- a/libs/core/kiln_ai/datamodel/task_output.py +++ b/libs/core/kiln_ai/datamodel/task_output.py @@ -11,6 +11,7 @@ from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType from kiln_ai.datamodel.json_schema import validate_schema from kiln_ai.datamodel.strict_mode import strict_mode +from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error if TYPE_CHECKING: from kiln_ai.datamodel.task import Task @@ -25,6 +26,27 @@ class RequirementRating(BaseModel): type: TaskOutputRatingType = Field(description="The type of rating") +def normalize_rating(rating: float, rating_type: TaskOutputRatingType) -> float: + """Normalize a rating to a 0-1 scale. 
Simple normalization, not z-score.""" + match rating_type: + case TaskOutputRatingType.five_star: + if rating < 1 or rating > 5: + raise ValueError("Five star rating must be between 1 and 5") + return (rating - 1) / 4 + case TaskOutputRatingType.pass_fail: + if rating < 0 or rating > 1: + raise ValueError("Pass fail rating must 0 to 1") + return rating + case TaskOutputRatingType.pass_fail_critical: + if rating < -1 or rating > 1: + raise ValueError("Pass fail critical rating must -1 to 1") + return (rating + 1) / 2 # -1 to 1 + case TaskOutputRatingType.custom: + raise ValueError("Custom rating type can not be normalized") + case _: + raise_exhaustive_enum_error(rating_type) + + class TaskOutputRating(KilnBaseModel): """ A rating for a task output, including an overall rating and ratings for each requirement. diff --git a/libs/core/kiln_ai/datamodel/test_task.py b/libs/core/kiln_ai/datamodel/test_task.py index b60bd51e..cf109a5c 100644 --- a/libs/core/kiln_ai/datamodel/test_task.py +++ b/libs/core/kiln_ai/datamodel/test_task.py @@ -1,8 +1,10 @@ import pytest from pydantic import ValidationError +from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType from kiln_ai.datamodel.prompt_id import PromptGenerators from kiln_ai.datamodel.task import RunConfig, RunConfigProperties, Task, TaskRunConfig +from kiln_ai.datamodel.task_output import normalize_rating def test_runconfig_valid_creation(): @@ -116,3 +118,42 @@ def test_task_run_config_missing_task_in_run_config(sample_task): model_provider_name="openai", task=None, # type: ignore ) + + +@pytest.mark.parametrize( + "rating_type,rating,expected", + [ + (TaskOutputRatingType.five_star, 1, 0), + (TaskOutputRatingType.five_star, 2, 0.25), + (TaskOutputRatingType.five_star, 3, 0.5), + (TaskOutputRatingType.five_star, 4, 0.75), + (TaskOutputRatingType.five_star, 5, 1), + (TaskOutputRatingType.pass_fail, 0, 0), + (TaskOutputRatingType.pass_fail, 1, 1), + (TaskOutputRatingType.pass_fail, 0.5, 0.5), + (TaskOutputRatingType.pass_fail_critical, -1, 0), + (TaskOutputRatingType.pass_fail_critical, 0, 0.5), + (TaskOutputRatingType.pass_fail_critical, 1, 1), + (TaskOutputRatingType.pass_fail_critical, 0.5, 0.75), + ], +) +def test_normalize_rating(rating_type, rating, expected): + assert normalize_rating(rating, rating_type) == expected + + +@pytest.mark.parametrize( + "rating_type,rating", + [ + (TaskOutputRatingType.five_star, 0), + (TaskOutputRatingType.five_star, 6), + (TaskOutputRatingType.pass_fail, -0.5), + (TaskOutputRatingType.pass_fail, 1.5), + (TaskOutputRatingType.pass_fail_critical, -1.5), + (TaskOutputRatingType.pass_fail_critical, 1.5), + (TaskOutputRatingType.custom, 0), + (TaskOutputRatingType.custom, 99), + ], +) +def test_normalize_rating_errors(rating_type, rating): + with pytest.raises(ValueError): + normalize_rating(rating, rating_type) From 43eb784486db23067aecbccccc8ae865ec491705 Mon Sep 17 00:00:00 2001 From: scosman Date: Wed, 26 Feb 2025 10:06:57 -0500 Subject: [PATCH 066/102] Improved UI for config eval comparisons --- app/web_ui/src/lib/api_schema.d.ts | 4 + app/web_ui/src/lib/types.ts | 1 + .../[eval_id]/eval_configs/+page.svelte | 153 ++++++++++++------ .../[task_id]/[eval_id]/run_eval.svelte | 8 +- 4 files changed, 109 insertions(+), 57 deletions(-) diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index a969bf12..b00c118e 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -1406,8 +1406,12 @@ export interface components { 
EvalConfigScoreSummary: { /** Mean Absolute Error */ mean_absolute_error: number; + /** Mean Normalized Absolute Error */ + mean_normalized_absolute_error: number; /** Mean Squared Error */ mean_squared_error: number; + /** Mean Normalized Squared Error */ + mean_normalized_squared_error: number; }; /** * EvalConfigType diff --git a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts index e191de7e..1e65d654 100644 --- a/app/web_ui/src/lib/types.ts +++ b/app/web_ui/src/lib/types.ts @@ -6,6 +6,7 @@ export type Task = components["schemas"]["Task"] export type TaskRun = components["schemas"]["TaskRun-Input"] export type TaskRequirement = components["schemas"]["TaskRequirement"] export type TaskOutputRating = components["schemas"]["TaskOutputRating-Output"] +export type TaskOutputRatingType = components["schemas"]["TaskOutputRatingType"] export type RequirementRating = components["schemas"]["RequirementRating"] export type RatingType = components["schemas"]["TaskOutputRatingType"] export type AvailableModels = components["schemas"]["AvailableModels"] diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte index c0c182ad..a012e4c5 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte @@ -7,6 +7,7 @@ import { page } from "$app/stores" import RunEval from "./../run_eval.svelte" import type { EvalConfig, EvalConfigCompareSummary } from "$lib/types" + import FormElement from "$lib/utils/form_element.svelte" import { model_info, load_model_info, @@ -21,6 +22,7 @@ import EvalConfigInstruction from "./eval_config_instruction.svelte" import Dialog from "$lib/ui/dialog.svelte" import { eval_config_to_ui_name } from "$lib/utils/formatters" + import type { TaskOutputRatingType } from "$lib/types" let score_legend_dialog: Dialog | null = null @@ -39,6 +41,8 @@ let score_summary_error: KilnError | null = null let score_summary_loading = false + let score_type: "mse" | "mae" | "norm_mse" | "norm_mae" = "norm_mse" + $: loading = eval_loading || eval_configs_loading || score_summary_loading $: error = eval_error || eval_configs_error || score_summary_error $: run_eval_url = `${base_url}/api/projects/${$page.params.project_id}/tasks/${$page.params.task_id}/eval/${$page.params.eval_id}/run_eval_config_eval` @@ -245,6 +249,31 @@ eval_error = createKilnError(error) } } + + function info_tooltip_text( + rating_type: TaskOutputRatingType, + score_type: "mse" | "mae" | "norm_mse" | "norm_mae", + ) { + let label = "" + if (score_type === "mae") { + label = "Mean absolute error" + } else if (score_type === "mse") { + label = "Mean squared error" + } else if (score_type === "norm_mse") { + label = "Normalized mean squared error" + } else if (score_type === "norm_mae") { + label = "Normalized mean absolute error" + } + label += " for " + if (rating_type === "five_star") { + label += "1 to 5 star rating." + } else if (rating_type === "pass_fail") { + label += "pass/fail rating." + } else if (rating_type === "pass_fail_critical") { + label += "pass/fail/critical rating." + } + return label + } Correlation to Human Ratings
How each eval config correlates to human ratings. +
{#if score_summary_error}
@@ -308,21 +345,29 @@
{/if} -
- - { - get_score_summary() - }} +
+ +
+ { + get_score_summary() + }} + /> +
@@ -334,7 +379,7 @@ warning_message={`There are issues you should resolve before analyzing this data.`} tight={true} /> -
    +
      {#each incomplete_warning(score_summary) as warning}
    • {warning}
    • {/each} @@ -354,30 +399,14 @@ {#each evaluator.output_scores as output_score}
{output_score.name} -
- {#if output_score.type === "five_star"} - 1 to 5 - - - - {:else if output_score.type === "pass_fail"} - pass/fail - - - - {:else if output_score.type === "pass_fail_critical"} - pass/fail/critical - - - - {:else} - {output_score.type} - {/if} -
+ + +
{#if scores} -
- MAE: {scores.mean_absolute_error.toFixed(2)} -
-
- MSE: {scores.mean_squared_error.toFixed(2)} -
+ {#if score_type === "mae"} + {scores.mean_absolute_error.toFixed(2)} + {:else if score_type === "mse"} + {scores.mean_squared_error.toFixed(2)} + {:else if score_type === "norm_mse"} + {scores.mean_normalized_squared_error.toFixed(3)} + {:else if score_type === "norm_mae"} + {scores.mean_normalized_absolute_error.toFixed(3)} + {/if} {:else} unknown {/if} @@ -510,18 +542,35 @@ }, ]} > -
MAE: Mean Absolute Error
+
+ Each score is a correlation score between the evaluator's score and the + human score added through the dataset tab. +
+
Mean Absolute Error
Lower is better
-
- Example: If the eval scores an item a 3, and the eval scores it a 5, the +
+ Example: If a human scores an item a 3, and the eval scores it a 5, the absolute error would be 2 [abs(3-5)]. The overall score is the mean of all absolute errors.
-
MSE: Mean squared error
+
Normalized Mean Absolute Error
Lower is better
-
- Example: If the eval scores an item a 3, and the eval scores it a 5, the +
+ Like mean absolute error, but scores are normalized to the range 0-1. For + example, for a 1-5 star rating, 1-star is score 0 and 5-star is score 1. +
+
Mean Squared Error
+
Lower is better
+
+ Example: If a human scores an item a 3, and the eval scores it a 5, the squared error would be 4 [(3-5)^2]. The overall score is the mean of all - squared errors. + squared errors. This improves on absolute error as it penalizes larger + errors more.
+
Normalized Mean Squared Error
+
Lower is better
+
+ Like mean squared error, but scores are normalized to the range 0-1. For + example, for a 1-5 star rating, 1-star is score 0 and 5-star is score 1.
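
For illustration, the four legend metrics above can be computed from paired (human, eval) five-star ratings as sketched below. The helper names normalize_five_star and legend_metrics are illustrative only; the formulas mirror the accumulation added to eval_api.py in this series, and the normalization matches the five_star case of normalize_rating (1 star maps to 0.0, 5 stars to 1.0).

    def normalize_five_star(rating: float) -> float:
        # 1 star -> 0.0, 5 stars -> 1.0 (five_star case of normalize_rating)
        return (rating - 1) / 4

    def legend_metrics(pairs: list[tuple[float, float]]) -> dict[str, float]:
        # pairs are (human_score, eval_score) on the 1-5 star scale
        n = len(pairs)
        norm = [(normalize_five_star(h), normalize_five_star(e)) for h, e in pairs]
        return {
            "mean_absolute_error": sum(abs(h - e) for h, e in pairs) / n,
            "mean_squared_error": sum((h - e) ** 2 for h, e in pairs) / n,
            "mean_normalized_absolute_error": sum(abs(h - e) for h, e in norm) / n,
            "mean_normalized_squared_error": sum((h - e) ** 2 for h, e in norm) / n,
        }

    # A human 3 vs an eval 5 gives MAE 2.0, MSE 4.0, normalized MAE 0.5, normalized MSE 0.25,
    # matching the expected values in test_eval_api.py.
    print(legend_metrics([(3.0, 5.0)]))
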
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/run_eval.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/run_eval.svelte index d0f9918c..05dd81e8 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/run_eval.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/run_eval.svelte @@ -3,6 +3,7 @@ import Dialog from "$lib/ui/dialog.svelte" import Warning from "$lib/ui/warning.svelte" + export let btn_size: "normal" | "mid" = "mid" export let on_run_complete: () => void = () => {} export let run_url: string export let eval_state: @@ -36,10 +37,7 @@ eval_total_count = 0 eval_error_count = 0 - const eventSource = new EventSource( - //`${base_url}/api/projects/${project_id}/tasks/${task_id}/eval/${eval_id}/eval_config/${eval_config_id}/run?all_run_configs=true`, - run_url, - ) + const eventSource = new EventSource(run_url) eventSource.onmessage = (event) => { try { @@ -81,7 +79,7 @@ {#if eval_state === "not_started"} Date: Wed, 26 Feb 2025 12:27:41 -0500 Subject: [PATCH 067/102] More improve copy/UI. --- .../evals/[project_id]/[task_id]/+page.svelte | 4 +- .../[task_id]/[eval_id]/+page.svelte | 86 +++++++++---------- .../[run_config_id]/run_result/+page.svelte | 33 ++++--- .../[eval_id]/create_eval_config/+page.svelte | 10 ++- .../[eval_id]/eval_configs/+page.svelte | 34 ++++---- .../eval_config_instruction.svelte | 10 +-- .../output_type_table_preview.svelte | 29 +++++++ .../[task_id]/create_evaluator/+page.svelte | 28 +++--- 8 files changed, 135 insertions(+), 99 deletions(-) create mode 100644 app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/output_type_table_preview.svelte diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte index fc3836e1..83654fcf 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte @@ -52,14 +52,14 @@ {/each}
+ {#if score_summary && score_summary.dataset_size > 0 && score_summary.dataset_size < 25} +
+ +
+ {/if}
-
Run Config
+
Run Method
How task output is generated
{output_score.name} -
- {#if output_score.type === "five_star"} - 1 to 5 - - - - {:else if output_score.type === "pass_fail"} - pass/fail - - - - {:else if output_score.type === "pass_fail_critical"} - pass/fail/critical - - - - {:else} - {output_score.type} - {/if} -
+
Output{score.name} + {score.name} + +
-
Eval Config
+
Eval Method
How task output is evaluated
Eval Instructions
{:else} -
Results
-
-
Create a Run Method
-
- A task run method defines how the task is run, such as which model - and prompt to use. Create one to run this evaluator. -
- +
Compare Run Methods
+
+ Find the best method of running your task, including various prompts, + models, fine-tunes, and more. Add one or more task run methods to get + started.
+ + {/if}
{/if} From ae8eb1942b19a1352ada9c10047d24e30248cb7d Mon Sep 17 00:00:00 2001 From: scosman Date: Fri, 28 Feb 2025 19:41:51 -0500 Subject: [PATCH 088/102] removed unused model property --- app/web_ui/src/lib/api_schema.d.ts | 10 ---------- libs/core/kiln_ai/datamodel/eval.py | 9 --------- libs/core/kiln_ai/datamodel/test_eval_model.py | 10 ---------- 3 files changed, 29 deletions(-) diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index aab1a648..0990e615 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -1329,11 +1329,6 @@ export interface components { * @description The description of the eval */ description?: string | null; - /** - * @description The state of the eval: enabled or disabled. - * @default enabled - */ - state: components["schemas"]["EvalState"]; /** @description The template selected when creating this eval. Useful for suggesting eval steps and output scores. */ template?: components["schemas"]["EvalTemplate"] | null; /** @@ -1544,11 +1539,6 @@ export interface components { eval_config: components["schemas"]["EvalConfig"]; run_config: components["schemas"]["TaskRunConfig"]; }; - /** - * EvalState - * @enum {string} - */ - EvalState: "enabled" | "disabled"; /** * EvalTemplate * @description An eval template is a pre-defined eval that can be used as a starting point for a new eval. diff --git a/libs/core/kiln_ai/datamodel/eval.py b/libs/core/kiln_ai/datamodel/eval.py index 4eb3e1e9..a5c33382 100644 --- a/libs/core/kiln_ai/datamodel/eval.py +++ b/libs/core/kiln_ai/datamodel/eval.py @@ -36,11 +36,6 @@ class EvalTemplate(str, Enum): jailbreak = "jailbreak" -class EvalState(str, Enum): - enabled = "enabled" - disabled = "disabled" - - class EvalConfigType(str, Enum): g_eval = "g_eval" llm_as_judge = "llm_as_judge" @@ -253,10 +248,6 @@ class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig} description: str | None = Field( default=None, description="The description of the eval" ) - state: EvalState = Field( - default=EvalState.enabled, - description="The state of the eval: enabled or disabled.", - ) template: EvalTemplate | None = Field( default=None, description="The template selected when creating this eval. 
Useful for suggesting eval steps and output scores.", diff --git a/libs/core/kiln_ai/datamodel/test_eval_model.py b/libs/core/kiln_ai/datamodel/test_eval_model.py index 72f4c763..3c9cb72e 100644 --- a/libs/core/kiln_ai/datamodel/test_eval_model.py +++ b/libs/core/kiln_ai/datamodel/test_eval_model.py @@ -9,7 +9,6 @@ EvalConfigType, EvalOutputScore, EvalRun, - EvalState, ) from kiln_ai.datamodel.task import Task from kiln_ai.datamodel.task_output import ( @@ -22,12 +21,6 @@ def mock_task(): return Task(name="Test Task", instruction="Test instruction") -def test_eval_state_values(): - assert EvalState.enabled == "enabled" - assert EvalState.disabled == "disabled" - assert len(EvalState) == 2 - - @pytest.fixture def valid_eval_config_data(): return { @@ -95,7 +88,6 @@ def test_eval_basic_properties(): eval = Eval( name="Test Eval", description="Test Description", - state=EvalState.enabled, current_config_id="config123", eval_set_filter_id="tag::tag1", eval_configs_filter_id="tag::tag2", @@ -109,7 +101,6 @@ def test_eval_basic_properties(): assert eval.name == "Test Eval" assert eval.description == "Test Description" - assert eval.state == EvalState.enabled assert eval.current_config_id == "config123" assert eval.output_scores[0].name == "accuracy" assert eval.output_scores[0].type == TaskOutputRatingType.five_star @@ -129,7 +120,6 @@ def test_eval_default_values(): ) assert eval.description is None - assert eval.state == EvalState.enabled assert eval.current_config_id is None From e2022fa3d81f102e4de0b5855378d1414185961d Mon Sep 17 00:00:00 2001 From: scosman Date: Fri, 28 Feb 2025 20:05:36 -0500 Subject: [PATCH 089/102] remove dead code --- libs/core/kiln_ai/datamodel/task.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/libs/core/kiln_ai/datamodel/task.py b/libs/core/kiln_ai/datamodel/task.py index d2d27f61..fb8a6838 100644 --- a/libs/core/kiln_ai/datamodel/task.py +++ b/libs/core/kiln_ai/datamodel/task.py @@ -109,20 +109,6 @@ def run_config(self) -> RunConfig: prompt_id=self.run_config_properties.prompt_id, ) - @model_validator(mode="after") - def validate_task(self) -> Self: - # Check that the task in the run config matches the parent task - return self - # TODO P0 - parent_task = self.parent_task() - if parent_task is None: - raise ValueError("Run config must be parented to a task") - if self.run_config.task is None: - raise ValueError("Run config must have a task") - if self.run_config.task.id != parent_task.id: - raise ValueError("Run config task must match parent task") - return self - class Task( KilnParentedModel, From b08fcdf62ed69745830d106a5bc2d3c2a24c7bb7 Mon Sep 17 00:00:00 2001 From: scosman Date: Fri, 28 Feb 2025 20:35:38 -0500 Subject: [PATCH 090/102] improve doc comment --- libs/core/kiln_ai/datamodel/prompt_id.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/core/kiln_ai/datamodel/prompt_id.py b/libs/core/kiln_ai/datamodel/prompt_id.py index 2d2c5f02..19ca455a 100644 --- a/libs/core/kiln_ai/datamodel/prompt_id.py +++ b/libs/core/kiln_ai/datamodel/prompt_id.py @@ -28,6 +28,7 @@ class PromptGenerators(str, Enum): Prompt IDs can be one of: - A saved prompt ID - A fine-tune prompt ID +- A task run config ID - A prompt generator name """ From 00e8694ceb77dcc2ee504756eb392420978acf83 Mon Sep 17 00:00:00 2001 From: scosman Date: Sat, 1 Mar 2025 11:19:38 -0500 Subject: [PATCH 091/102] CR feedback: better names, comments, stricter typing, fewer dict lookups --- app/desktop/studio_server/eval_api.py | 123 +++++++++--------- 
app/desktop/studio_server/test_eval_api.py | 10 +- app/web_ui/src/lib/api_schema.d.ts | 16 +-- app/web_ui/src/lib/types.ts | 2 +- .../[task_id]/[eval_id]/+page.svelte | 2 +- .../[eval_id]/create_eval_config/+page.svelte | 4 +- .../[task_id]/create_evaluator/+page.svelte | 8 +- .../create_evaluator/eval_template.ts | 6 +- .../select_eval_template.svelte | 6 +- libs/core/kiln_ai/datamodel/eval.py | 7 +- libs/core/kiln_ai/datamodel/task.py | 3 +- 11 files changed, 92 insertions(+), 95 deletions(-) diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py index fa32f6b6..f71c1612 100644 --- a/app/desktop/studio_server/eval_api.py +++ b/app/desktop/studio_server/eval_api.py @@ -22,7 +22,7 @@ EvalConfigType, EvalOutputScore, EvalRun, - EvalTemplate, + EvalTemplateId, ) from kiln_ai.datamodel.json_schema import string_to_json_key from kiln_ai.datamodel.prompt_id import is_frozen_prompt @@ -47,7 +47,7 @@ def eval_from_id(project_id: str, task_id: str, eval_id: str) -> Eval: raise HTTPException( status_code=404, - detail=f"Task not found. ID: {task_id}", + detail=f"Eval not found. ID: {eval_id}", ) @@ -79,9 +79,9 @@ def task_run_config_from_id( ) -# JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better async def run_eval_runner_with_status(eval_runner: EvalRunner) -> StreamingResponse: - # Async messages via server side events (SSE) + # Yields async messages designed to be used with server sent events (SSE) + # https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events async def event_generator(): async for progress in eval_runner.run(): data = { @@ -103,7 +103,7 @@ async def event_generator(): class CreateEvaluatorRequest(BaseModel): name: str description: str - template: EvalTemplate | None + template: EvalTemplateId | None output_scores: list[EvalOutputScore] eval_set_filter_id: DatasetFilterId eval_configs_filter_id: DatasetFilterId @@ -142,18 +142,18 @@ class EvalRunResult(BaseModel): class EvalResultSummary(BaseModel): # run_config_id -> output_score_id -> ScoreSummary - results: Dict[str, Dict[str, ScoreSummary]] + results: Dict[ID_TYPE, Dict[str, ScoreSummary]] # run_config_id -> percent of the dataset that has been processed - run_config_percent_complete: Dict[str, float] + run_config_percent_complete: Dict[ID_TYPE, float] # The total size of the dataset used for the eval dataset_size: int class EvalConfigCompareSummary(BaseModel): # Summary of results. eval_config_id -> output_score_id -> CorrelationResult - results: Dict[str, Dict[str, CorrelationResult]] + results: Dict[ID_TYPE, Dict[str, CorrelationResult]] # eval_config_id -> percent of the dataset that has been processed (run with eval scores) - eval_config_percent_complete: Dict[str, float] + eval_config_percent_complete: Dict[ID_TYPE, float] # The total size of the dataset used for the eval config comparisons (eval.eval_configs_filter_id set size) dataset_size: int # The number of dataset items which are fully rated, partially rated, or not rated at all. 
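For reference, a minimal sketch of how a non-browser client could consume the progress stream produced by run_eval_runner_with_status above. The endpoint is GET-based specifically so the browser EventSource API can use it, but any HTTP client that reads the "data:" lines works. This is illustrative only: the payload field names and the final "complete" sentinel are assumptions, not guaranteed by the patch.

import json

import requests  # assumed available; any streaming HTTP client would do


def follow_eval_progress(run_url: str) -> None:
    # Read the SSE stream line by line and report progress as it arrives.
    with requests.get(run_url, stream=True, timeout=(5, None)) as response:
        response.raise_for_status()
        for raw_line in response.iter_lines(decode_unicode=True):
            if not raw_line or not raw_line.startswith("data: "):
                continue  # blank separators between SSE events
            payload = raw_line[len("data: "):]
            if payload == "complete":  # assumed terminal message
                print("eval run complete")
                break
            try:
                progress = json.loads(payload)
            except json.JSONDecodeError:
                print(f"non-JSON message: {payload}")
                continue
            # The exact keys (e.g. progress/total/errors) depend on the server payload.
            print(progress)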
@@ -180,9 +180,10 @@ def human_score_from_task_run( if score_key == "overall_rating": human_score = task_run.output.rating.value else: - req_rating = task_run.output.rating.requirement_ratings.get( - score_key_to_task_requirement_id[score_key], None - ) + req_id = score_key_to_task_requirement_id.get(score_key, None) + if req_id is None: + return None + req_rating = task_run.output.rating.requirement_ratings.get(req_id, None) if req_rating is not None: human_score = req_rating.value @@ -199,7 +200,6 @@ def count_human_evals( partially_rated_count: int = 0 not_rated_count: int = 0 for dataset_item in items: - # Check it has all scores has_all_scores = True has_any_scores = False for output_score in eval.output_scores: @@ -346,8 +346,9 @@ async def create_eval_config( eval_config.save_to_file() return eval_config + # JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better @app.get( - "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run" + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_task_run_eval" ) async def run_eval_config( project_id: str, @@ -397,6 +398,7 @@ async def set_default_eval_config( return eval + # JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better @app.get( "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval" ) @@ -440,6 +442,7 @@ async def get_eval_run_results( run_config=run_config, ) + # This compares run_configs to each other on a given eval_config. Compare to below which compares eval_configs to each other. @app.get( "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/score_summary" ) @@ -463,29 +466,27 @@ async def get_eval_config_score_summary( ) # save a copy of the expected dataset ids for each run config, we'll update each as we process each eval run - remaining_expected_dataset_ids: Dict[str, Set[ID_TYPE]] = { - str(run_config.id): set(expected_dataset_ids) - for run_config in task_runs_configs + remaining_expected_dataset_ids: Dict[ID_TYPE, Set[ID_TYPE]] = { + run_config.id: set(expected_dataset_ids) for run_config in task_runs_configs } # Track how often we are missing scores in a eval_config. Should be 0 for a complete eval_config - partial_incomplete_counts: Dict[str, int] = { - str(run_config.id): 0 for run_config in task_runs_configs + partial_incomplete_counts: Dict[ID_TYPE, int] = { + run_config.id: 0 for run_config in task_runs_configs } - # task_run_config_id -> output_score_id -> score/total - total_scores: Dict[str, Dict[str, float]] = {} - score_counts: Dict[str, Dict[str, int]] = {} + # task_run_config_id -> output_score_json_key -> score/total for calculating the mean score + total_scores: Dict[ID_TYPE, Dict[str, float]] = {} + score_counts: Dict[ID_TYPE, Dict[str, int]] = {} - # important: readonly makes this much faster for eval_run in eval_config.runs(readonly=True): if eval_run.task_run_config_id is None: - # This eval_run is not associated with a run_config, so we can't count it + # This eval_run is not associated with a run_config, so we should not count it continue - run_config_id = str(eval_run.task_run_config_id) + run_config_id = eval_run.task_run_config_id # Check if we should count this eval_run. 
Not every eval_run has to go into the stats: # - a dataset_id can be removed from the dataset filter (removed a tag) - # - this dataset_id was already counted (not great there are dupes, but really shouldn't be double counted) + # - this dataset_id was already counted (not great there are dupes, but shouldn't be double counted if there are) if eval_run.dataset_id not in remaining_expected_dataset_ids[run_config_id]: continue else: @@ -513,25 +514,25 @@ async def get_eval_config_score_summary( partial_incomplete_counts[run_config_id] += 1 # Convert to score summaries - results: Dict[str, Dict[str, ScoreSummary]] = {} + results: Dict[ID_TYPE, Dict[str, ScoreSummary]] = {} for run_config_id, output_scores in total_scores.items(): results[run_config_id] = {} for output_score_id, score in output_scores.items(): - if score_counts[run_config_id][output_score_id] > 0: + count = score_counts[run_config_id][output_score_id] + if count > 0: results[run_config_id][output_score_id] = ScoreSummary( - mean_score=score / score_counts[run_config_id][output_score_id] + mean_score=score / count ) # Calculate the percent of the dataset that has been processed - run_config_percent_complete: Dict[str, float] = {} + run_config_percent_complete: Dict[ID_TYPE, float] = {} for run_config in task_runs_configs: - run_config_id = str(run_config.id) # Partial incomplete (missing scores), and fully incomplete (no eval_run) - incomplete_count = partial_incomplete_counts[run_config_id] + len( - remaining_expected_dataset_ids[run_config_id] + incomplete_count = partial_incomplete_counts[run_config.id] + len( + remaining_expected_dataset_ids[run_config.id] ) percent_incomplete = incomplete_count / len(expected_dataset_ids) - run_config_percent_complete[str(run_config.id)] = 1 - percent_incomplete + run_config_percent_complete[run_config.id] = 1 - percent_incomplete return EvalResultSummary( results=results, @@ -573,18 +574,15 @@ async def get_eval_configs_score_summary( not_rated_count=0, ) - # save a copy of the expected dataset ids for each eval config, we'll update each as we process each eval run - remaining_expected_dataset_ids: Dict[str, Set[ID_TYPE]] = { - str(eval_config.id): set(expected_dataset_ids) - for eval_config in eval_configs + # save a copy of the expected dataset ids for each eval config id, we'll update each as we process each eval run + remaining_expected_dataset_ids: Dict[ID_TYPE, Set[ID_TYPE]] = { + eval_config.id: set(expected_dataset_ids) for eval_config in eval_configs } - # eval_config_id -> output_score_id -> correlation calculator - correlation_calculators: Dict[str, Dict[str, CorrelationCalculator]] = {} + # eval_config_id -> output_score_json_key -> correlation calculator + correlation_calculators: Dict[ID_TYPE, Dict[str, CorrelationCalculator]] = {} - # important: readonly makes this much faster for eval_config in eval_configs: - eval_config_id = str(eval_config.id) for eval_run in eval_config.runs(readonly=True): dataset_item = expected_dataset_items.get(eval_run.dataset_id, None) if dataset_item is None: @@ -593,14 +591,14 @@ async def get_eval_configs_score_summary( continue # Check if we should count this eval_run. 
Not every eval_run has to go into the stats: - # Example: this dataset_id was already counted (not great there are dupes, but really shouldn't be double counted) + # Example: this dataset_id was already counted (not great there are dupes, but shouldn't be double counted if there are) if ( eval_run.dataset_id - not in remaining_expected_dataset_ids[eval_config_id] + not in remaining_expected_dataset_ids[eval_config.id] ): continue else: - remaining_expected_dataset_ids[eval_config_id].remove( + remaining_expected_dataset_ids[eval_config.id].remove( eval_run.dataset_id ) @@ -617,13 +615,15 @@ async def get_eval_configs_score_summary( # This score doesn't have both a human eval and eval score, so we can't compare continue - if eval_config_id not in correlation_calculators: - correlation_calculators[eval_config_id] = {} + if eval_config.id not in correlation_calculators: + correlation_calculators[eval_config.id] = {} - if score_key not in correlation_calculators[eval_config_id]: - correlation_calculators[eval_config_id][score_key] = ( - CorrelationCalculator() - ) + calculator = correlation_calculators[eval_config.id].get( + score_key, None + ) + if calculator is None: + calculator = CorrelationCalculator() + correlation_calculators[eval_config.id][score_key] = calculator normalized_eval_score = normalize_rating( eval_score, output_score.type @@ -631,7 +631,7 @@ async def get_eval_configs_score_summary( normalized_human_score = normalize_rating( human_score, output_score.type ) - correlation_calculators[eval_config_id][score_key].add_score( + calculator.add_score( CorrelationScore( measured_score=eval_score, human_score=human_score, @@ -641,27 +641,26 @@ async def get_eval_configs_score_summary( ) # Convert to score summaries - results: Dict[str, Dict[str, CorrelationResult]] = {} + results: Dict[ID_TYPE, Dict[str, CorrelationResult]] = {} for eval_config_id in correlation_calculators.keys(): results[eval_config_id] = {} for score_key in correlation_calculators[eval_config_id].keys(): - if not correlation_calculators[eval_config_id][score_key]: + calculator = correlation_calculators[eval_config_id].get( + score_key, None + ) + if calculator is None: # No scores to calculate correlation for this pair continue - correlation_result = correlation_calculators[eval_config_id][ - score_key - ].calculate_correlation() + correlation_result = calculator.calculate_correlation() results[eval_config_id][score_key] = correlation_result # Calculate the percent of the dataset that has been processed - eval_config_percent_complete: Dict[str, float] = {} + eval_config_percent_complete: Dict[ID_TYPE, float] = {} for eval_config in eval_configs: - eval_config_id = str(eval_config.id) - # Partial incomplete (missing scores), and fully incomplete (no eval_run) - incomplete_count = len(remaining_expected_dataset_ids[eval_config_id]) + incomplete_count = len(remaining_expected_dataset_ids[eval_config.id]) percent_incomplete = incomplete_count / len(expected_dataset_ids) - eval_config_percent_complete[str(eval_config.id)] = 1 - percent_incomplete + eval_config_percent_complete[eval_config.id] = 1 - percent_incomplete # Count how many dataset items have human evals fully_rated_count, partially_rated_count, not_rated_count = count_human_evals( diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py index 175dec2a..58a6e2fc 100644 --- a/app/desktop/studio_server/test_eval_api.py +++ b/app/desktop/studio_server/test_eval_api.py @@ -27,7 +27,7 @@ EvalConfigType, EvalOutputScore, 
EvalRun, - EvalTemplate, + EvalTemplateId, ) from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig @@ -87,7 +87,7 @@ def mock_eval(mock_task): id="eval1", name="Test Eval", description="Test Description", - template=EvalTemplate.bias, + template=EvalTemplateId.bias, output_scores=[ EvalOutputScore(name="score1", description="desc1", type="five_star"), EvalOutputScore( @@ -177,7 +177,7 @@ def test_get_eval_not_found(client, mock_task, mock_task_from_id): response = client.get("/api/projects/project1/tasks/task1/eval/non_existent") assert response.status_code == 404 - assert response.json()["detail"] == "Task not found. ID: task1" + assert response.json()["detail"] == "Eval not found. ID: non_existent" @pytest.fixture @@ -428,7 +428,7 @@ async def mock_run(): # Make request with specific run_config_ids response = client.get( - "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run", + "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run_task_run_eval", params={"run_config_ids": ["run_config1", "run_config2"]}, ) @@ -465,7 +465,7 @@ async def test_run_eval_config_no_run_configs_error( # Make request with no run_config_ids and all_run_configs=False response = client.get( - "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run" + "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run_task_run_eval" ) assert response.status_code == 400 diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index 0990e615..b2d369b7 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -793,7 +793,7 @@ export interface paths { patch?: never; trace?: never; }; - "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run": { + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_task_run_eval": { parameters: { query?: never; header?: never; @@ -801,7 +801,7 @@ export interface paths { cookie?: never; }; /** Run Eval Config */ - get: operations["run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_get"]; + get: operations["run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_task_run_eval_get"]; put?: never; post?: never; delete?: never; @@ -1031,7 +1031,7 @@ export interface components { name: string; /** Description */ description: string; - template: components["schemas"]["EvalTemplate"] | null; + template: components["schemas"]["EvalTemplateId"] | null; /** Output Scores */ output_scores: components["schemas"]["EvalOutputScore"][]; /** Eval Set Filter Id */ @@ -1330,7 +1330,7 @@ export interface components { */ description?: string | null; /** @description The template selected when creating this eval. Useful for suggesting eval steps and output scores. */ - template?: components["schemas"]["EvalTemplate"] | null; + template?: components["schemas"]["EvalTemplateId"] | null; /** * Current Config Id * @description The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs. @@ -1540,11 +1540,11 @@ export interface components { run_config: components["schemas"]["TaskRunConfig"]; }; /** - * EvalTemplate + * EvalTemplateId * @description An eval template is a pre-defined eval that can be used as a starting point for a new eval. 
* @enum {string} */ - EvalTemplate: "kiln_requirements" | "toxicity" | "bias" | "maliciousness" | "factual_correctness" | "jailbreak"; + EvalTemplateId: "kiln_requirements" | "toxicity" | "bias" | "maliciousness" | "factual_correctness" | "jailbreak"; /** * FineTuneParameter * @description A parameter for a fine-tune. Hyperparameters, etc. @@ -1818,7 +1818,7 @@ export interface components { * Where models have instruct and raw versions, instruct is default and raw is specified. * @enum {string} */ - ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b" | "deepseek_r1_distill_qwen_1p5b" | "deepseek_r1_distill_qwen_7b" | "deepseek_r1_distill_llama_8b" | "dolphin_2_9_8x22b"; + ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "claude_3_7_sonnet" | "claude_3_7_sonnet_thinking" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b" | "deepseek_r1_distill_qwen_1p5b" | "deepseek_r1_distill_qwen_7b" | "deepseek_r1_distill_llama_8b" | "dolphin_2_9_8x22b"; /** * ModelProviderName * @description Enumeration of supported AI model providers. 
@@ -4262,7 +4262,7 @@ export interface operations { }; }; }; - run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_get: { + run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_task_run_eval_get: { parameters: { query?: { run_config_ids?: string[]; diff --git a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts index 4ee5b6f0..8419f6d7 100644 --- a/app/web_ui/src/lib/types.ts +++ b/app/web_ui/src/lib/types.ts @@ -21,7 +21,7 @@ export type RunSummary = components["schemas"]["RunSummary"] export type PromptResponse = components["schemas"]["PromptResponse"] export type FinetuneDataStrategy = components["schemas"]["FinetuneDataStrategy"] export type EvalOutputScore = components["schemas"]["EvalOutputScore"] -export type EvalTemplate = components["schemas"]["EvalTemplate"] +export type EvalTemplateId = components["schemas"]["EvalTemplateId"] export type Eval = components["schemas"]["Eval"] export type EvalConfigType = components["schemas"]["EvalConfigType"] export type EvalConfig = components["schemas"]["EvalConfig"] diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte index 760b8d7e..f9687c0d 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte @@ -336,7 +336,7 @@ | "running" | "complete" | "complete_with_errors" = "not_started" - $: run_eval_url = `${base_url}/api/projects/${project_id}/tasks/${task_id}/eval/${eval_id}/eval_config/${current_eval_config_id}/run?all_run_configs=true` + $: run_eval_url = `${base_url}/api/projects/${project_id}/tasks/${task_id}/eval/${eval_id}/eval_config/${current_eval_config_id}/run_task_run_eval?all_run_configs=true` let task_run_config_model_name = "" let task_run_config_provider_name = "" diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte index 7a7496fb..399b2ed1 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte @@ -9,7 +9,7 @@ import { onMount } from "svelte" import Warning from "$lib/ui/warning.svelte" import AvailableModelsDropdown from "../../../../../run/available_models_dropdown.svelte" - import type { Eval, EvalTemplate, Task, EvalConfigType } from "$lib/types" + import type { Eval, EvalTemplateId, Task, EvalConfigType } from "$lib/types" import { tick } from "svelte" import { load_task } from "$lib/stores" import { goto } from "$app/navigation" @@ -18,7 +18,7 @@ let task_description: string = "" let eval_steps: string[] = [] - type EvalTemplateWithoutKiln = Exclude + type EvalTemplateWithoutKiln = Exclude const eval_steps_static_templates: Record = { toxicity: [ diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte index 87688a4a..de0c034b 100644 --- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte @@ -1,7 
+1,7 @@
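Stepping back from the diff: the score-summary endpoint changed above boils down to a per-run-config mean over whichever scores were produced, plus a completion percentage measured against the expected dataset items. A simplified, self-contained sketch of that aggregation, using plain dicts instead of the Kiln datamodel and illustrative names, assuming a non-empty expected dataset:

from collections import defaultdict


def summarize_eval_runs(
    expected_dataset_ids: set[str],
    run_config_ids: list[str],
    eval_runs: list[dict],  # each: {"run_config_id", "dataset_id", "scores": {key: float}}
    score_keys: list[str],
) -> tuple[dict, dict]:
    # Running totals and counts per run config and score key.
    totals = {rc: defaultdict(float) for rc in run_config_ids}
    counts = {rc: defaultdict(int) for rc in run_config_ids}
    # Dataset items we still expect to see for each run config.
    remaining = {rc: set(expected_dataset_ids) for rc in run_config_ids}
    # Runs that were seen but missing one or more scores.
    partial_incomplete = {rc: 0 for rc in run_config_ids}

    for run in eval_runs:
        rc = run["run_config_id"]
        if rc not in remaining or run["dataset_id"] not in remaining[rc]:
            continue  # filtered out of the eval set, or a duplicate
        remaining[rc].remove(run["dataset_id"])
        has_all_scores = True
        for key in score_keys:
            score = run["scores"].get(key)
            if score is None:
                has_all_scores = False
                continue
            totals[rc][key] += score
            counts[rc][key] += 1
        if not has_all_scores:
            partial_incomplete[rc] += 1

    mean_scores = {
        rc: {k: totals[rc][k] / counts[rc][k] for k in totals[rc] if counts[rc][k] > 0}
        for rc in run_config_ids
    }
    percent_complete = {
        rc: 1 - (partial_incomplete[rc] + len(remaining[rc])) / len(expected_dataset_ids)
        for rc in run_config_ids
    }
    return mean_scores, percent_complete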