diff --git a/.cursorrules b/.cursorrules index 32d21fb8..458a4bd8 100644 --- a/.cursorrules +++ b/.cursorrules @@ -1,3 +1,4 @@ - Always assume pydantic 2 (not pydantic 1) - Always use pytest for tests + - The project supports Python 3.10 and above diff --git a/README.md b/README.md index 02ed246e..c9ec6ce2 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,8 @@ | CI | [![Build and Test](https://github.com/Kiln-AI/kiln/actions/workflows/build_and_test.yml/badge.svg)](https://github.com/Kiln-AI/kiln/actions/workflows/build_and_test.yml) [![Format and Lint](https://github.com/Kiln-AI/kiln/actions/workflows/format_and_lint.yml/badge.svg)](https://github.com/Kiln-AI/kiln/actions/workflows/format_and_lint.yml) [![Desktop Apps Build](https://github.com/Kiln-AI/kiln/actions/workflows/build_desktop.yml/badge.svg)](https://github.com/Kiln-AI/kiln/actions/workflows/build_desktop.yml) [![Web UI Build](https://github.com/Kiln-AI/kiln/actions/workflows/web_format_lint_build.yml/badge.svg)](https://github.com/Kiln-AI/kiln/actions/workflows/web_format_lint_build.yml) [![Test Count Badge](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/scosman/57742c1b1b60d597a6aba5d5148d728e/raw/test_count_kiln.json)](https://github.com/Kiln-AI/kiln/actions/workflows/test_count.yml) [![Test Coverage Badge](https://img.shields.io/endpoint?url=https://gist.githubusercontent.com/scosman/57742c1b1b60d597a6aba5d5148d728e/raw/library_coverage_kiln.json)](https://github.com/Kiln-AI/kiln/actions/workflows/test_count.yml) [![Docs](https://github.com/Kiln-AI/Kiln/actions/workflows/build_docs.yml/badge.svg)](https://github.com/Kiln-AI/Kiln/actions/workflows/build_docs.yml) | | Package | [![PyPI - Version](https://img.shields.io/pypi/v/kiln-ai.svg?logo=pypi&label=PyPI&logoColor=gold)](https://pypi.org/project/kiln-ai/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/kiln-ai.svg?logo=python&label=Python&logoColor=gold)](https://pypi.org/project/kiln-ai/) | | Meta | [![uv](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/uv/main/assets/badge/v0.json)](https://github.com/astral-sh/uv) [![linting - Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff) [![Hatch project](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg)](https://github.com/pypa/hatch) [![types - Pyright](https://img.shields.io/badge/types-pyright-blue.svg)](https://github.com/microsoft/pyright) [![Docs](https://img.shields.io/badge/docs-pdoc-blue)](https://kiln-ai.github.io/Kiln/kiln_core_docs/index.html) | -| Apps | [![MacOS](https://img.shields.io/badge/MacOS-black?logo=apple)](https://github.com/Kiln-AI/Kiln/releases/latest) [![Windows](https://img.shields.io/badge/Windows-0067b8.svg?logo=data:image/svg%2bxml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4KPHN2ZyBmaWxsPSIjZmZmIiB2aWV3Qm94PSIwIDAgMzIgMzIiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTE2Ljc0MiAxNi43NDJ2MTQuMjUzaDE0LjI1M3YtMTQuMjUzek0xLjAwNCAxNi43NDJ2MTQuMjUzaDE0LjI1NnYtMTQuMjUzek0xNi43NDIgMS4wMDR2MTQuMjU2aDE0LjI1M3YtMTQuMjU2ek0xLjAwNCAxLjAwNHYxNC4yNTZoMTQuMjU2di0xNC4yNTZ6Ij48L3BhdGg+Cjwvc3ZnPg==)](https://github.com/Kiln-AI/Kiln/releases/latest) [![Linux](https://img.shields.io/badge/Linux-444444?logo=linux&logoColor=ffffff)](https://github.com/Kiln-AI/Kiln/releases/latest) ![Github Downsloads](https://img.shields.io/github/downloads/kiln-ai/kiln/total) | -| Connect | 
[![Discord](https://img.shields.io/badge/Discord-Kiln_AI-blue?logo=Discord&logoColor=white)](https://discord.gg/sVJEzDGu) [![Newsletter](https://img.shields.io/badge/Newsletter-kilnai-blue?logo=Substack&logoColor=white)](https://kilnai.substack.com) | +| Apps | [![MacOS](https://img.shields.io/badge/MacOS-black?logo=apple)](https://github.com/Kiln-AI/Kiln/releases/latest) [![Windows](https://img.shields.io/badge/Windows-0067b8.svg?logo=data:image/svg%2bxml;base64,PD94bWwgdmVyc2lvbj0iMS4wIiBlbmNvZGluZz0idXRmLTgiPz4KPHN2ZyBmaWxsPSIjZmZmIiB2aWV3Qm94PSIwIDAgMzIgMzIiIHZlcnNpb249IjEuMSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPHBhdGggZD0iTTE2Ljc0MiAxNi43NDJ2MTQuMjUzaDE0LjI1M3YtMTQuMjUzek0xLjAwNCAxNi43NDJ2MTQuMjUzaDE0LjI1NnYtMTQuMjUzek0xNi43NDIgMS4wMDR2MTQuMjU2aDE0LjI1M3YtMTQuMjU2ek0xLjAwNCAxLjAwNHYxNC4yNTZoMTQuMjU2di0xNC4yNTZ6Ij48L3BhdGg+Cjwvc3ZnPg==)](https://github.com/Kiln-AI/Kiln/releases/latest) [![Linux](https://img.shields.io/badge/Linux-444444?logo=linux&logoColor=ffffff)](https://github.com/Kiln-AI/Kiln/releases/latest) ![Github Downsloads](https://img.shields.io/github/downloads/kiln-ai/kiln/total) | +| Connect | [![Discord](https://img.shields.io/badge/Discord-Kiln_AI-blue?logo=Discord&logoColor=white)](https://discord.gg/sVJEzDGu) [![Newsletter](https://img.shields.io/badge/Newsletter-kilnai-blue?logo=Substack&logoColor=white)](https://kilnai.substack.com) | [Download button](https://github.com/Kiln-AI/Kiln/releases/latest) [Quick start button](https://docs.getkiln.ai/getting-started/quickstart) @@ -61,6 +61,7 @@ Kiln is quite intuitive, so we suggest launching the desktop app and diving in. - [Fine Tuning LLM Models](https://docs.getkiln.ai/docs/fine-tuning-guide) - [Guide: Train a Reasoning Model](https://docs.getkiln.ai/docs/guide-train-a-reasoning-model) - [Reasoning & Chain of Thought](https://docs.getkiln.ai/docs/reasoning-and-chain-of-thought) +- [Evaluators](https://docs.getkiln.ai/docs/evaluators) - [Synthetic Data Generation](https://docs.getkiln.ai/docs/synthetic-data-generation) - [Collaborating with Kiln](https://docs.getkiln.ai/docs/collaboration) - [Rating and Labeling Data](https://docs.getkiln.ai/docs/reviewing-and-rating) diff --git a/app/desktop/desktop_server.py b/app/desktop/desktop_server.py index 4156c905..383d97e7 100644 --- a/app/desktop/desktop_server.py +++ b/app/desktop/desktop_server.py @@ -10,6 +10,7 @@ from app.desktop.log_config import log_config from app.desktop.studio_server.data_gen_api import connect_data_gen_api +from app.desktop.studio_server.eval_api import connect_evals_api from app.desktop.studio_server.finetune_api import connect_fine_tune_api from app.desktop.studio_server.prompt_api import connect_prompt_api from app.desktop.studio_server.provider_api import connect_provider_api @@ -36,6 +37,7 @@ def make_app(): connect_settings(app) connect_data_gen_api(app) connect_fine_tune_api(app) + connect_evals_api(app) # Important: webhost must be last, it handles all other URLs connect_webhost(app) diff --git a/app/desktop/pyproject.toml b/app/desktop/pyproject.toml index 1cf5e5e5..e28ea1c4 100644 --- a/app/desktop/pyproject.toml +++ b/app/desktop/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "pillow>=11.0.0", "pystray>=0.19.5", "pyinstaller==6.11.1", + "scipy>=1.15.2", ] diff --git a/app/desktop/studio_server/correlation_calculator.py b/app/desktop/studio_server/correlation_calculator.py new file mode 100644 index 00000000..c6fc6d95 --- /dev/null +++ b/app/desktop/studio_server/correlation_calculator.py @@ -0,0 +1,110 @@ +import math 
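# Note (editorial, not part of the patch): correlation_calculator.py compares automated
# eval scores against human ratings for the same dataset items. It reports mean absolute
# and squared error (raw and normalized) plus Spearman, Pearson, and Kendall's tau
# coefficients, returning None for a coefficient when fewer than two score pairs exist
# or scipy yields NaN.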
+from dataclasses import dataclass
+from typing import List
+
+from scipy import stats
+
+
+@dataclass
+class CorrelationScore:
+    measured_score: float
+    human_score: float
+    normalized_measured_score: float
+    normalized_human_score: float
+
+
+@dataclass
+class CorrelationResult:
+    mean_absolute_error: float
+    mean_normalized_absolute_error: float
+    mean_squared_error: float
+    mean_normalized_squared_error: float
+    spearman_correlation: float | None
+    pearson_correlation: float | None
+    kendalltau_correlation: float | None
+
+
+class CorrelationCalculator:
+    def __init__(self):
+        self.scores: List[CorrelationScore] = []
+
+    def add_score(self, score: CorrelationScore):
+        self.scores.append(score)
+
+    def calculate_correlation(self) -> CorrelationResult:
+        if len(self.scores) == 0:
+            raise ValueError("No scores to calculate correlation")
+
+        return CorrelationResult(
+            mean_absolute_error=self.calculate_mean_absolute_error(),
+            mean_normalized_absolute_error=self.calculate_mean_normalized_absolute_error(),
+            mean_squared_error=self.calculate_mean_squared_error(),
+            mean_normalized_squared_error=self.calculate_mean_normalized_squared_error(),
+            spearman_correlation=self.calculate_spearman_correlation(),
+            pearson_correlation=self.calculate_pearson_correlation(),
+            kendalltau_correlation=self.calculate_kendalltau_correlation(),
+        )
+
+    def calculate_mean_absolute_error(self) -> float:
+        total_absolute_error = sum(
+            abs(score.measured_score - score.human_score) for score in self.scores
+        )
+        return total_absolute_error / len(self.scores)
+
+    def calculate_mean_normalized_absolute_error(self) -> float:
+        total_normalized_absolute_error = sum(
+            abs(score.normalized_measured_score - score.normalized_human_score)
+            for score in self.scores
+        )
+        return total_normalized_absolute_error / len(self.scores)
+
+    def calculate_mean_squared_error(self) -> float:
+        total_squared_error = sum(
+            (score.measured_score - score.human_score) ** 2 for score in self.scores
+        )
+        return total_squared_error / len(self.scores)
+
+    def calculate_mean_normalized_squared_error(self) -> float:
+        total_normalized_squared_error = sum(
+            (score.normalized_measured_score - score.normalized_human_score) ** 2
+            for score in self.scores
+        )
+        return total_normalized_squared_error / len(self.scores)
+
+    def calculate_spearman_correlation(self) -> float | None:
+        if len(self.scores) < 2:
+            # If there is only one pair, no correlation
+            return None
+        x = [score.measured_score for score in self.scores]
+        y = [score.human_score for score in self.scores]
+        result = stats.spearmanr(x, y)
+        # library doesn't support proper types
+        correlation = result.__getattribute__("correlation")
+        if math.isnan(correlation) or not isinstance(correlation, float):
+            # Very small samples may have a NaN result (unknown correlation)
+            return None
+        return correlation
+
+    def calculate_pearson_correlation(self) -> float | None:
+        if len(self.scores) < 2:
+            # If there is only one pair, no correlation
+            return None
+        x = [score.measured_score for score in self.scores]
+        y = [score.human_score for score in self.scores]
+        result = stats.pearsonr(x, y)
+        if math.isnan(result.correlation):
+            # Very small samples may have a NaN result (unknown correlation)
+            return None
+        return result.correlation
+
+    def calculate_kendalltau_correlation(self) -> float | None:
+        if len(self.scores) < 2:
+            # If there is only one pair, no correlation
+            return None
+        x = [score.measured_score for score in self.scores]
+        y = [score.human_score for score in self.scores]
+        result = 
stats.kendalltau(x, y) + if math.isnan(result.correlation): + # Very small samples may have a NaN result (unknown correlation) + return None + return result.correlation diff --git a/app/desktop/studio_server/data_gen_api.py b/app/desktop/studio_server/data_gen_api.py index a4f05315..c3564c54 100644 --- a/app/desktop/studio_server/data_gen_api.py +++ b/app/desktop/studio_server/data_gen_api.py @@ -5,9 +5,10 @@ DataGenCategoriesTaskInput, DataGenSampleTask, DataGenSampleTaskInput, + wrap_task_with_guidance, ) -from kiln_ai.adapters.prompt_builders import prompt_builder_from_ui_name -from kiln_ai.datamodel import DataSource, DataSourceType, TaskRun +from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig +from kiln_ai.datamodel import DataSource, DataSourceType, PromptId, TaskRun from kiln_server.run_api import model_provider_from_string from kiln_server.task_api import task_from_id from pydantic import BaseModel, ConfigDict, Field @@ -60,9 +61,13 @@ class DataGenSaveSamplesApiInput(BaseModel): ) output_model_name: str = Field(description="The name of the model to use") output_provider: str = Field(description="The provider of the model to use") - prompt_method: str = Field( + prompt_method: PromptId = Field( description="The prompt method used to generate the output" ) + human_guidance: str | None = Field( + description="Optional human guidance for generation", + default=None, + ) def connect_data_gen_api(app: FastAPI): @@ -122,7 +127,11 @@ async def save_sample( ) -> TaskRun: task = task_from_id(project_id, task_id) - prompt_builder = prompt_builder_from_ui_name(sample.prompt_method, task) + # Wrap the task instuctions with human guidance, if provided + if sample.human_guidance is not None and sample.human_guidance.strip() != "": + task.instruction = wrap_task_with_guidance( + task.instruction, sample.human_guidance + ) tags = ["synthetic"] if session_id: @@ -132,8 +141,8 @@ async def save_sample( task, model_name=sample.output_model_name, provider=model_provider_from_string(sample.output_provider), - prompt_builder=prompt_builder, - tags=tags, + prompt_id=sample.prompt_method, + base_adapter_config=AdapterConfig(default_tags=tags), ) properties: dict[str, str | int | float] = { diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py new file mode 100644 index 00000000..f71c1612 --- /dev/null +++ b/app/desktop/studio_server/eval_api.py @@ -0,0 +1,679 @@ +import json +from typing import Any, Dict, List, Set, Tuple + +from fastapi import FastAPI, HTTPException, Query +from fastapi.responses import StreamingResponse +from kiln_ai.adapters.eval.eval_runner import EvalRunner +from kiln_ai.adapters.ml_model_list import ModelProviderName +from kiln_ai.adapters.prompt_builders import prompt_builder_from_id +from kiln_ai.datamodel import ( + BasePrompt, + DataSource, + DataSourceType, + PromptId, + Task, + TaskRun, +) +from kiln_ai.datamodel.basemodel import ID_TYPE +from kiln_ai.datamodel.dataset_filters import DatasetFilterId, dataset_filter_from_id +from kiln_ai.datamodel.eval import ( + Eval, + EvalConfig, + EvalConfigType, + EvalOutputScore, + EvalRun, + EvalTemplateId, +) +from kiln_ai.datamodel.json_schema import string_to_json_key +from kiln_ai.datamodel.prompt_id import is_frozen_prompt +from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig +from kiln_ai.datamodel.task_output import normalize_rating +from kiln_ai.utils.name_generator import generate_memorable_name +from kiln_server.task_api import task_from_id +from pydantic 
import BaseModel + +from .correlation_calculator import ( + CorrelationCalculator, + CorrelationResult, + CorrelationScore, +) + + +def eval_from_id(project_id: str, task_id: str, eval_id: str) -> Eval: + task = task_from_id(project_id, task_id) + for eval in task.evals(): + if eval.id == eval_id: + return eval + + raise HTTPException( + status_code=404, + detail=f"Eval not found. ID: {eval_id}", + ) + + +def eval_config_from_id( + project_id: str, task_id: str, eval_id: str, eval_config_id: str +) -> EvalConfig: + eval = eval_from_id(project_id, task_id, eval_id) + for config in eval.configs(): + if config.id == eval_config_id: + return config + + raise HTTPException( + status_code=404, + detail=f"Eval config not found. ID: {eval_config_id}", + ) + + +def task_run_config_from_id( + project_id: str, task_id: str, run_config_id: str +) -> TaskRunConfig: + task = task_from_id(project_id, task_id) + for run_config in task.run_configs(): + if run_config.id == run_config_id: + return run_config + + raise HTTPException( + status_code=404, + detail=f"Task run config not found. ID: {run_config_id}", + ) + + +async def run_eval_runner_with_status(eval_runner: EvalRunner) -> StreamingResponse: + # Yields async messages designed to be used with server sent events (SSE) + # https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events + async def event_generator(): + async for progress in eval_runner.run(): + data = { + "progress": progress.complete, + "total": progress.total, + "errors": progress.errors, + } + yield f"data: {json.dumps(data)}\n\n" + + # Send the final complete message the app expects, and uses to stop listening + yield "data: complete\n\n" + + return StreamingResponse( + content=event_generator(), + media_type="text/event-stream", + ) + + +class CreateEvaluatorRequest(BaseModel): + name: str + description: str + template: EvalTemplateId | None + output_scores: list[EvalOutputScore] + eval_set_filter_id: DatasetFilterId + eval_configs_filter_id: DatasetFilterId + + +class CreateEvalConfigRequest(BaseModel): + name: str | None = None + type: EvalConfigType + properties: dict[str, Any] + model_name: str + provider: ModelProviderName + + +class CreateTaskRunConfigRequest(BaseModel): + name: str | None = None + description: str | None = None + model_name: str + model_provider_name: ModelProviderName + prompt_id: PromptId + + +class RunEvalConfigRequest(BaseModel): + run_config_ids: list[str] + + +class ScoreSummary(BaseModel): + mean_score: float + + +class EvalRunResult(BaseModel): + results: List[EvalRun] + eval: Eval + eval_config: EvalConfig + run_config: TaskRunConfig + + +class EvalResultSummary(BaseModel): + # run_config_id -> output_score_id -> ScoreSummary + results: Dict[ID_TYPE, Dict[str, ScoreSummary]] + # run_config_id -> percent of the dataset that has been processed + run_config_percent_complete: Dict[ID_TYPE, float] + # The total size of the dataset used for the eval + dataset_size: int + + +class EvalConfigCompareSummary(BaseModel): + # Summary of results. eval_config_id -> output_score_id -> CorrelationResult + results: Dict[ID_TYPE, Dict[str, CorrelationResult]] + # eval_config_id -> percent of the dataset that has been processed (run with eval scores) + eval_config_percent_complete: Dict[ID_TYPE, float] + # The total size of the dataset used for the eval config comparisons (eval.eval_configs_filter_id set size) + dataset_size: int + # The number of dataset items which are fully rated, partially rated, or not rated at all. 
+ fully_rated_count: int + partially_rated_count: int + not_rated_count: int + + +def dataset_ids_in_filter(task: Task, filter_id: DatasetFilterId) -> Set[ID_TYPE]: + # Fetch all the dataset items IDs in a filter + filter = dataset_filter_from_id(filter_id) + return {run.id for run in task.runs() if filter(run)} + + +def human_score_from_task_run( + task_run: TaskRun, + score_key: str, + score_key_to_task_requirement_id: Dict[str, ID_TYPE], +) -> float | None: + if not task_run.output.rating: + return None + + human_score: float | None = None + if score_key == "overall_rating": + human_score = task_run.output.rating.value + else: + req_id = score_key_to_task_requirement_id.get(score_key, None) + if req_id is None: + return None + req_rating = task_run.output.rating.requirement_ratings.get(req_id, None) + if req_rating is not None: + human_score = req_rating.value + + return human_score + + +def count_human_evals( + items: List[TaskRun], + eval: Eval, + score_key_to_task_requirement_id: Dict[str, ID_TYPE], +) -> Tuple[int, int, int]: + # Track how often we are missing human evals in dataset items + fully_rated_count: int = 0 + partially_rated_count: int = 0 + not_rated_count: int = 0 + for dataset_item in items: + has_all_scores = True + has_any_scores = False + for output_score in eval.output_scores: + score_key = output_score.json_key() + score = human_score_from_task_run( + dataset_item, score_key, score_key_to_task_requirement_id + ) + if score is None: + has_all_scores = False + else: + has_any_scores = True + + if not has_any_scores: + not_rated_count += 1 + elif has_all_scores: + fully_rated_count += 1 + else: + partially_rated_count += 1 + + return fully_rated_count, partially_rated_count, not_rated_count + + +def connect_evals_api(app: FastAPI): + @app.post("/api/projects/{project_id}/tasks/{task_id}/create_evaluator") + async def create_evaluator( + project_id: str, + task_id: str, + request: CreateEvaluatorRequest, + ) -> Eval: + task = task_from_id(project_id, task_id) + eval = Eval( + name=request.name, + description=request.description, + template=request.template, + output_scores=request.output_scores, + eval_set_filter_id=request.eval_set_filter_id, + eval_configs_filter_id=request.eval_configs_filter_id, + parent=task, + ) + eval.save_to_file() + return eval + + @app.get("/api/projects/{project_id}/tasks/{task_id}/task_run_configs") + async def get_task_run_configs( + project_id: str, task_id: str + ) -> list[TaskRunConfig]: + task = task_from_id(project_id, task_id) + return task.run_configs() + + @app.get("/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}") + async def get_eval(project_id: str, task_id: str, eval_id: str) -> Eval: + return eval_from_id(project_id, task_id, eval_id) + + @app.get("/api/projects/{project_id}/tasks/{task_id}/evals") + async def get_evals(project_id: str, task_id: str) -> list[Eval]: + task = task_from_id(project_id, task_id) + return task.evals() + + @app.get("/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_configs") + async def get_eval_configs( + project_id: str, task_id: str, eval_id: str + ) -> list[EvalConfig]: + eval = eval_from_id(project_id, task_id, eval_id) + return eval.configs() + + @app.get( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}" + ) + async def get_eval_config( + project_id: str, task_id: str, eval_id: str, eval_config_id: str + ) -> EvalConfig: + eval_config = eval_config_from_id(project_id, task_id, eval_id, eval_config_id) + return eval_config + + 
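# Illustrative usage sketch (editorial, not part of the patch): how the CorrelationCalculator
# imported above is fed by the eval-config comparison endpoint defined further below. Each
# CorrelationScore pairs an automated eval score with the human rating for the same dataset
# item. Production code builds the normalized fields with normalize_rating(); this sketch
# reuses the min-max convention from the unit tests. All score values here are made up.
from app.desktop.studio_server.correlation_calculator import (
    CorrelationCalculator,
    CorrelationScore,
)

eval_scores = [4.0, 5.0, 2.0, 3.0, 1.0]   # hypothetical judge-model scores
human_scores = [5.0, 5.0, 2.0, 3.0, 1.0]  # hypothetical human ratings for the same items


def min_max(values: list[float]) -> list[float]:
    # Map values into [0, 1]; degenerate (constant) lists collapse to 0.0.
    lo, hi = min(values), max(values)
    return [(v - lo) / (hi - lo) if hi != lo else 0.0 for v in values]


calculator = CorrelationCalculator()
for m, h, nm, nh in zip(
    eval_scores, human_scores, min_max(eval_scores), min_max(human_scores)
):
    calculator.add_score(
        CorrelationScore(
            measured_score=m,
            human_score=h,
            normalized_measured_score=nm,
            normalized_human_score=nh,
        )
    )

result = calculator.calculate_correlation()
# Correlation fields are None when fewer than two pairs were added or scipy returns NaN.
print(result.mean_absolute_error, result.spearman_correlation)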
@app.post("/api/projects/{project_id}/tasks/{task_id}/task_run_config")
+    async def create_task_run_config(
+        project_id: str,
+        task_id: str,
+        request: CreateTaskRunConfigRequest,
+    ) -> TaskRunConfig:
+        task = task_from_id(project_id, task_id)
+        name = request.name or generate_memorable_name()
+
+        parent_project = task.parent_project()
+        if parent_project is None:
+            raise HTTPException(
+                status_code=400,
+                detail="Task must have a parent project.",
+            )
+
+        frozen_prompt: BasePrompt | None = None
+        if not is_frozen_prompt(request.prompt_id):
+            # For dynamic prompts, we "freeze" a copy of this prompt into the task run config so we don't accidentally invalidate evals if the user changes something that impacts the prompt (example: changing data for multi-shot, or changing the task for basic-prompt)
+            # We then point the task_run_config.run_properties.prompt_id to this new frozen prompt
+            prompt_builder = prompt_builder_from_id(request.prompt_id, task)
+            prompt_name = generate_memorable_name()
+            frozen_prompt = BasePrompt(
+                name=prompt_name,
+                description=f"Frozen copy of prompt '{request.prompt_id}', created for evaluations.",
+                generator_id=request.prompt_id,
+                prompt=prompt_builder.build_base_prompt(),
+                chain_of_thought_instructions=prompt_builder.chain_of_thought_prompt(),
+            )
+
+        task_run_config = TaskRunConfig(
+            parent=task,
+            name=name,
+            description=request.description,
+            run_config_properties=RunConfigProperties(
+                model_name=request.model_name,
+                model_provider_name=request.model_provider_name,
+                prompt_id=request.prompt_id,
+            ),
+            prompt=frozen_prompt,
+        )
+        if frozen_prompt is not None:
+            # Set after, because the ID isn't known until the TaskRunConfig is created
+            task_run_config.run_config_properties.prompt_id = (
+                f"task_run_config::{parent_project.id}::{task.id}::{task_run_config.id}"
+            )
+        task_run_config.save_to_file()
+        return task_run_config
+
+    @app.post(
+        "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/create_eval_config"
+    )
+    async def create_eval_config(
+        project_id: str,
+        task_id: str,
+        eval_id: str,
+        request: CreateEvalConfigRequest,
+    ) -> EvalConfig:
+        eval = eval_from_id(project_id, task_id, eval_id)
+        name = request.name or generate_memorable_name()
+
+        eval_config = EvalConfig(
+            name=name,
+            config_type=request.type,
+            properties=request.properties,
+            model_name=request.model_name,
+            model_provider=request.provider,
+            parent=eval,
+        )
+        eval_config.save_to_file()
+        return eval_config
+
+    # JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though POST would be better
+    @app.get(
+        "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_task_run_eval"
+    )
+    async def run_eval_config(
+        project_id: str,
+        task_id: str,
+        eval_id: str,
+        eval_config_id: str,
+        run_config_ids: list[str] = Query([]),
+        all_run_configs: bool = Query(False),
+    ) -> StreamingResponse:
+        eval_config = eval_config_from_id(project_id, task_id, eval_id, eval_config_id)
+
+        # Load the list of run configs to use. Two options:
+        run_configs: list[TaskRunConfig] = []
+        if all_run_configs:
+            run_configs = task_from_id(project_id, task_id).run_configs()
+        else:
+            if len(run_config_ids) == 0:
+                raise HTTPException(
+                    status_code=400,
+                    detail="No run config ids provided. 
At least one run config id is required.", + ) + run_configs = [ + task_run_config_from_id(project_id, task_id, run_config_id) + for run_config_id in run_config_ids + ] + + eval_runner = EvalRunner( + eval_configs=[eval_config], + run_configs=run_configs, + eval_run_type="task_run_eval", + ) + + return await run_eval_runner_with_status(eval_runner) + + @app.post( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}" + ) + async def set_default_eval_config( + project_id: str, + task_id: str, + eval_id: str, + eval_config_id: str, + ) -> Eval: + eval = eval_from_id(project_id, task_id, eval_id) + eval.current_config_id = eval_config_id + eval.save_to_file() + + return eval + + # JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better + @app.get( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval" + ) + async def run_eval_config_eval( + project_id: str, + task_id: str, + eval_id: str, + ) -> StreamingResponse: + eval = eval_from_id(project_id, task_id, eval_id) + eval_configs = eval.configs() + eval_runner = EvalRunner( + eval_configs=eval_configs, + run_configs=None, + eval_run_type="eval_config_eval", + ) + + return await run_eval_runner_with_status(eval_runner) + + @app.get( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_config/{run_config_id}/results" + ) + async def get_eval_run_results( + project_id: str, + task_id: str, + eval_id: str, + eval_config_id: str, + run_config_id: str, + ) -> EvalRunResult: + eval = eval_from_id(project_id, task_id, eval_id) + eval_config = eval_config_from_id(project_id, task_id, eval_id, eval_config_id) + run_config = task_run_config_from_id(project_id, task_id, run_config_id) + results = [ + run_result + for run_result in eval_config.runs(readonly=True) + if run_result.task_run_config_id == run_config_id + ] + return EvalRunResult( + results=results, + eval=eval, + eval_config=eval_config, + run_config=run_config, + ) + + # This compares run_configs to each other on a given eval_config. Compare to below which compares eval_configs to each other. + @app.get( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/score_summary" + ) + async def get_eval_config_score_summary( + project_id: str, + task_id: str, + eval_id: str, + eval_config_id: str, + ) -> EvalResultSummary: + task = task_from_id(project_id, task_id) + eval = eval_from_id(project_id, task_id, eval_id) + eval_config = eval_config_from_id(project_id, task_id, eval_id, eval_config_id) + task_runs_configs = task.run_configs() + + # Build a set of all the dataset items IDs we expect to have scores for + expected_dataset_ids = dataset_ids_in_filter(task, eval.eval_set_filter_id) + if len(expected_dataset_ids) == 0: + raise HTTPException( + status_code=400, + detail="No dataset ids in eval set filter. Add items to your dataset matching the eval set filter.", + ) + + # save a copy of the expected dataset ids for each run config, we'll update each as we process each eval run + remaining_expected_dataset_ids: Dict[ID_TYPE, Set[ID_TYPE]] = { + run_config.id: set(expected_dataset_ids) for run_config in task_runs_configs + } + # Track how often we are missing scores in a eval_config. 
Should be 0 for a complete eval_config + partial_incomplete_counts: Dict[ID_TYPE, int] = { + run_config.id: 0 for run_config in task_runs_configs + } + + # task_run_config_id -> output_score_json_key -> score/total for calculating the mean score + total_scores: Dict[ID_TYPE, Dict[str, float]] = {} + score_counts: Dict[ID_TYPE, Dict[str, int]] = {} + + for eval_run in eval_config.runs(readonly=True): + if eval_run.task_run_config_id is None: + # This eval_run is not associated with a run_config, so we should not count it + continue + run_config_id = eval_run.task_run_config_id + + # Check if we should count this eval_run. Not every eval_run has to go into the stats: + # - a dataset_id can be removed from the dataset filter (removed a tag) + # - this dataset_id was already counted (not great there are dupes, but shouldn't be double counted if there are) + if eval_run.dataset_id not in remaining_expected_dataset_ids[run_config_id]: + continue + else: + remaining_expected_dataset_ids[run_config_id].remove( + eval_run.dataset_id + ) + + incomplete = False + for output_score in eval.output_scores: + score_key = output_score.json_key() + if run_config_id not in total_scores: + total_scores[run_config_id] = {} + score_counts[run_config_id] = {} + if score_key not in total_scores[run_config_id]: + total_scores[run_config_id][score_key] = 0 + score_counts[run_config_id][score_key] = 0 + if score_key in eval_run.scores: + total_scores[run_config_id][score_key] += eval_run.scores[score_key] + score_counts[run_config_id][score_key] += 1 + else: + # We're missing a required score, so this eval_run is incomplete + incomplete = True + + if incomplete: + partial_incomplete_counts[run_config_id] += 1 + + # Convert to score summaries + results: Dict[ID_TYPE, Dict[str, ScoreSummary]] = {} + for run_config_id, output_scores in total_scores.items(): + results[run_config_id] = {} + for output_score_id, score in output_scores.items(): + count = score_counts[run_config_id][output_score_id] + if count > 0: + results[run_config_id][output_score_id] = ScoreSummary( + mean_score=score / count + ) + + # Calculate the percent of the dataset that has been processed + run_config_percent_complete: Dict[ID_TYPE, float] = {} + for run_config in task_runs_configs: + # Partial incomplete (missing scores), and fully incomplete (no eval_run) + incomplete_count = partial_incomplete_counts[run_config.id] + len( + remaining_expected_dataset_ids[run_config.id] + ) + percent_incomplete = incomplete_count / len(expected_dataset_ids) + run_config_percent_complete[run_config.id] = 1 - percent_incomplete + + return EvalResultSummary( + results=results, + run_config_percent_complete=run_config_percent_complete, + dataset_size=len(expected_dataset_ids), + ) + + # Compared to above, this is comparing all eval configs to each other, not looking at a single eval config + @app.get( + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_configs_score_summary" + ) + async def get_eval_configs_score_summary( + project_id: str, + task_id: str, + eval_id: str, + ) -> EvalConfigCompareSummary: + task = task_from_id(project_id, task_id) + eval = eval_from_id(project_id, task_id, eval_id) + eval_configs = eval.configs(readonly=True) + + # Create a map of score_key -> Task requirement ID + score_key_to_task_requirement_id: Dict[str, ID_TYPE] = {} + for task_requirement in task.requirements: + score_key = string_to_json_key(task_requirement.name) + score_key_to_task_requirement_id[score_key] = task_requirement.id + + # Build a set of all the 
dataset items IDs we expect to have scores for + # Fetch all the dataset items in a filter, and return a map of dataset_id -> TaskRun + filter = dataset_filter_from_id(eval.eval_configs_filter_id) + expected_dataset_items = {run.id: run for run in task.runs() if filter(run)} + expected_dataset_ids = set(expected_dataset_items.keys()) + if len(expected_dataset_ids) == 0: + return EvalConfigCompareSummary( + results={}, + eval_config_percent_complete={}, + dataset_size=0, + fully_rated_count=0, + partially_rated_count=0, + not_rated_count=0, + ) + + # save a copy of the expected dataset ids for each eval config id, we'll update each as we process each eval run + remaining_expected_dataset_ids: Dict[ID_TYPE, Set[ID_TYPE]] = { + eval_config.id: set(expected_dataset_ids) for eval_config in eval_configs + } + + # eval_config_id -> output_score_json_key -> correlation calculator + correlation_calculators: Dict[ID_TYPE, Dict[str, CorrelationCalculator]] = {} + + for eval_config in eval_configs: + for eval_run in eval_config.runs(readonly=True): + dataset_item = expected_dataset_items.get(eval_run.dataset_id, None) + if dataset_item is None: + # A dataset_id can be removed from the dataset filter (ran previously, then removed the tag to remove it from the eval config set filter) + # A dataset_id could be for an run_config, not for comparing eval at all + continue + + # Check if we should count this eval_run. Not every eval_run has to go into the stats: + # Example: this dataset_id was already counted (not great there are dupes, but shouldn't be double counted if there are) + if ( + eval_run.dataset_id + not in remaining_expected_dataset_ids[eval_config.id] + ): + continue + else: + remaining_expected_dataset_ids[eval_config.id].remove( + eval_run.dataset_id + ) + + for output_score in eval.output_scores: + score_key = output_score.json_key() + eval_score: float | None = eval_run.scores.get(score_key, None) + + # Fetch the human eval score from the dataset item + human_score = human_score_from_task_run( + dataset_item, score_key, score_key_to_task_requirement_id + ) + + if human_score is None or eval_score is None: + # This score doesn't have both a human eval and eval score, so we can't compare + continue + + if eval_config.id not in correlation_calculators: + correlation_calculators[eval_config.id] = {} + + calculator = correlation_calculators[eval_config.id].get( + score_key, None + ) + if calculator is None: + calculator = CorrelationCalculator() + correlation_calculators[eval_config.id][score_key] = calculator + + normalized_eval_score = normalize_rating( + eval_score, output_score.type + ) + normalized_human_score = normalize_rating( + human_score, output_score.type + ) + calculator.add_score( + CorrelationScore( + measured_score=eval_score, + human_score=human_score, + normalized_measured_score=normalized_eval_score, + normalized_human_score=normalized_human_score, + ) + ) + + # Convert to score summaries + results: Dict[ID_TYPE, Dict[str, CorrelationResult]] = {} + for eval_config_id in correlation_calculators.keys(): + results[eval_config_id] = {} + for score_key in correlation_calculators[eval_config_id].keys(): + calculator = correlation_calculators[eval_config_id].get( + score_key, None + ) + if calculator is None: + # No scores to calculate correlation for this pair + continue + + correlation_result = calculator.calculate_correlation() + results[eval_config_id][score_key] = correlation_result + + # Calculate the percent of the dataset that has been processed + 
eval_config_percent_complete: Dict[ID_TYPE, float] = {} + for eval_config in eval_configs: + incomplete_count = len(remaining_expected_dataset_ids[eval_config.id]) + percent_incomplete = incomplete_count / len(expected_dataset_ids) + eval_config_percent_complete[eval_config.id] = 1 - percent_incomplete + + # Count how many dataset items have human evals + fully_rated_count, partially_rated_count, not_rated_count = count_human_evals( + list(expected_dataset_items.values()), + eval, + score_key_to_task_requirement_id, + ) + + return EvalConfigCompareSummary( + results=results, + eval_config_percent_complete=eval_config_percent_complete, + dataset_size=len(expected_dataset_ids), + fully_rated_count=fully_rated_count, + partially_rated_count=partially_rated_count, + not_rated_count=not_rated_count, + ) diff --git a/app/desktop/studio_server/finetune_api.py b/app/desktop/studio_server/finetune_api.py index f4e09a43..ad2e4b46 100644 --- a/app/desktop/studio_server/finetune_api.py +++ b/app/desktop/studio_server/finetune_api.py @@ -11,7 +11,7 @@ ) from kiln_ai.adapters.prompt_builders import ( chain_of_thought_prompt, - prompt_builder_from_ui_name, + prompt_builder_from_id, ) from kiln_ai.adapters.provider_tools import ( provider_enabled, @@ -24,9 +24,11 @@ FineTuneStatusType, Task, ) +from kiln_ai.datamodel.dataset_filters import ( + DatasetFilterId, +) from kiln_ai.datamodel.dataset_split import ( AllSplitDefinition, - DatasetFilterType, Train60Test20Val20SplitDefinition, Train80Test10Val10SplitDefinition, Train80Test20SplitDefinition, @@ -73,7 +75,7 @@ class CreateDatasetSplitRequest(BaseModel): """Request to create a dataset split""" dataset_split_type: DatasetSplitType - filter_type: DatasetFilterType + filter_id: DatasetFilterId name: str | None = None description: str | None = None @@ -206,7 +208,7 @@ async def create_dataset_split( name, task, split_definitions, - filter_type=request.filter_type, + filter_id=request.filter_id, description=request.description, ) dataset_split.save_to_file() @@ -340,7 +342,7 @@ def system_message_from_request( detail="System message generator is required when custom system message is not provided", ) try: - prompt_builder = prompt_builder_from_ui_name(system_message_generator, task) + prompt_builder = prompt_builder_from_id(system_message_generator, task) system_message = prompt_builder.build_prompt( include_json_instructions=False ) diff --git a/app/desktop/studio_server/prompt_api.py b/app/desktop/studio_server/prompt_api.py index d43b8760..4e992983 100644 --- a/app/desktop/studio_server/prompt_api.py +++ b/app/desktop/studio_server/prompt_api.py @@ -1,30 +1,31 @@ from fastapi import FastAPI, HTTPException -from kiln_ai.adapters.prompt_builders import prompt_builder_from_ui_name +from kiln_ai.adapters.prompt_builders import prompt_builder_from_id +from kiln_ai.datamodel import PromptId from kiln_server.task_api import task_from_id from pydantic import BaseModel class PromptApiResponse(BaseModel): prompt: str - prompt_builder_name: str - ui_generator_name: str + prompt_id: PromptId def connect_prompt_api(app: FastAPI): - @app.get("/api/projects/{project_id}/task/{task_id}/gen_prompt/{prompt_generator}") + @app.get("/api/projects/{project_id}/task/{task_id}/gen_prompt/{prompt_id}") async def generate_prompt( - project_id: str, task_id: str, prompt_generator: str + project_id: str, + task_id: str, + prompt_id: PromptId, ) -> PromptApiResponse: task = task_from_id(project_id, task_id) try: - prompt_builder = prompt_builder_from_ui_name(prompt_generator, task) 
+ prompt_builder = prompt_builder_from_id(prompt_id, task) prompt = prompt_builder.build_prompt_for_ui() except Exception as e: raise HTTPException(status_code=400, detail=str(e)) return PromptApiResponse( prompt=prompt, - prompt_builder_name=prompt_builder.__class__.prompt_builder_name(), - ui_generator_name=prompt_generator, + prompt_id=prompt_id, ) diff --git a/app/desktop/studio_server/provider_api.py b/app/desktop/studio_server/provider_api.py index 8a7f5917..5336cbac 100644 --- a/app/desktop/studio_server/provider_api.py +++ b/app/desktop/studio_server/provider_api.py @@ -75,6 +75,7 @@ class ModelDetails(BaseModel): name: str supports_structured_output: bool supports_data_gen: bool + supports_logprobs: bool # True if this is a untested model (typically user added). We don't know if these support structured output, data gen, etc. They should appear in their own section in the UI. untested_model: bool = Field(default=False) task_filter: List[str] | None = Field(default=None) @@ -139,6 +140,7 @@ async def get_available_models() -> List[AvailableModels]: name=model.friendly_name, supports_structured_output=provider.supports_structured_output, supports_data_gen=provider.supports_data_gen, + supports_logprobs=provider.supports_logprobs, ) ) @@ -534,6 +536,7 @@ async def available_ollama_models() -> AvailableModels | None: name=model.friendly_name, supports_structured_output=ollama_provider.supports_structured_output, supports_data_gen=ollama_provider.supports_data_gen, + supports_logprobs=False, # Ollama doesn't support logprobs https://github.com/ollama/ollama/issues/2415 ) ) for ollama_model in ollama_connection.untested_models: @@ -543,6 +546,7 @@ async def available_ollama_models() -> AvailableModels | None: name=ollama_model, supports_structured_output=False, supports_data_gen=False, + supports_logprobs=False, untested_model=True, ) ) @@ -595,6 +599,7 @@ def custom_models() -> AvailableModels | None: name=f"{provider_name_from_id(provider_id)}: {model_name}", supports_structured_output=False, supports_data_gen=False, + supports_logprobs=False, untested_model=True, ) ) @@ -626,6 +631,7 @@ def all_fine_tuned_models() -> AvailableModels | None: # YMMV, but we'll assume all fine tuned models support structured output and data gen supports_structured_output=True, supports_data_gen=True, + supports_logprobs=False, task_filter=[str(task.id)], ) ) @@ -727,6 +733,7 @@ def openai_compatible_providers_load_cache() -> OpenAICompatibleProviderCache | name=model.id, supports_structured_output=False, supports_data_gen=False, + supports_logprobs=False, untested_model=True, ) ) diff --git a/app/desktop/studio_server/test_correlation_calculator.py b/app/desktop/studio_server/test_correlation_calculator.py new file mode 100644 index 00000000..c0fca092 --- /dev/null +++ b/app/desktop/studio_server/test_correlation_calculator.py @@ -0,0 +1,246 @@ +import pytest + +from app.desktop.studio_server.correlation_calculator import ( + CorrelationCalculator, + CorrelationScore, +) + + +class TestCorrelationCalculator: + def create_correlation_scores(self, measured, human): + """Helper method to create correlation scores from raw data with normalization""" + scores = [] + + # Calculate normalized values + min_m, max_m = min(measured), max(measured) + min_h, max_h = min(human), max(human) + + for m, h in zip(measured, human): + norm_m = (m - min_m) / (max_m - min_m) if max_m != min_m else 0 + norm_h = (h - min_h) / (max_h - min_h) if max_h != min_h else 0 + scores.append( + CorrelationScore( + measured_score=m, + 
human_score=h, + normalized_measured_score=norm_m, + normalized_human_score=norm_h, + ) + ) + return scores + + @pytest.fixture + def perfect_correlation_data(self): + """Dataset with perfect correlation (r=1.0)""" + measured = list(range(10)) + human = list(range(10)) + return self.create_correlation_scores(measured, human) + + @pytest.fixture + def high_correlation_data(self): + """Dataset with high correlation (r≈0.9)""" + measured = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + human = [1.1, 2.2, 2.9, 3.8, 5.2, 5.8, 7.1, 8.3, 8.7, 10.2] + return self.create_correlation_scores(measured, human) + + @pytest.fixture + def no_correlation_data(self): + """Dataset with no correlation""" + measured = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + human = [5.5, 6.2, 4.8, 7.3, 2.1, 8.9, 3.7, 5.4, 6.8, 4.2] + return self.create_correlation_scores(measured, human) + + @pytest.fixture + def inverse_correlation_data(self): + """Dataset with inverse correlation (r≈-0.9)""" + measured = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + human = [10, 9, 8, 7, 6, 5, 4, 3, 2, 1] + return self.create_correlation_scores(measured, human) + + @pytest.fixture + def single_data_point(self): + """Dataset with only one data point""" + return [ + CorrelationScore( + measured_score=5, + human_score=5, + normalized_measured_score=0.5, + normalized_human_score=0.5, + ) + ] + + @pytest.fixture + def two_data_points(self): + """Dataset with only two data points""" + measured = [1, 10] + human = [2, 9] + return self.create_correlation_scores(measured, human) + + def setup_calculator_with_data(self, data): + """Helper method to create and populate a calculator with data""" + calculator = CorrelationCalculator() + for score in data: + calculator.add_score(score) + return calculator + + def test_add_score(self): + """Test adding scores to the calculator""" + calculator = CorrelationCalculator() + score = CorrelationScore( + measured_score=5, + human_score=6, + normalized_measured_score=0.5, + normalized_human_score=0.6, + ) + + calculator.add_score(score) + assert len(calculator.scores) == 1 + assert calculator.scores[0] == score + + def test_empty_calculator(self): + """Test that calculating correlation with no scores raises an error""" + calculator = CorrelationCalculator() + + with pytest.raises(ValueError, match="No scores to calculate correlation"): + calculator.calculate_correlation() + + def test_perfect_correlation(self, perfect_correlation_data): + """Test correlation calculations with perfectly correlated data""" + calculator = CorrelationCalculator() + for score in perfect_correlation_data: + calculator.add_score(score) + + result = calculator.calculate_correlation() + + # Perfect correlation should have: + # - MAE and MSE of 0 (no error) + # - Correlation coefficients of 1.0 + assert result.mean_absolute_error == 0.0 + assert result.mean_normalized_absolute_error == 0.0 + assert result.mean_squared_error == 0.0 + assert result.mean_normalized_squared_error == 0.0 + assert result.spearman_correlation == pytest.approx(1.0) + assert result.pearson_correlation == pytest.approx(1.0) + assert result.kendalltau_correlation == pytest.approx(1.0) + + def test_high_correlation(self, high_correlation_data): + """Test correlation calculations with highly correlated data""" + calculator = CorrelationCalculator() + for score in high_correlation_data: + calculator.add_score(score) + + result = calculator.calculate_correlation() + + # High correlation should have: + # - Low but non-zero error metrics + # - Correlation coefficients close to 1.0 + assert 0 < 
result.mean_absolute_error < 1.0 + assert 0 < result.mean_normalized_absolute_error < 0.2 + assert 0 < result.mean_squared_error < 1.0 + assert 0 < result.mean_normalized_squared_error < 0.1 + assert result.spearman_correlation > 0.9 + assert result.pearson_correlation > 0.9 + assert result.kendalltau_correlation > 0.8 + + def test_no_correlation(self, no_correlation_data): + """Test correlation calculations with uncorrelated data""" + calculator = CorrelationCalculator() + for score in no_correlation_data: + calculator.add_score(score) + + result = calculator.calculate_correlation() + + # No correlation should have: + # - Higher error metrics + # - Correlation coefficients close to 0 + assert result.mean_absolute_error > 1.0 + assert result.mean_normalized_absolute_error > 0.2 + assert result.mean_squared_error > 2.0 + assert result.mean_normalized_squared_error > 0.1 + assert -0.3 < result.spearman_correlation < 0.3 + assert -0.3 < result.pearson_correlation < 0.3 + assert -0.3 < result.kendalltau_correlation < 0.3 + + def test_inverse_correlation(self, inverse_correlation_data): + """Test correlation calculations with inversely correlated data""" + calculator = CorrelationCalculator() + for score in inverse_correlation_data: + calculator.add_score(score) + + result = calculator.calculate_correlation() + + # Inverse correlation should have: + # - Higher error metrics + # - Correlation coefficients close to -1.0 + assert result.mean_absolute_error > 4.0 + assert result.mean_normalized_absolute_error > 0.5 + assert result.mean_squared_error > 20.0 + assert result.mean_normalized_squared_error > 0.3 + assert result.spearman_correlation < -0.9 + assert result.pearson_correlation < -0.9 + assert result.kendalltau_correlation < -0.9 + + def test_single_data_point(self, single_data_point): + """Test correlation calculations with a single data point""" + calculator = CorrelationCalculator() + for score in single_data_point: + calculator.add_score(score) + + result = calculator.calculate_correlation() + + # Single data point should have: + # - Zero error (since the point matches itself) + # - Correlation coefficients of 0 (as defined in the implementation) + assert result.mean_absolute_error == 0.0 + assert result.mean_normalized_absolute_error == 0.0 + assert result.mean_squared_error == 0.0 + assert result.mean_normalized_squared_error == 0.0 + assert result.spearman_correlation is None + assert result.pearson_correlation is None + assert result.kendalltau_correlation is None + + def test_two_data_points(self, two_data_points): + """Test correlation calculations with two data points""" + calculator = CorrelationCalculator() + for score in two_data_points: + calculator.add_score(score) + + result = calculator.calculate_correlation() + + # Two data points with positive correlation should have: + # - Some error + # - Positive correlation coefficients + assert result.mean_absolute_error == 1.0 + assert result.mean_normalized_absolute_error == 0.0 + assert result.mean_squared_error == 1.0 + assert result.mean_normalized_squared_error == 0.0 + assert result.spearman_correlation == pytest.approx(1.0) + assert result.pearson_correlation == pytest.approx(1.0) + assert result.kendalltau_correlation == pytest.approx(1.0) + + def test_individual_calculation_methods(self, high_correlation_data): + """Test that individual calculation methods match the combined result""" + calculator = CorrelationCalculator() + for score in high_correlation_data: + calculator.add_score(score) + + # Calculate individual 
metrics + mae = calculator.calculate_mean_absolute_error() + # Our spell checker thinks n-m-a-e is a misspelling of name :) + n_mae = calculator.calculate_mean_normalized_absolute_error() + mse = calculator.calculate_mean_squared_error() + nmse = calculator.calculate_mean_normalized_squared_error() + spearman = calculator.calculate_spearman_correlation() + pearson = calculator.calculate_pearson_correlation() + kendall = calculator.calculate_kendalltau_correlation() + + # Calculate combined result + result = calculator.calculate_correlation() + + # Verify they match + assert result.mean_absolute_error == mae + assert result.mean_normalized_absolute_error == n_mae + assert result.mean_squared_error == mse + assert result.mean_normalized_squared_error == nmse + assert result.spearman_correlation == spearman + assert result.pearson_correlation == pearson + assert result.kendalltau_correlation == kendall diff --git a/app/desktop/studio_server/test_data_gen_api.py b/app/desktop/studio_server/test_data_gen_api.py index 1bb39875..80d9dcaf 100644 --- a/app/desktop/studio_server/test_data_gen_api.py +++ b/app/desktop/studio_server/test_data_gen_api.py @@ -160,7 +160,7 @@ def test_save_sample_success_paid_run( input_provider="openai", output_model_name="gpt_4o_mini", output_provider="openai", - prompt_method="basic", + prompt_method="simple_prompt_builder", topic_path=[], # No topic path ) @@ -215,7 +215,7 @@ def test_save_sample_success_with_mock_invoke( input_provider="openai", output_model_name="gpt_4o_mini", output_provider="openai", - prompt_method="basic", + prompt_method="simple_prompt_builder", topic_path=["AI", "Machine Learning", "Deep Learning"], ) @@ -270,7 +270,7 @@ def test_save_sample_success_with_topic_path( input_provider="openai", output_model_name="gpt_4o_mini", output_provider="openai", - prompt_method="basic", + prompt_method="simple_prompt_builder", ) # Act diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py new file mode 100644 index 00000000..58a6e2fc --- /dev/null +++ b/app/desktop/studio_server/test_eval_api.py @@ -0,0 +1,1039 @@ +import json +from dataclasses import dataclass +from typing import Dict, List, Tuple +from unittest.mock import Mock, patch + +import pytest +from fastapi import FastAPI, HTTPException +from fastapi.responses import StreamingResponse +from fastapi.testclient import TestClient +from kiln_ai.adapters.ml_model_list import ModelProviderName +from kiln_ai.datamodel import ( + BasePrompt, + DataSource, + DataSourceType, + Priority, + Project, + RequirementRating, + Task, + TaskOutput, + TaskOutputRating, + TaskRequirement, + TaskRun, +) +from kiln_ai.datamodel.eval import ( + Eval, + EvalConfig, + EvalConfigType, + EvalOutputScore, + EvalRun, + EvalTemplateId, +) +from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig + +from app.desktop.studio_server.eval_api import ( + CreateEvalConfigRequest, + CreateEvaluatorRequest, + connect_evals_api, + eval_config_from_id, + task_run_config_from_id, +) + + +@pytest.fixture +def app(): + app = FastAPI() + connect_evals_api(app) + return app + + +@pytest.fixture +def client(app): + return TestClient(app) + + +@pytest.fixture +def mock_task(tmp_path): + project = Project( + id="project1", + name="Test Project", + path=tmp_path / "project.kiln", + ) + project.save_to_file() + task = Task( + id="task1", + name="Test Task", + description="Test Description", + instruction="Test Instructions", + path=tmp_path / "task.kiln", + requirements=[ + 
TaskRequirement( + name="score1", + description="desc1", + instruction="inst1", + priority=Priority.p1, + type="five_star", + ), + ], + parent=project, + ) + task.save_to_file() + return task + + +@pytest.fixture +def mock_eval(mock_task): + eval = Eval( + id="eval1", + name="Test Eval", + description="Test Description", + template=EvalTemplateId.bias, + output_scores=[ + EvalOutputScore(name="score1", description="desc1", type="five_star"), + EvalOutputScore( + name="overall_rating", description="desc2", type="five_star" + ), + ], + eval_set_filter_id="tag::eval_set", + eval_configs_filter_id="tag::golden", + parent=mock_task, + ) + eval.save_to_file() + return eval + + +@pytest.fixture +def mock_eval_config(mock_eval): + eval_config = EvalConfig( + id="eval_config1", + name="Test Eval Config", + config_type=EvalConfigType.g_eval, + properties={"eval_steps": ["step1", "step2"]}, + parent=mock_eval, + model_name="gpt-4", + model_provider="openai", + prompt=BasePrompt( + name="test", + prompt="base prompt", + chain_of_thought_instructions="cot prompt", + ), + ) + eval_config.save_to_file() + return eval_config + + +@pytest.fixture +def mock_run_config(mock_task): + run_config = TaskRunConfig( + parent=mock_task, + id="run_config1", + name="Test Run Config", + description="Test Description", + run_config_properties=RunConfigProperties( + model_name="gpt-4", + model_provider_name="openai", + prompt_id="simple_chain_of_thought_prompt_builder", + ), + ) + run_config.save_to_file() + return run_config + + +@pytest.fixture +def mock_task_from_id(mock_task): + with patch("app.desktop.studio_server.eval_api.task_from_id") as mock: + mock.return_value = mock_task + yield mock + + +def test_get_evals_success(client, mock_task, mock_task_from_id, mock_eval): + mock_task_from_id.return_value = mock_task + + response = client.get("/api/projects/project1/tasks/task1/evals") + + assert response.status_code == 200 + result = response.json() + assert len(result) == 1 + assert result[0]["id"] == "eval1" + assert result[0]["name"] == "Test Eval" + mock_task_from_id.assert_called_once_with("project1", "task1") + + +def test_get_eval_success(client, mock_task, mock_task_from_id, mock_eval): + mock_task_from_id.return_value = mock_task + + response = client.get("/api/projects/project1/tasks/task1/eval/eval1") + + assert response.status_code == 200 + result = response.json() + assert result["id"] == "eval1" + assert result["name"] == "Test Eval" + mock_task_from_id.assert_called_once_with("project1", "task1") + + +def test_get_eval_not_found(client, mock_task, mock_task_from_id): + mock_task_from_id.return_value = mock_task + + response = client.get("/api/projects/project1/tasks/task1/eval/non_existent") + + assert response.status_code == 404 + assert response.json()["detail"] == "Eval not found. 
ID: non_existent" + + +@pytest.fixture +def valid_evaluator_request(): + return CreateEvaluatorRequest( + name="Test Evaluator", + description="Test Description", + template=None, + output_scores=[ + EvalOutputScore(name="score1", description="desc1", type="five_star"), + ], + eval_set_filter_id="tag::eval_set", + eval_configs_filter_id="tag::golden", + ) + + +@pytest.fixture +def valid_eval_config_request(): + return CreateEvalConfigRequest( + name="Test Eval Config", + type=EvalConfigType.g_eval, + properties={"eval_steps": ["step1", "step2"]}, + model_name="gpt-4", + provider=ModelProviderName.openai, + prompt_id="simple_chain_of_thought_prompt_builder", + ) + + +@pytest.mark.asyncio +async def test_create_evaluator( + client, mock_task_from_id, valid_evaluator_request, mock_task +): + mock_task_from_id.return_value = mock_task + + with patch.object(Eval, "save_to_file") as mock_save: + response = client.post( + "/api/projects/project1/tasks/task1/create_evaluator", + json=valid_evaluator_request.model_dump(), + ) + + assert response.status_code == 200 + result = response.json() + assert result["name"] == valid_evaluator_request.name + assert result["description"] == valid_evaluator_request.description + mock_save.assert_called_once() + + +@pytest.mark.asyncio +async def test_create_task_run_config_with_freezing( + client, mock_task_from_id, mock_task +): + mock_task_from_id.return_value = mock_task + + with ( + patch( + "app.desktop.studio_server.eval_api.generate_memorable_name" + ) as mock_generate_memorable_name, + ): + mock_generate_memorable_name.return_value = "Custom Name" + + response = client.post( + "/api/projects/project1/tasks/task1/task_run_config", + json={ + "name": "Test Task Run Config", + "description": "Test Description", + "model_name": "gpt-4o", + "model_provider_name": "openai", + "prompt_id": "simple_chain_of_thought_prompt_builder", + }, + ) + + assert response.status_code == 200 + result = response.json() + assert result["name"] == "Test Task Run Config" + assert result["description"] == "Test Description" + assert result["run_config_properties"]["model_name"] == "gpt-4o" + assert result["run_config_properties"]["model_provider_name"] == "openai" + assert ( + result["run_config_properties"]["prompt_id"] + == "task_run_config::project1::task1::" + result["id"] + ) + assert result["prompt"]["name"] == "Custom Name" + assert ( + result["prompt"]["description"] + == "Frozen copy of prompt 'simple_chain_of_thought_prompt_builder', created for evaluations." + ) + # Fetch it from API + fetch_response = client.get("/api/projects/project1/tasks/task1/task_run_configs") + assert fetch_response.status_code == 200 + configs = fetch_response.json() + assert len(configs) == 1 + assert configs[0]["id"] == result["id"] + assert configs[0]["name"] == result["name"] + assert configs[0]["prompt"]["name"] == "Custom Name" + assert configs[0]["prompt"]["description"] == ( + "Frozen copy of prompt 'simple_chain_of_thought_prompt_builder', created for evaluations." 
+ ) + assert configs[0]["run_config_properties"]["prompt_id"] == ( + "task_run_config::project1::task1::" + result["id"] + ) + + +@pytest.mark.asyncio +async def test_create_task_run_config_without_freezing( + client, mock_task_from_id, mock_task +): + mock_task_from_id.return_value = mock_task + + with ( + patch( + "app.desktop.studio_server.eval_api.generate_memorable_name" + ) as mock_generate_memorable_name, + ): + mock_generate_memorable_name.return_value = "Custom Name" + + response = client.post( + "/api/projects/project1/tasks/task1/task_run_config", + json={ + "name": "Test Task Run Config", + "description": "Test Description", + "model_name": "gpt-4o", + "model_provider_name": "openai", + "prompt_id": "id::prompt_123", + }, + ) + + assert response.status_code == 200 + result = response.json() + assert result["name"] == "Test Task Run Config" + assert result["description"] == "Test Description" + assert result["run_config_properties"]["model_name"] == "gpt-4o" + assert result["run_config_properties"]["model_provider_name"] == "openai" + assert result["run_config_properties"]["prompt_id"] == "id::prompt_123" + assert result["prompt"] is None + + +@pytest.mark.asyncio +async def test_create_eval_config( + client, mock_task_from_id, valid_eval_config_request, mock_eval, mock_task +): + mock_task_from_id.return_value = mock_task + + with ( + patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id, + ): + mock_eval_from_id.return_value = mock_eval + + response = client.post( + "/api/projects/project1/tasks/task1/eval/eval1/create_eval_config", + json=valid_eval_config_request.model_dump(), + ) + + assert response.status_code == 200 + result = response.json() + assert result["name"] == valid_eval_config_request.name + assert result["config_type"] == valid_eval_config_request.type + assert result["properties"] == valid_eval_config_request.properties + assert result["model_name"] == valid_eval_config_request.model_name + assert result["model_provider"] == valid_eval_config_request.provider + + # Fetch disk + assert len(mock_eval.configs()) == 1 + config = mock_eval.configs()[0] + assert config.config_type == valid_eval_config_request.type + assert config.properties == valid_eval_config_request.properties + assert config.model_name == valid_eval_config_request.model_name + assert config.model_provider == valid_eval_config_request.provider + assert config.properties["eval_steps"][0] == "step1" + assert config.properties["eval_steps"][1] == "step2" + + +def test_get_eval_config( + client, mock_task_from_id, mock_eval, mock_task, mock_eval_config +): + mock_task_from_id.return_value = mock_task + + with patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id: + mock_eval_from_id.return_value = mock_eval + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1" + ) + + assert response.status_code == 200 + config = response.json() + assert isinstance(config, dict) + + assert config["config_type"] == mock_eval_config.config_type + assert config["properties"] == mock_eval_config.properties + assert config["model_name"] == mock_eval_config.model_name + assert config["model_provider"] == mock_eval_config.model_provider + + mock_eval_from_id.assert_called_once_with("project1", "task1", "eval1") + + +def test_get_eval_configs( + client, mock_task_from_id, mock_eval, mock_task, mock_eval_config +): + mock_task_from_id.return_value = mock_task + + with patch("app.desktop.studio_server.eval_api.eval_from_id") as 
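# Illustrative sketch of the prompt-freezing behaviour the two task_run_config tests
# above pin down (hypothetical helper, not the endpoint's code): generator-style
# prompt ids get a frozen copy saved for evals and the run config is repointed at it,
# while ids of the form "id::..." already reference a saved prompt and are left as-is.
def freeze_prompt_if_needed(project_id, task_id, run_config_id, prompt_id, built_prompt):
    if prompt_id.startswith("id::"):
        return prompt_id, None  # nothing to freeze; the response's "prompt" is None
    frozen = {
        # the API names the copy via generate_memorable_name(); "Custom Name" is the mocked value
        "name": "Custom Name",
        "description": f"Frozen copy of prompt '{prompt_id}', created for evaluations.",
        "prompt": built_prompt,
    }
    effective_prompt_id = f"task_run_config::{project_id}::{task_id}::{run_config_id}"
    return effective_prompt_id, frozen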
mock_eval_from_id: + mock_eval_from_id.return_value = mock_eval + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1/eval_configs" + ) + + assert response.status_code == 200 + configs = response.json() + assert isinstance(configs, list) + assert len(configs) == 1 + + config = configs[0] + assert config["config_type"] == mock_eval_config.config_type + assert config["properties"] == mock_eval_config.properties + assert config["model_name"] == mock_eval_config.model_name + assert config["model_provider"] == mock_eval_config.model_provider + + mock_eval_from_id.assert_called_once_with("project1", "task1", "eval1") + + +@pytest.mark.asyncio +async def test_run_eval_config( + client, mock_task_from_id, mock_task, mock_eval, mock_eval_config, mock_run_config +): + mock_task_from_id.return_value = mock_task + + # Mock progress updates + progress_updates = [ + Mock(complete=1, total=3, errors=0), + Mock(complete=2, total=3, errors=0), + Mock(complete=3, total=3, errors=0), + ] + + # Create async generator for mock progress + async def mock_run(): + for progress in progress_updates: + yield progress + + with ( + patch( + "app.desktop.studio_server.eval_api.task_run_config_from_id" + ) as mock_run_config_from_id, + patch("app.desktop.studio_server.eval_api.EvalRunner") as MockEvalRunner, + ): + mock_run_config_from_id.return_value = mock_run_config + mock_eval_runner = Mock() + mock_eval_runner.run.return_value = mock_run() + MockEvalRunner.return_value = mock_eval_runner + + # Make request with specific run_config_ids + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run_task_run_eval", + params={"run_config_ids": ["run_config1", "run_config2"]}, + ) + + assert response.status_code == 200 + + # Parse SSE messages + messages = [msg for msg in response.iter_lines() if msg] + + # Should have 4 messages: 3 progress updates and 1 complete + assert len(messages) == 4 + + # Check progress messages + for i, msg in enumerate(messages[:-1]): + assert msg.startswith("data: ") + data = json.loads(msg.split("data: ")[1]) + assert data["progress"] == i + 1 + assert data["total"] == 3 + assert data["errors"] == 0 + + # Check complete message + assert messages[-1] == "data: complete" + + +@pytest.mark.asyncio +async def test_run_eval_config_no_run_configs_error( + client, mock_task_from_id, mock_task, mock_eval, mock_eval_config +): + mock_task_from_id.return_value = mock_task + + with patch( + "app.desktop.studio_server.eval_api.eval_config_from_id" + ) as mock_eval_config_from_id: + mock_eval_config_from_id.return_value = mock_eval_config + + # Make request with no run_config_ids and all_run_configs=False + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run_task_run_eval" + ) + + assert response.status_code == 400 + assert ( + response.json()["detail"] + == "No run config ids provided. At least one run config id is required." + ) + + +@pytest.mark.asyncio +async def test_eval_config_from_id( + client, mock_task_from_id, mock_task, mock_eval, mock_eval_config +): + mock_task_from_id.return_value = mock_task + + eval_config = eval_config_from_id("project1", "task1", "eval1", "eval_config1") + + assert eval_config.id == "eval_config1" + assert eval_config.name == "Test Eval Config" + assert eval_config.config_type == EvalConfigType.g_eval + assert eval_config.properties == {"eval_steps": ["step1", "step2"]} + + with pytest.raises(HTTPException, match="Eval config not found. 
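# Illustrative client-side parser for the progress stream test_run_eval_config asserts
# above (a sketch, not code from this patch): progress events arrive as "data: {json}"
# lines and the stream ends with a literal "data: complete".
import json

def parse_progress_stream(lines):
    """Yield progress dicts from SSE lines; stop at the final 'data: complete' message."""
    for line in lines:
        if not line.startswith("data: "):
            continue  # skip blanks / keep-alives
        payload = line[len("data: "):]
        if payload == "complete":
            return
        yield json.loads(payload)  # e.g. {"progress": 2, "total": 3, "errors": 0}

# Usage sketch: list(parse_progress_stream(response.iter_lines())) -> three progress dicts.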
ID: non_existent"): + eval_config_from_id("project1", "task1", "eval1", "non_existent") + + +@pytest.mark.asyncio +async def test_task_run_config_from_id( + client, mock_task_from_id, mock_task, mock_run_config +): + mock_task_from_id.return_value = mock_task + + run_config = task_run_config_from_id("project1", "task1", "run_config1") + + assert run_config.id == "run_config1" + assert run_config.name == "Test Run Config" + assert run_config.description == "Test Description" + + with pytest.raises( + HTTPException, match="Task run config not found. ID: non_existent" + ): + task_run_config_from_id("project1", "task1", "non_existent") + + +@pytest.fixture +def mock_eval_for_score_summary(): + eval = Mock(spec=Eval) + eval.output_scores = [ + EvalOutputScore(name="accuracy", description="Test accuracy", type="pass_fail"), + EvalOutputScore( + name="relevance", description="Test relevance", type="pass_fail" + ), + ] + eval.eval_set_filter_id = "tag::eval_set" + return eval + + +@pytest.fixture +def mock_eval_config_for_score_summary(): + config = Mock(spec=EvalConfig) + + scores: Tuple[str, str, Dict[str, float]] = [ + # Run 1 - normal + ("run1", "dataset_id_1", {"accuracy": 0.8, "relevance": 0.9}), + ("run1", "dataset_id_2", {"accuracy": 0.6, "relevance": 0.7}), + # Run 2 - only 1 score, should be 0.5 complete + ("run2", "dataset_id_1", {"accuracy": 0.9, "relevance": 0.85}), + # Run 3 - no valid scores, 0.0 complete + ("run3", "dataset_id_1", {"other": 0.5}), + # Run 4 - Partial incomplete doesn't divide by zero, still 0.0 complete + ("run4", "dataset_id_1", {"accuracy": 0.5}), + # Run 5 - duplicate dataset_id not double counted, item not in dataset filter ignored + ("run5", "dataset_id_1", {"accuracy": 0.8, "relevance": 0.9}), + ("run5", "dataset_id_1", {"accuracy": 0.8, "relevance": 0.9}), + ("run5", "dataset_id_2", {"accuracy": 0.6, "relevance": 0.7}), + ("run5", "not_in_filter", {"accuracy": 0.1, "relevance": 0.1}), + ] + runs = [] + + id = 0 + for run_id, dataset_id, score in scores: + id += 1 + runs.append( + EvalRun( + task_run_config_id=run_id, + scores=score, + input="input", + output="output", + dataset_id=dataset_id, + ) + ) + + config.runs.return_value = runs + return config + + +@pytest.mark.asyncio +async def test_get_eval_config_score_summary( + client, mock_eval_for_score_summary, mock_eval_config_for_score_summary +): + with ( + patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id, + patch( + "app.desktop.studio_server.eval_api.dataset_ids_in_filter" + ) as mock_dataset_ids_in_filter, + patch( + "app.desktop.studio_server.eval_api.eval_config_from_id" + ) as mock_eval_config_from_id, + patch("app.desktop.studio_server.eval_api.task_from_id") as mock_task_from_id, + ): + mock_eval_from_id.return_value = mock_eval_for_score_summary + mock_eval_config_from_id.return_value = mock_eval_config_for_score_summary + mock_dataset_ids_in_filter.return_value = { + "dataset_id_1", + "dataset_id_2", + } + + mock_task = Mock(spec=Task) + mock_task.run_configs.return_value = [ + Mock(spec=TaskRunConfig, id="run1"), + Mock(spec=TaskRunConfig, id="run2"), + Mock(spec=TaskRunConfig, id="run3"), + Mock(spec=TaskRunConfig, id="run4"), + Mock(spec=TaskRunConfig, id="run5"), + ] + mock_task_from_id.return_value = mock_task + + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/score_summary" + ) + + assert response.status_code == 200 + top_level_result = response.json() + + # Verify the structure of the response + assert 
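# Sketch of the aggregation the score_summary assertions below pin down (hypothetical
# helper, not the endpoint's implementation): per task run config, average each output
# score over the unique dataset items in the eval set filter, and report
# percent_complete as the fraction of those items that have every score present.
from collections import defaultdict

def summarize(eval_runs, score_names, dataset_ids):
    # eval_runs: iterable of (task_run_config_id, dataset_id, {score_name: value})
    by_run = defaultdict(dict)
    for run_id, ds_id, scores in eval_runs:
        if ds_id in dataset_ids:                      # items outside the filter are ignored
            by_run[run_id].setdefault(ds_id, scores)  # duplicate dataset ids counted once
    results, percent_complete = {}, {}
    for run_id, items in by_run.items():
        totals, counts, complete = defaultdict(float), defaultdict(int), 0
        for scores in items.values():
            if all(name in scores for name in score_names):
                complete += 1
            for name in score_names:
                if name in scores:
                    totals[name] += scores[name]
                    counts[name] += 1
        results[run_id] = {n: {"mean_score": totals[n] / counts[n]} for n in counts}
        percent_complete[run_id] = complete / len(dataset_ids)
    return results, percent_complete

# e.g. with the mock runs above and dataset_ids = {"dataset_id_1", "dataset_id_2"},
# run1 averages to accuracy 0.7 / relevance 0.8 with percent_complete 1.0.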
"results" in top_level_result + results = top_level_result["results"] + assert "run_config_percent_complete" in top_level_result + run_config_percent_complete = top_level_result["run_config_percent_complete"] + assert "dataset_size" in top_level_result + assert top_level_result["dataset_size"] == 2 + + # Check average scores for run1 + assert results["run1"]["accuracy"]["mean_score"] == 0.7 # (0.8 + 0.6) / 2 + assert results["run1"]["relevance"]["mean_score"] == 0.8 # Only one valid score + assert run_config_percent_complete["run1"] == 1.0 + + # Check average scores for run2 + assert results["run2"]["accuracy"]["mean_score"] == 0.9 + assert results["run2"]["relevance"]["mean_score"] == 0.85 + assert run_config_percent_complete["run2"] == 0.5 + + # run 3 has non valid scores + assert results["run3"] == {} + assert run_config_percent_complete["run3"] == 0.0 + + # run 4 has no scores + assert results["run4"]["accuracy"]["mean_score"] == 0.5 + assert "relevance" not in results["run4"] + assert run_config_percent_complete["run4"] == 0.0 + + # Check average scores for run5 - duplicate dataset_id not double counted + assert results["run5"]["accuracy"]["mean_score"] == 0.7 # (0.8 + 0.6) / 2 + assert results["run5"]["relevance"]["mean_score"] == 0.8 # Only one valid score + assert run_config_percent_complete["run5"] == 1.0 + + # Verify the mocks were called correctly + mock_eval_from_id.assert_called_once_with("project1", "task1", "eval1") + mock_eval_config_from_id.assert_called_once_with( + "project1", "task1", "eval1", "eval_config1" + ) + mock_eval_config_for_score_summary.runs.assert_called_once_with(readonly=True) + mock_dataset_ids_in_filter.assert_called_once_with(mock_task, "tag::eval_set") + + +@pytest.mark.asyncio +async def test_get_eval_run_results( + client, + mock_task_from_id, + mock_task, + mock_eval, + mock_eval_config, + mock_run_config, +): + mock_task_from_id.return_value = mock_task + + eval_run = EvalRun( + task_run_config_id="run_config1", + scores={"score1": 3.0, "overall_rating": 1.0}, + input="input", + output="output", + dataset_id="dataset_id1", + parent=mock_eval_config, + ) + eval_run.save_to_file() + + # Test successful retrieval + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1" + "/eval_config/eval_config1/run_config/run_config1/results" + ) + + assert response.status_code == 200 + data = response.json() + + # Verify response structure + assert "results" in data + assert "eval" in data + assert "eval_config" in data + assert "run_config" in data + + # Verify results content + assert len(data["results"]) == 1 + assert data["results"][0]["id"] == eval_run.id + assert data["results"][0]["task_run_config_id"] == mock_run_config.id + assert data["results"][0]["scores"] == {"score1": 3.0, "overall_rating": 1.0} + + # Test with invalid eval ID + response = client.get( + "/api/projects/project1/tasks/task1/eval/invalid_eval" + "/eval_config/eval_config1/run_config/run_config1/results" + ) + assert response.status_code == 404 + + # Test with invalid eval config ID + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1" + "/eval_config/invalid_config/run_config/run_config1/results" + ) + assert response.status_code == 404 + + # Test with invalid run config ID + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1" + "/eval_config/eval_config1/run_config/invalid_run_config/results" + ) + assert response.status_code == 404 + + +@pytest.mark.asyncio +async def test_get_eval_config_compare_summary( + client, + 
mock_task_from_id, + mock_task, + mock_eval, + mock_eval_config, + mock_run_config, +): + mock_task_from_id.return_value = mock_task + + # structed data to make it easier to generate test cases. + @dataclass + class EvalCondigSummaryTestData: + human_overall_rating: float | None + score1_overall_rating: float | None + eval_overall_rating: float + eval__score1_rating: float + eval_config_id: str + skip_eval_run: bool = False + skip_golden_tag: bool = False + + test_data: List[EvalCondigSummaryTestData] = [ + # Test 1: ec1 + # Normal run, with some data to check calulations on a sinlgle run + EvalCondigSummaryTestData( + human_overall_rating=5.0, + score1_overall_rating=2.0, + eval_overall_rating=1.0, + eval__score1_rating=3.5, + eval_config_id="ec1", + ), + # Should be ignored as it's not in the eval set filter (golden tag). Would mess up the scores of eval_config1 if included + EvalCondigSummaryTestData( + human_overall_rating=5.0, + score1_overall_rating=5.0, + eval_overall_rating=4.0, + eval__score1_rating=4.0, + eval_config_id="ec1", + skip_golden_tag=True, + ), + # Test 2: ec2 - Test multiple, and correct averaging + EvalCondigSummaryTestData( + human_overall_rating=5.0, + score1_overall_rating=5.0, + eval_overall_rating=4.0, + eval__score1_rating=4.0, + eval_config_id="ec2", + ), + EvalCondigSummaryTestData( + human_overall_rating=5.0, + score1_overall_rating=1.0, + eval_overall_rating=3.0, + eval__score1_rating=3.0, + eval_config_id="ec2", + ), + # Test 3: Dataset item that has partial human rating + EvalCondigSummaryTestData( + human_overall_rating=5.0, + score1_overall_rating=None, + eval_overall_rating=3.0, + eval__score1_rating=3.0, + eval_config_id="ec3", + ), + # Test 4: Dataset item that has no human rating + EvalCondigSummaryTestData( + human_overall_rating=None, + score1_overall_rating=None, + eval_overall_rating=3.0, + eval__score1_rating=3.0, + eval_config_id="ec4", + ), + # Test 5: skipping eval run should lower the percent complete + EvalCondigSummaryTestData( + human_overall_rating=5.0, + score1_overall_rating=5.0, + eval_overall_rating=4.0, + eval__score1_rating=4.0, + eval_config_id="ec5", + skip_eval_run=True, + ), + ] + + # Count items that don't have skip_golden_tag set to True + total_in_dataset = sum(1 for x in test_data if not x.skip_golden_tag) + + eval_configs_by_id: Dict[str, EvalConfig] = {} + + assert len(mock_task.requirements) == 1 + assert mock_task.requirements[0].name == "score1" + score1_requirement_id = mock_task.requirements[0].id + for test_case in test_data: + # create eval config if it doesn't exist + eval_config = eval_configs_by_id.get(test_case.eval_config_id) + if eval_config is None: + eval_config = EvalConfig( + id=test_case.eval_config_id, + name="Test Eval Config", + config_type=EvalConfigType.g_eval, + properties={"eval_steps": ["step1", "step2"]}, + parent=mock_eval, + model_name="gpt-4", + model_provider="openai", + prompt=BasePrompt( + name="test", + prompt="base prompt", + chain_of_thought_instructions="cot prompt", + ), + ) + eval_config.save_to_file() + eval_configs_by_id[test_case.eval_config_id] = eval_config + + tags = ["golden"] + if test_case.skip_golden_tag: + tags = [] + + ratings = {} + if test_case.score1_overall_rating is not None: + ratings[score1_requirement_id] = RequirementRating( + value=test_case.score1_overall_rating, + type="five_star", + ) + + task_run = TaskRun( + output=TaskOutput( + output="Test Output", + source=DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt-4", + 
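# Illustrative sketch (hypothetical helpers, not the server code) of the two quantities
# the eval_configs_score_summary assertions below check:
# 1) how golden-set items appear to be bucketed into fully / partially / not rated, and
# 2) how the "normalized" error metrics follow from five_star scores, assuming a
#    (x - 1) / 4 mapping onto [0, 1], which matches the hand calculations in the
#    comments below (e.g. 1.5 / 4 = 0.375 and 0.375 ** 2 = 0.140625).

def rating_bucket(overall_rating, requirement_ratings, required_score_names):
    # Fully rated: human overall rating plus every required score rating present.
    # Not rated: nothing present. Anything in between: partially rated.
    present = int(overall_rating is not None) + sum(
        1 for name in required_score_names if name in requirement_ratings
    )
    expected = 1 + len(required_score_names)
    if present == expected:
        return "fully_rated"
    if present == 0:
        return "not_rated"
    return "partially_rated"

def norm(x: float) -> float:
    return (x - 1) / 4  # five_star scores 1..5 mapped to 0..1

def error_stats(pairs):
    # pairs: list of (eval_score, human_score) for one eval config and one output score
    n = len(pairs)
    return {
        "mean_squared_error": sum((e - h) ** 2 for e, h in pairs) / n,
        "mean_absolute_error": sum(abs(e - h) for e, h in pairs) / n,
        "mean_normalized_squared_error": sum((norm(e) - norm(h)) ** 2 for e, h in pairs) / n,
        "mean_normalized_absolute_error": sum(abs(norm(e) - norm(h)) for e, h in pairs) / n,
    }

# e.g. error_stats([(3.5, 5.0)]) reproduces ec1's score1 row (2.25, 1.5, 0.140625, 0.375),
# and error_stats([(4.0, 5.0), (3.0, 5.0)]) reproduces ec2's overall_rating row.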
"model_provider": "openai", + "adapter_name": "langchain_adapter", + }, + ), + rating=TaskOutputRating( + value=test_case.human_overall_rating, + requirement_ratings=ratings, + ), + ), + input="Test Input", + input_source=DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt-4", + "model_provider": "openai", + "adapter_name": "langchain_adapter", + }, + ), + tags=tags, + parent=mock_task, + ) + task_run.save_to_file() + + if test_case.skip_eval_run: + continue + + eval_run = EvalRun( + task_run_config_id="run_config1", + scores={ + "score1": test_case.eval__score1_rating, + "overall_rating": test_case.eval_overall_rating, + }, + input="input", + output="output", + dataset_id=task_run.id, + parent=eval_config, + ) + eval_run.save_to_file() + + # Test successful retrieval + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1/eval_configs_score_summary" + ) + + assert response.status_code == 200 + data = response.json() + + assert "results" in data + results = data["results"] + assert isinstance(results, dict) + + assert "eval_config_percent_complete" in data + eval_config_percent_complete = data["eval_config_percent_complete"] + assert isinstance(eval_config_percent_complete, dict) + + # check the counts + assert data["fully_rated_count"] == 4 + assert data["partially_rated_count"] == 1 + assert data["not_rated_count"] == 1 + assert data["dataset_size"] == total_in_dataset + + # Test case 1: 1 item should be included, manually calculated scores, should exclude a second item that isn't in the eval config set filter + assert results["ec1"] == { + "overall_rating": { + "mean_squared_error": 16.0, # error 4.0^2 + "mean_absolute_error": 4.0, # error 4.0 + "mean_normalized_squared_error": 1, # max error: 1 v 5 + "mean_normalized_absolute_error": 1, # max error: 1 v 5 + "spearman_correlation": None, # Not enough data + "pearson_correlation": None, + "kendalltau_correlation": None, + }, + "score1": { + "mean_squared_error": 2.25, # error (3.5-5.0)^2 + "mean_absolute_error": 1.5, # error 1.5 + "mean_normalized_squared_error": 0.140625, # hand calc + "mean_normalized_absolute_error": 0.375, # 1.5/4 + "spearman_correlation": None, # Not enough data + "pearson_correlation": None, # Not enough data + "kendalltau_correlation": None, # Not enough data + }, + } + # 1 of total_in_dataset eval configs are are in ec1 test + assert eval_config_percent_complete["ec1"] == pytest.approx(1 / total_in_dataset) + + # Test case 2: check proper averaging + assert results["ec2"] == { + "overall_rating": { + "mean_squared_error": 2.5, # error (1^2 + 2^2) / 2 + "mean_absolute_error": 1.5, # (1+2)/2 + "mean_normalized_squared_error": 0.15625, # (0.25^2 + 0.5^2) / 2 + "mean_normalized_absolute_error": 0.375, # (0.25 + 0.5) / 2 + "spearman_correlation": None, + "pearson_correlation": None, + "kendalltau_correlation": None, + }, + "score1": { + "mean_squared_error": 2.5, # (1^2+2^2)/2 + "mean_absolute_error": 1.5, # (1+2)/2 + "mean_normalized_squared_error": 0.15625, # (0.25^2 + 0.5^2) / 2 + "mean_normalized_absolute_error": 0.375, # (0.25 + 0.5) / 2 + "spearman_correlation": 0.9999999999999999, + "pearson_correlation": 1, + "kendalltau_correlation": 1, + }, + } + # 2 of total_in_dataset eval configs are are in ec2 test + assert eval_config_percent_complete["ec2"] == pytest.approx(2 / total_in_dataset) + + # Test case 3: Check partials still calculate available scores + assert results["ec3"] == { + "overall_rating": { + "mean_squared_error": 4, + "mean_absolute_error": 2, + 
"mean_normalized_squared_error": 0.25, + "mean_normalized_absolute_error": 0.5, + "spearman_correlation": None, + "pearson_correlation": None, + "kendalltau_correlation": None, + }, + } + # 2 of total_in_dataset eval configs are are in ec2 test + assert eval_config_percent_complete["ec3"] == pytest.approx(1 / total_in_dataset) + + # Test case 4: Check no rating is empty results + assert results.get("ec4", {}) == {} + assert eval_config_percent_complete["ec4"] == pytest.approx(1 / total_in_dataset) + + # Test case 5: Check skipping eval run lowers the percent complete + assert eval_config_percent_complete["ec5"] == pytest.approx(0 / total_in_dataset) + + +@pytest.mark.asyncio +async def test_run_eval_config_eval( + client, mock_task_from_id, mock_task, mock_eval, mock_eval_config +): + mock_task_from_id.return_value = mock_task + + # Create a mock response for run_eval_runner_with_status + mock_response = StreamingResponse( + content=iter([b"data: test\n\n"]), media_type="text/event-stream" + ) + + with patch( + "app.desktop.studio_server.eval_api.run_eval_runner_with_status" + ) as mock_run_eval: + # Set up the mock to return our mock response + mock_run_eval.return_value = mock_response + + # Call the endpoint + response = client.get( + "/api/projects/project1/tasks/task1/eval/eval1/run_eval_config_eval" + ) + + # Verify the response + assert response.status_code == 200 + + # Verify run_eval_runner_with_status was called with correct parameters + mock_run_eval.assert_called_once() + + # Get the EvalRunner that was passed to run_eval_runner_with_status + eval_runner = mock_run_eval.call_args[0][0] + + # Verify the EvalRunner was configured correctly + assert len(eval_runner.eval_configs) == 1 + assert eval_runner.eval_configs[0].id == mock_eval_config.id + assert eval_runner.run_configs is None + assert eval_runner.eval_run_type == "eval_config_eval" + + +@pytest.mark.asyncio +async def test_set_current_eval_config( + client, mock_task_from_id, mock_task, mock_eval, mock_eval_config +): + """Test setting the current eval config for an evaluation.""" + mock_task_from_id.return_value = mock_task + + # Get the eval before updating to verify the change + response = client.get("/api/projects/project1/tasks/task1/eval/eval1") + assert response.status_code == 200 + eval_before = response.json() + + # The current_config_id might be None or different initially + initial_config_id = eval_before.get("current_config_id") + assert initial_config_id is None + + # Set the current eval config + with patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id: + mock_eval_from_id.return_value = mock_eval + response = client.post( + "/api/projects/project1/tasks/task1/eval/eval1/set_current_eval_config/eval_config1" + ) + assert response.status_code == 200 + updated_eval = response.json() + + # Verify the current_config_id was updated + assert updated_eval["current_config_id"] == "eval_config1" + assert updated_eval["id"] == "eval1" + + # Verify the change persists by fetching the eval again + eval_from_disk = mock_task.evals()[0] + assert eval_from_disk.current_config_id == "eval_config1" diff --git a/app/desktop/studio_server/test_finetune_api.py b/app/desktop/studio_server/test_finetune_api.py index 4e99fe4c..b86eeecf 100644 --- a/app/desktop/studio_server/test_finetune_api.py +++ b/app/desktop/studio_server/test_finetune_api.py @@ -15,21 +15,18 @@ Project, Task, ) +from kiln_ai.datamodel.dataset_filters import DatasetFilterId from kiln_ai.datamodel.dataset_split import ( - 
AllDatasetFilter, AllSplitDefinition, - HighRatingDatasetFilter, - ThinkingModelDatasetFilter, - ThinkingModelHighRatedFilter, Train60Test20Val20SplitDefinition, Train80Test10Val10SplitDefinition, Train80Test20SplitDefinition, ) +from pydantic import BaseModel from app.desktop.studio_server.finetune_api import ( CreateDatasetSplitRequest, CreateFinetuneRequest, - DatasetFilterType, DatasetSplitType, connect_fine_tune_api, thinking_instructions_from_request, @@ -281,9 +278,28 @@ def test_dataset_split_type_enum(): assert DatasetSplitType.ALL.value == "all" -def test_dataset_filter_type_enum(): - assert DatasetFilterType.ALL.value == "all" - assert DatasetFilterType.HIGH_RATING.value == "high_rating" +class ModelTester(BaseModel): + dataset_id: DatasetFilterId + + +# Check these stings from UI exist +@pytest.mark.parametrize( + "id,expect_error", + [ + ("all", False), + ("high_rating", False), + ("thinking_model", False), + ("thinking_model_high_rated", False), + ("invalid", True), + ], +) +def test_dataset_filter_ids(id, expect_error): + if expect_error: + with pytest.raises(ValueError): + ModelTester(dataset_id=id) + else: + model = ModelTester(dataset_id=id) + assert model.dataset_id == id def test_api_split_types_mapping(): @@ -303,22 +319,6 @@ def test_api_split_types_mapping(): assert split_type in api_split_types -def test_api_filter_types_mapping(): - from kiln_ai.datamodel.dataset_split import dataset_filters - - assert dataset_filters[DatasetFilterType.ALL] == AllDatasetFilter - assert dataset_filters[DatasetFilterType.HIGH_RATING] == HighRatingDatasetFilter - assert ( - dataset_filters[DatasetFilterType.THINKING_MODEL] == ThinkingModelDatasetFilter - ) - assert ( - dataset_filters[DatasetFilterType.THINKING_MODEL_HIGH_RATED] - == ThinkingModelHighRatedFilter - ) - for filter_type in DatasetFilterType: - assert filter_type in dataset_filters - - @pytest.fixture def mock_dataset_split(): split = DatasetSplit( @@ -342,7 +342,7 @@ def test_create_dataset_split( with mock_from_task as from_task_mock, mock_save as save_mock: request_data = { "dataset_split_type": "train_test", - "filter_type": "high_rating", + "filter_id": "high_rating", "name": "Test Split", "description": "Test description", } @@ -360,7 +360,7 @@ def test_create_dataset_split( mock_task_from_id_disk_backed.assert_called_once_with("project1", "task1") from_task_mock.assert_called_once() args, kwargs = from_task_mock.call_args - assert kwargs["filter_type"] == DatasetFilterType.HIGH_RATING + assert kwargs["filter_id"] == "high_rating" save_mock.assert_called_once() @@ -374,7 +374,7 @@ def test_create_dataset_split_auto_name( mock_save = unittest.mock.patch.object(DatasetSplit, "save_to_file") with mock_from_task as from_task_mock, mock_save as save_mock: - request_data = {"dataset_split_type": "train_test", "filter_type": "all"} + request_data = {"dataset_split_type": "train_test", "filter_id": "all"} response = client.post( "/api/projects/project1/tasks/task1/dataset_splits", json=request_data @@ -395,33 +395,31 @@ def test_create_dataset_split_request_validation(): # Test valid request request = CreateDatasetSplitRequest( dataset_split_type=DatasetSplitType.TRAIN_TEST, - filter_type=DatasetFilterType.ALL, + filter_id="all", name="Test Split", description="Test description", ) assert request.dataset_split_type == DatasetSplitType.TRAIN_TEST - assert request.filter_type == DatasetFilterType.ALL + assert request.filter_id == "all" assert request.name == "Test Split" assert request.description == "Test description" # Test 
optional fields request = CreateDatasetSplitRequest( dataset_split_type=DatasetSplitType.TRAIN_TEST, - filter_type=DatasetFilterType.ALL, + filter_id="all", ) assert request.name is None assert request.description is None # Test invalid dataset split type with pytest.raises(ValueError): - CreateDatasetSplitRequest( - dataset_split_type="invalid_type", filter_type=DatasetFilterType.ALL - ) + CreateDatasetSplitRequest(dataset_split_type="invalid_type", filter_id="all") # Test invalid filter type with pytest.raises(ValueError): CreateDatasetSplitRequest( - dataset_split_type=DatasetSplitType.TRAIN_TEST, filter_type="invalid_type" + dataset_split_type=DatasetSplitType.TRAIN_TEST, filter_id="invalid_type" ) @@ -660,7 +658,7 @@ def mock_prompt_builder(): builder.build_prompt.return_value = "Generated system message" with unittest.mock.patch( - "app.desktop.studio_server.finetune_api.prompt_builder_from_ui_name", + "app.desktop.studio_server.finetune_api.prompt_builder_from_id", return_value=builder, ) as mock: yield mock, builder diff --git a/app/desktop/studio_server/test_prompt_api.py b/app/desktop/studio_server/test_prompt_api.py index 35c0f17c..0b1ccf67 100644 --- a/app/desktop/studio_server/test_prompt_api.py +++ b/app/desktop/studio_server/test_prompt_api.py @@ -20,10 +20,6 @@ def client(): # Mock prompt builder class class MockPromptBuilder(BasePromptBuilder): - @classmethod - def prompt_builder_name(cls): - return "MockPromptBuilder" - def build_base_prompt(self): return "Mock prompt" @@ -37,10 +33,8 @@ def mock_task(): @pytest.fixture -def mock_prompt_builder_from_ui_name(mock_task): - with patch( - "app.desktop.studio_server.prompt_api.prompt_builder_from_ui_name" - ) as mock: +def mock_prompt_builder_from_id(mock_task): + with patch("app.desktop.studio_server.prompt_api.prompt_builder_from_id") as mock: mock.return_value = MockPromptBuilder(mock_task) yield mock @@ -53,42 +47,42 @@ def mock_task_from_id(mock_task): def test_generate_prompt_success( - client, mock_task, mock_prompt_builder_from_ui_name, mock_task_from_id + client, mock_task, mock_prompt_builder_from_id, mock_task_from_id ): response = client.get( - "/api/projects/project123/task/task456/gen_prompt/mock_generator" + "/api/projects/project123/task/task456/gen_prompt/simple_prompt_builder" ) assert response.status_code == 200 data = response.json() assert data == { "prompt": "Mock prompt for UI", - "prompt_builder_name": "MockPromptBuilder", - "ui_generator_name": "mock_generator", + "prompt_id": "simple_prompt_builder", } mock_task_from_id.assert_called_once_with("project123", "task456") - mock_prompt_builder_from_ui_name.assert_called_once_with( - "mock_generator", mock_task + mock_prompt_builder_from_id.assert_called_once_with( + "simple_prompt_builder", mock_task ) def test_generate_prompt_exception( - client, mock_task, mock_prompt_builder_from_ui_name, mock_task_from_id + client, mock_task, mock_prompt_builder_from_id, mock_task_from_id ): - mock_prompt_builder_from_ui_name.side_effect = ValueError( - "Invalid prompt generator" - ) + mock_prompt_builder_from_id.side_effect = ValueError("Invalid prompt generator") response = client.get( - "/api/projects/project123/task/task456/gen_prompt/invalid_generator" + "/api/projects/project123/task/task456/gen_prompt/simple_prompt_builder" ) assert response.status_code == 400 - data = response.json() - assert data == {"detail": "Invalid prompt generator"} + assert "Invalid prompt generator" in response.text - mock_task_from_id.assert_called_once_with("project123", "task456") - 
mock_prompt_builder_from_ui_name.assert_called_once_with( - "invalid_generator", mock_task + +def test_generate_prompt_id_format(client, mock_task, mock_task_from_id): + response = client.get( + "/api/projects/project123/task/task456/gen_prompt/invalid_generator_id" ) + + assert response.status_code == 422 + assert "Value error, Invalid prompt ID: invalid_generator_id" in response.text diff --git a/app/desktop/studio_server/test_provider_api.py b/app/desktop/studio_server/test_provider_api.py index 3cb2239c..4f9e9dae 100644 --- a/app/desktop/studio_server/test_provider_api.py +++ b/app/desktop/studio_server/test_provider_api.py @@ -405,6 +405,7 @@ async def test_get_available_models(app, client): "name": "Model 2", "supports_structured_output": True, "supports_data_gen": True, + "supports_logprobs": False, "task_filter": None, "untested_model": False, } @@ -419,6 +420,7 @@ async def test_get_available_models(app, client): "name": "Model 1", "supports_structured_output": True, "supports_data_gen": True, + "supports_logprobs": False, "task_filter": None, "untested_model": False, } @@ -433,6 +435,7 @@ async def test_get_available_models(app, client): "name": "Model 2", "supports_structured_output": False, "supports_data_gen": False, + "supports_logprobs": False, "task_filter": None, "untested_model": False, } @@ -495,6 +498,7 @@ async def test_get_available_models_ollama_exception(app, client): "name": "Model 1", "supports_structured_output": True, "supports_data_gen": True, + "supports_logprobs": False, "task_filter": None, "untested_model": False, } @@ -1214,6 +1218,7 @@ def test_openai_compatible_providers(): name="model1", supports_structured_output=False, supports_data_gen=False, + supports_logprobs=False, untested_model=True, ) ], diff --git a/app/desktop/studio_server/test_repair_api.py b/app/desktop/studio_server/test_repair_api.py index 2d0fc8b6..d39eab16 100644 --- a/app/desktop/studio_server/test_repair_api.py +++ b/app/desktop/studio_server/test_repair_api.py @@ -40,7 +40,7 @@ def data_source(): "model_name": "gpt_4o", "model_provider": "openai", "adapter_name": "langchain_adapter", - "prompt_builder_name": "simple_prompt_builder", + "prompt_id": "simple_prompt_builder", }, ) diff --git a/app/desktop/studio_server/webhost.py b/app/desktop/studio_server/webhost.py index e0c157a6..2882b6b6 100644 --- a/app/desktop/studio_server/webhost.py +++ b/app/desktop/studio_server/webhost.py @@ -2,7 +2,7 @@ import os import sys -from fastapi import FastAPI +from fastapi import FastAPI, Response from fastapi.responses import FileResponse from fastapi.staticfiles import StaticFiles @@ -25,12 +25,20 @@ def studio_path(): return os.path.join(base_path, "../../app/web_ui/build") +def add_no_cache_headers(response: Response): + # This is already local, disable browser caching to prevent issues of old web-app trying to load old APIs and out of date web-ui + response.headers["Cache-Control"] = "no-store, no-cache, must-revalidate, max-age=0" + response.headers["Pragma"] = "no-cache" + response.headers["Expires"] = "0" + + # File server that maps /foo/bar to /foo/bar.html (Starlette StaticFiles only does index.html) class HTMLStaticFiles(StaticFiles): async def get_response(self, path: str, scope): try: response = await super().get_response(path, scope) if response.status_code != 404: + add_no_cache_headers(response) return response except Exception as e: # catching HTTPException explicitly not working for some reason @@ -39,8 +47,7 @@ async def get_response(self, path: str, scope): raise e # Try the 
.html version of the file if the .html version exists, for 404s response = await super().get_response(f"{path}.html", scope) - # This is already local, disable browser caching to prevent issues - response.headers["Cache-Control"] = "no-store" + add_no_cache_headers(response) return response diff --git a/app/web_ui/src/lib/api_client.ts b/app/web_ui/src/lib/api_client.ts index a39cf3dd..8b4e9e0e 100644 --- a/app/web_ui/src/lib/api_client.ts +++ b/app/web_ui/src/lib/api_client.ts @@ -1,6 +1,8 @@ import createClient from "openapi-fetch" import type { paths } from "./api_schema" +export const base_url = "http://localhost:8757" + export const client = createClient({ - baseUrl: "http://localhost:8757", + baseUrl: base_url, }) diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts index be6777a5..b2d369b7 100644 --- a/app/web_ui/src/lib/api_schema.d.ts +++ b/app/web_ui/src/lib/api_schema.d.ts @@ -416,7 +416,7 @@ export interface paths { patch?: never; trace?: never; }; - "/api/projects/{project_id}/task/{task_id}/gen_prompt/{prompt_generator}": { + "/api/projects/{project_id}/task/{task_id}/gen_prompt/{prompt_id}": { parameters: { query?: never; header?: never; @@ -424,7 +424,7 @@ export interface paths { cookie?: never; }; /** Generate Prompt */ - get: operations["generate_prompt_api_projects__project_id__task__task_id__gen_prompt__prompt_generator__get"]; + get: operations["generate_prompt_api_projects__project_id__task__task_id__gen_prompt__prompt_id__get"]; put?: never; post?: never; delete?: never; @@ -657,10 +657,282 @@ export interface paths { patch?: never; trace?: never; }; + "/api/projects/{project_id}/tasks/{task_id}/create_evaluator": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Create Evaluator */ + post: operations["create_evaluator_api_projects__project_id__tasks__task_id__create_evaluator_post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/task_run_configs": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Task Run Configs */ + get: operations["get_task_run_configs_api_projects__project_id__tasks__task_id__task_run_configs_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Eval */ + get: operations["get_eval_api_projects__project_id__tasks__task_id__eval__eval_id__get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/evals": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Evals */ + get: operations["get_evals_api_projects__project_id__tasks__task_id__evals_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_configs": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Eval Configs */ + get: operations["get_eval_configs_api_projects__project_id__tasks__task_id__eval__eval_id__eval_configs_get"]; + put?: never; + post?: 
never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Eval Config */ + get: operations["get_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/task_run_config": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Create Task Run Config */ + post: operations["create_task_run_config_api_projects__project_id__tasks__task_id__task_run_config_post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/create_eval_config": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Create Eval Config */ + post: operations["create_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__create_eval_config_post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_task_run_eval": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Run Eval Config */ + get: operations["run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_task_run_eval_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + get?: never; + put?: never; + /** Set Default Eval Config */ + post: operations["set_default_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__set_current_eval_config__eval_config_id__post"]; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Run Eval Config Eval */ + get: operations["run_eval_config_eval_api_projects__project_id__tasks__task_id__eval__eval_id__run_eval_config_eval_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_config/{run_config_id}/results": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Eval Run Results */ + get: operations["get_eval_run_results_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_config__run_config_id__results_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/score_summary": { + parameters: { + query?: 
never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Eval Config Score Summary */ + get: operations["get_eval_config_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__score_summary_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; + "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_configs_score_summary": { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + /** Get Eval Configs Score Summary */ + get: operations["get_eval_configs_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_configs_score_summary_get"]; + put?: never; + post?: never; + delete?: never; + options?: never; + head?: never; + patch?: never; + trace?: never; + }; } export type webhooks = Record; export interface components { schemas: { + /** ApiPrompt */ + ApiPrompt: { + /** + * Name + * @description A name for this entity. + */ + name: string; + /** + * Description + * @description A more detailed description of the prompt. + */ + description?: string | null; + /** + * Generator Id + * @description The id of the generator that created this prompt. + */ + generator_id?: string | null; + /** + * Prompt + * @description The prompt for the task. + */ + prompt: string; + /** + * Chain Of Thought Instructions + * @description Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided. + */ + chain_of_thought_instructions?: string | null; + /** Id */ + id: string; + /** Created At */ + created_at?: string | null; + /** Created By */ + created_by?: string | null; + }; /** AvailableModels */ AvailableModels: { /** Provider Name */ @@ -670,6 +942,39 @@ export interface components { /** Models */ models: components["schemas"]["ModelDetails"][]; }; + /** + * BasePrompt + * @description A prompt for a task. This is the basic data storage format which can be used throughout a project. + * + * The "Prompt" model name is reserved for the custom prompts parented by a task. + */ + BasePrompt: { + /** + * Name + * @description A name for this entity. + */ + name: string; + /** + * Description + * @description A more detailed description of the prompt. + */ + description?: string | null; + /** + * Generator Id + * @description The id of the generator that created this prompt. + */ + generator_id?: string | null; + /** + * Prompt + * @description The prompt for the task. + */ + prompt: string; + /** + * Chain Of Thought Instructions + * @description Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided. 
+ */ + chain_of_thought_instructions?: string | null; + }; /** Body_edit_tags_api_projects__project_id__tasks__task_id__runs_edit_tags_post */ Body_edit_tags_api_projects__project_id__tasks__task_id__runs_edit_tags_post: { /** Run Ids */ @@ -679,29 +984,72 @@ export interface components { /** Remove Tags */ remove_tags?: string[] | null; }; + /** CorrelationResult */ + CorrelationResult: { + /** Mean Absolute Error */ + mean_absolute_error: number; + /** Mean Normalized Absolute Error */ + mean_normalized_absolute_error: number; + /** Mean Squared Error */ + mean_squared_error: number; + /** Mean Normalized Squared Error */ + mean_normalized_squared_error: number; + /** Spearman Correlation */ + spearman_correlation: number | null; + /** Pearson Correlation */ + pearson_correlation: number | null; + /** Kendalltau Correlation */ + kendalltau_correlation: number | null; + }; /** * CreateDatasetSplitRequest * @description Request to create a dataset split */ CreateDatasetSplitRequest: { dataset_split_type: components["schemas"]["DatasetSplitType"]; - filter_type: components["schemas"]["DatasetFilterType"]; + /** Filter Id */ + filter_id: string; /** Name */ name?: string | null; /** Description */ description?: string | null; }; - /** - * CreateFinetuneRequest - * @description Request to create a finetune - */ - CreateFinetuneRequest: { + /** CreateEvalConfigRequest */ + CreateEvalConfigRequest: { /** Name */ name?: string | null; - /** Description */ - description?: string | null; - /** Dataset Id */ - dataset_id: string; + type: components["schemas"]["EvalConfigType"]; + /** Properties */ + properties: Record; + /** Model Name */ + model_name: string; + provider: components["schemas"]["ModelProviderName"]; + }; + /** CreateEvaluatorRequest */ + CreateEvaluatorRequest: { + /** Name */ + name: string; + /** Description */ + description: string; + template: components["schemas"]["EvalTemplateId"] | null; + /** Output Scores */ + output_scores: components["schemas"]["EvalOutputScore"][]; + /** Eval Set Filter Id */ + eval_set_filter_id: string; + /** Eval Configs Filter Id */ + eval_configs_filter_id: string; + }; + /** + * CreateFinetuneRequest + * @description Request to create a finetune + */ + CreateFinetuneRequest: { + /** Name */ + name?: string | null; + /** Description */ + description?: string | null; + /** Dataset Id */ + dataset_id: string; /** Train Split Name */ train_split_name: string; /** Validation Split Name */ @@ -722,6 +1070,18 @@ export interface components { custom_thinking_instructions?: string | null; data_strategy: components["schemas"]["FinetuneDataStrategy"]; }; + /** CreateTaskRunConfigRequest */ + CreateTaskRunConfigRequest: { + /** Name */ + name?: string | null; + /** Description */ + description?: string | null; + /** Model Name */ + model_name: string; + model_provider_name: components["schemas"]["ModelProviderName"]; + /** Prompt Id */ + prompt_id: string; + }; /** DataGenCategoriesApiInput */ DataGenCategoriesApiInput: { /** @@ -824,6 +1184,11 @@ export interface components { * @description The prompt method used to generate the output */ prompt_method: string; + /** + * Human Guidance + * @description Optional human guidance for generation + */ + human_guidance?: string | null; }; /** * DataSource @@ -852,12 +1217,6 @@ export interface components { * @enum {string} */ DataSourceType: "human" | "synthetic"; - /** - * DatasetFilterType - * @description Dataset filter names. 
- * @enum {string} - */ - DatasetFilterType: "all" | "high_rating" | "thinking_model" | "thinking_model_high_rated"; /** * DatasetSplit * @description A collection of task runs, with optional splits (train, test, validation). @@ -905,8 +1264,11 @@ export interface components { split_contents: { [key: string]: string[]; }; - /** @description The filter used to build the dataset. */ - filter?: components["schemas"]["DatasetFilterType"] | null; + /** + * Filter + * @description The filter used to build the dataset. + */ + filter?: string | null; /** Model Type */ readonly model_type: string; }; @@ -939,6 +1301,250 @@ export interface components { * @enum {string} */ DatasetSplitType: "train_test" | "train_test_val" | "train_test_val_80" | "all"; + /** Eval */ + Eval: { + /** + * V + * @default 1 + */ + v: number; + /** Id */ + id?: string | null; + /** Path */ + path?: string | null; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** Created By */ + created_by?: string; + /** + * Name + * @description A name for this entity. + */ + name: string; + /** + * Description + * @description The description of the eval + */ + description?: string | null; + /** @description The template selected when creating this eval. Useful for suggesting eval steps and output scores. */ + template?: components["schemas"]["EvalTemplateId"] | null; + /** + * Current Config Id + * @description The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs. + */ + current_config_id?: string | null; + /** + * Eval Set Filter Id + * @description The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id. + */ + eval_set_filter_id: string; + /** + * Eval Configs Filter Id + * @description The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id. + */ + eval_configs_filter_id: string; + /** + * Output Scores + * @description The scores this evaluator should produce. + */ + output_scores: components["schemas"]["EvalOutputScore"][]; + /** Model Type */ + readonly model_type: string; + }; + /** + * EvalConfig + * @description A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc. + * + * A eval might have many configs, example running the same eval with 2 different models. Comparing eval results is only valid when the same eval is run with the same config. + */ + EvalConfig: { + /** + * V + * @default 1 + */ + v: number; + /** Id */ + id?: string | null; + /** Path */ + path?: string | null; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** Created By */ + created_by?: string; + /** + * Name + * @description A name for this entity. + */ + name: string; + /** + * Model Name + * @description The name of the model to use for this eval config. + */ + model_name: string; + /** + * Model Provider + * @description The provider of the model to use for this eval config. + */ + model_provider: string; + /** + * @description This is used to determine the type of eval to run. + * @default g_eval + */ + config_type: components["schemas"]["EvalConfigType"]; + /** + * Properties + * @description Properties to be used to execute the eval config. 
This is config_type specific and should serialize to a json dict. + * @default {} + */ + properties: Record; + /** Model Type */ + readonly model_type: string; + }; + /** EvalConfigCompareSummary */ + EvalConfigCompareSummary: { + /** Results */ + results: { + [key: string]: { + [key: string]: components["schemas"]["CorrelationResult"]; + }; + }; + /** Eval Config Percent Complete */ + eval_config_percent_complete: { + [key: string]: number; + }; + /** Dataset Size */ + dataset_size: number; + /** Fully Rated Count */ + fully_rated_count: number; + /** Partially Rated Count */ + partially_rated_count: number; + /** Not Rated Count */ + not_rated_count: number; + }; + /** + * EvalConfigType + * @enum {string} + */ + EvalConfigType: "g_eval" | "llm_as_judge"; + /** + * EvalOutputScore + * @description A definition of a score that an evaluator will produce. + * + * Very similar to TaskRequirement, but conceptually different so separate models. + */ + EvalOutputScore: { + /** + * Name + * @description The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance. + */ + name: string; + /** + * Instruction + * @description A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user. + */ + instruction?: string | null; + /** @description The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical'). */ + type: components["schemas"]["TaskOutputRatingType"]; + }; + /** EvalResultSummary */ + EvalResultSummary: { + /** Results */ + results: { + [key: string]: { + [key: string]: components["schemas"]["ScoreSummary"]; + }; + }; + /** Run Config Percent Complete */ + run_config_percent_complete: { + [key: string]: number; + }; + /** Dataset Size */ + dataset_size: number; + }; + /** + * EvalRun + * @description The results of running an eval on a single dataset item, with a specific TaskRunConfig and EvalConfig. + */ + EvalRun: { + /** + * V + * @default 1 + */ + v: number; + /** Id */ + id?: string | null; + /** Path */ + path?: string | null; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** Created By */ + created_by?: string; + /** + * Dataset Id + * @description The ID of the dataset item that was used for this run (we only use it's input). Must belong to the same Task as this eval. + */ + dataset_id: string | null; + /** + * Task Run Config Id + * @description The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config. + */ + task_run_config_id: string | null; + /** + * Eval Config Eval + * @description Whether this eval run to evaluate the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task. + * @default false + */ + eval_config_eval: boolean; + /** + * Input + * @description The input to the task. JSON formatted for structured input, plaintext for unstructured input. + */ + input: string; + /** + * Output + * @description The output of the task. JSON formatted for structured output, plaintext for unstructured output. + */ + output: string; + /** + * Intermediate Outputs + * @description The intermediate outputs of the task. 
+ */ + intermediate_outputs?: { + [key: string]: string; + } | null; + /** + * Scores + * @description The scores of the evaluator (specifically the EvalConfig this object is a child of). + */ + scores: { + [key: string]: number; + }; + /** Model Type */ + readonly model_type: string; + }; + /** EvalRunResult */ + EvalRunResult: { + /** Results */ + results: components["schemas"]["EvalRun"][]; + eval: components["schemas"]["Eval"]; + eval_config: components["schemas"]["EvalConfig"]; + run_config: components["schemas"]["TaskRunConfig"]; + }; + /** + * EvalTemplateId + * @description An eval template is a pre-defined eval that can be used as a starting point for a new eval. + * @enum {string} + */ + EvalTemplateId: "kiln_requirements" | "toxicity" | "bias" | "maliciousness" | "factual_correctness" | "jailbreak"; /** * FineTuneParameter * @description A parameter for a fine-tune. Hyperparameters, etc. @@ -1137,7 +1743,36 @@ export interface components { * created_at (datetime): Timestamp when the model was created * created_by (str): User ID of the creator */ - KilnBaseModel: { + "KilnBaseModel-Input": { + /** + * V + * @default 1 + */ + v: number; + /** Id */ + id?: string | null; + /** Path */ + path?: string | null; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** Created By */ + created_by?: string; + }; + /** + * KilnBaseModel + * @description Base model for all Kiln data models with common functionality for persistence and versioning. + * + * Attributes: + * v (int): Schema version number for migration support + * id (str): Unique identifier for the model instance + * path (Path): File system path where the model is stored + * created_at (datetime): Timestamp when the model was created + * created_by (str): User ID of the creator + */ + "KilnBaseModel-Output": { /** * V * @default 1 @@ -1154,6 +1789,8 @@ export interface components { created_at?: string; /** Created By */ created_by?: string; + /** Model Type */ + readonly model_type: string; }; /** ModelDetails */ ModelDetails: { @@ -1165,6 +1802,8 @@ export interface components { supports_structured_output: boolean; /** Supports Data Gen */ supports_data_gen: boolean; + /** Supports Logprobs */ + supports_logprobs: boolean; /** * Untested Model * @default false @@ -1179,7 +1818,13 @@ export interface components { * Where models have instruct and raw versions, instruct is default and raw is specified. 
* @enum {string} */ - ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b"; + ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "claude_3_7_sonnet" | "claude_3_7_sonnet_thinking" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b" | "deepseek_r1_distill_qwen_1p5b" | "deepseek_r1_distill_qwen_7b" | "deepseek_r1_distill_llama_8b" | "dolphin_2_9_8x22b"; + /** + * ModelProviderName + * @description Enumeration of supported AI model providers. + * @enum {string} + */ + ModelProviderName: "openai" | "groq" | "amazon_bedrock" | "ollama" | "openrouter" | "fireworks_ai" | "kiln_fine_tune" | "kiln_custom_registry" | "openai_compatible"; /** OllamaConnection */ OllamaConnection: { /** Message */ @@ -1269,30 +1914,24 @@ export interface components { }; /** * Prompt - * @description A prompt for a task. + * @description A prompt for a task. This is the custom prompt parented by a task. */ Prompt: { /** - * V - * @default 1 + * Name + * @description A name for this entity. */ - v: number; - /** Id */ - id?: string | null; - /** Path */ - path?: string | null; + name: string; /** - * Created At - * Format: date-time + * Description + * @description A more detailed description of the prompt. */ - created_at?: string; - /** Created By */ - created_by?: string; + description?: string | null; /** - * Name - * @description A name for this entity. + * Generator Id + * @description The id of the generator that created this prompt. */ - name: string; + generator_id?: string | null; /** * Prompt * @description The prompt for the task. @@ -1303,6 +1942,22 @@ export interface components { * @description Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided. 
*/ chain_of_thought_instructions?: string | null; + /** + * V + * @default 1 + */ + v: number; + /** Id */ + id?: string | null; + /** Path */ + path?: string | null; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** Created By */ + created_by?: string; /** Model Type */ readonly model_type: string; }; @@ -1310,15 +1965,15 @@ export interface components { PromptApiResponse: { /** Prompt */ prompt: string; - /** Prompt Builder Name */ - prompt_builder_name: string; - /** Ui Generator Name */ - ui_generator_name: string; + /** Prompt Id */ + prompt_id: string; }; /** PromptCreateRequest */ PromptCreateRequest: { /** Name */ name: string; + /** Description */ + description?: string | null; /** Prompt */ prompt: string; /** Chain Of Thought Instructions */ @@ -1328,8 +1983,6 @@ export interface components { PromptGenerator: { /** Id */ id: string; - /** Ui Id */ - ui_id: string; /** Short Description */ short_description: string; /** Description */ @@ -1344,7 +1997,7 @@ export interface components { /** Generators */ generators: components["schemas"]["PromptGenerator"][]; /** Prompts */ - prompts: components["schemas"]["Prompt"][]; + prompts: components["schemas"]["ApiPrompt"][]; }; /** ProviderModel */ ProviderModel: { @@ -1397,7 +2050,30 @@ export interface components { /** @description The type of rating */ type: components["schemas"]["TaskOutputRatingType"]; }; - /** RunSummary */ + /** + * RunConfigProperties + * @description A configuration for running a task. + * + * This includes everything needed to run a task, except the input and task ID. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). + */ + RunConfigProperties: { + /** + * Model Name + * @description The model to use for this run config. + */ + model_name: string; + /** + * Model Provider Name + * @description The provider to use for this run config. + */ + model_provider_name: string; + /** + * Prompt Id + * @description The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided. + */ + prompt_id: string; + }; + /** RunSummary */ RunSummary: { /** Id */ id: string | null; @@ -1435,6 +2111,11 @@ export interface components { /** Tags */ tags?: string[] | null; }; + /** ScoreSummary */ + ScoreSummary: { + /** Mean Score */ + mean_score: number; + }; /** * StructuredOutputMode * @description Enumeration of supported structured output modes. @@ -1447,7 +2128,7 @@ export interface components { * - json_instruction_and_object: append instructions to the prompt to request json matching the schema. Also request the response as json_mode via API capabilities (returning dictionaries). * @enum {string} */ - StructuredOutputMode: "default" | "json_schema" | "function_calling" | "json_mode" | "json_instructions" | "json_instruction_and_object"; + StructuredOutputMode: "default" | "json_schema" | "function_calling_weak" | "function_calling" | "json_mode" | "json_instructions" | "json_instruction_and_object"; /** * Task * @description Represents a specific task to be performed, with associated requirements and validation rules. @@ -1720,7 +2401,7 @@ export interface components { created_at?: string; /** Created By */ created_by?: string; - parent?: components["schemas"]["KilnBaseModel"] | null; + parent?: components["schemas"]["KilnBaseModel-Input"] | null; /** * Input * @description The inputs to the task. JSON formatted for structured input, plaintext for unstructured input. 
@@ -1751,85 +2432,433 @@ export interface components { */ tags: string[]; }; - /** - * TaskRun - * @description Represents a single execution of a Task. - * - * Contains the input used, its source, the output produced, and optional - * repair information if the output needed correction. - */ - "TaskRun-Output": { - /** - * V - * @default 1 - */ - v: number; - /** Id */ - id?: string | null; - /** Path */ - path?: string | null; - /** - * Created At - * Format: date-time - */ - created_at?: string; - /** Created By */ - created_by?: string; - /** - * Input - * @description The inputs to the task. JSON formatted for structured input, plaintext for unstructured input. - */ - input: string; - /** @description The source of the input: human or synthetic. */ - input_source?: components["schemas"]["DataSource"] | null; - /** @description The output of the task run. */ - output: components["schemas"]["TaskOutput-Output"]; - /** - * Repair Instructions - * @description Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models. - */ - repair_instructions?: string | null; - /** @description An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field. */ - repaired_output?: components["schemas"]["TaskOutput-Output"] | null; - /** - * Intermediate Outputs - * @description Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data. - */ - intermediate_outputs?: { - [key: string]: string; - } | null; - /** - * Tags - * @description Tags for the task run. Tags are used to categorize task runs for filtering and reporting. - * @default [] - */ - tags: string[]; - /** Model Type */ - readonly model_type: string; + /** + * TaskRun + * @description Represents a single execution of a Task. + * + * Contains the input used, its source, the output produced, and optional + * repair information if the output needed correction. + */ + "TaskRun-Output": { + /** + * V + * @default 1 + */ + v: number; + /** Id */ + id?: string | null; + /** Path */ + path?: string | null; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** Created By */ + created_by?: string; + /** + * Input + * @description The inputs to the task. JSON formatted for structured input, plaintext for unstructured input. + */ + input: string; + /** @description The source of the input: human or synthetic. */ + input_source?: components["schemas"]["DataSource"] | null; + /** @description The output of the task run. */ + output: components["schemas"]["TaskOutput-Output"]; + /** + * Repair Instructions + * @description Instructions for fixing the output. Should define what is wrong, and how to fix it. Will be used by models for both generating a fixed output, and evaluating future models. + */ + repair_instructions?: string | null; + /** @description An version of the output with issues fixed. This must be a 'fixed' version of the existing output, and not an entirely new output. If you wish to generate an ideal curatorial output for this task unrelated to this output, generate a new TaskOutput with type 'human' instead of using this field. 
*/ + repaired_output?: components["schemas"]["TaskOutput-Output"] | null; + /** + * Intermediate Outputs + * @description Intermediate outputs from the task run. Keys are the names of the intermediate output steps (cot=chain of thought, etc), values are the output data. + */ + intermediate_outputs?: { + [key: string]: string; + } | null; + /** + * Tags + * @description Tags for the task run. Tags are used to categorize task runs for filtering and reporting. + * @default [] + */ + tags: string[]; + /** Model Type */ + readonly model_type: string; + }; + /** + * TaskRunConfig + * @description A Kiln model for persisting a run config in a Kiln Project, nested under a task. + * + * Typically used to save a method of running a task for evaluation. + * + * A run config includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). + */ + TaskRunConfig: { + /** + * V + * @default 1 + */ + v: number; + /** Id */ + id?: string | null; + /** Path */ + path?: string | null; + /** + * Created At + * Format: date-time + */ + created_at?: string; + /** Created By */ + created_by?: string; + /** + * Name + * @description A name for this entity. + */ + name: string; + /** + * Description + * @description The description of the task run config. + */ + description?: string | null; + /** @description The run config properties to use for this task run. */ + run_config_properties: components["schemas"]["RunConfigProperties"]; + /** @description A prompt to use for run config. */ + prompt?: components["schemas"]["BasePrompt"] | null; + /** Model Type */ + readonly model_type: string; + }; + /** ValidationError */ + ValidationError: { + /** Location */ + loc: (string | number)[]; + /** Message */ + msg: string; + /** Error Type */ + type: string; + }; + }; + responses: never; + parameters: never; + requestBodies: never; + headers: never; + pathItems: never; +} +export type $defs = Record; +export interface operations { + ping_ping_get: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": unknown; + }; + }; + }; + }; + create_project_api_project_post: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody: { + content: { + "application/json": components["schemas"]["Project-Input"]; + }; + }; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["Project-Output"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + update_project_api_project__project_id__patch: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + }; + cookie?: never; + }; + requestBody: { + content: { + "application/json": Record; + }; + }; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["Project-Output"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": 
components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + get_projects_api_projects_get: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["Project-Output"][]; + }; + }; + }; + }; + get_project_api_projects__project_id__get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["Project-Output"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + delete_project_api_projects__project_id__delete: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": Record; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + import_project_api_import_project_post: { + parameters: { + query: { + project_path: string; + }; + header?: never; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["Project-Output"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + create_task_api_projects__project_id__task_post: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + }; + cookie?: never; + }; + requestBody: { + content: { + "application/json": Record; + }; + }; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["Task"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + update_task_api_projects__project_id__task__task_id__patch: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + }; + cookie?: never; + }; + requestBody: { + content: { + "application/json": Record; + }; + }; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["Task"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + get_tasks_api_projects__project_id__tasks_get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + }; + cookie?: never; }; - /** ValidationError 
*/ - ValidationError: { - /** Location */ - loc: (string | number)[]; - /** Message */ - msg: string; - /** Error Type */ - type: string; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["Task"][]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; }; }; - responses: never; - parameters: never; - requestBodies: never; - headers: never; - pathItems: never; -} -export type $defs = Record; -export interface operations { - ping_ping_get: { + get_task_api_projects__project_id__tasks__task_id__get: { parameters: { query?: never; header?: never; - path?: never; + path: { + project_id: string; + task_id: string; + }; cookie?: never; }; requestBody?: never; @@ -1840,21 +2869,33 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": unknown; + "application/json": components["schemas"]["Task"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; }; }; }; }; - create_project_api_project_post: { + create_prompt_api_projects__project_id__task__task_id__prompt_post: { parameters: { query?: never; header?: never; - path?: never; + path: { + project_id: string; + task_id: string; + }; cookie?: never; }; requestBody: { content: { - "application/json": components["schemas"]["Project-Input"]; + "application/json": components["schemas"]["PromptCreateRequest"]; }; }; responses: { @@ -1864,7 +2905,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Project-Output"]; + "application/json": components["schemas"]["Prompt"]; }; }; /** @description Validation Error */ @@ -1878,12 +2919,112 @@ export interface operations { }; }; }; - update_project_api_project__project_id__patch: { + get_prompts_api_projects__project_id__task__task_id__prompts_get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["PromptResponse"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + get_run_api_projects__project_id__tasks__task_id__runs__run_id__get: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + run_id: string; + }; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["TaskRun-Output"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + delete_run_api_projects__project_id__tasks__task_id__runs__run_id__delete: { + parameters: { + query?: never; + header?: never; + path: { + project_id: string; + task_id: string; + run_id: string; + }; 
+ cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": unknown; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; + }; + }; + }; + }; + update_run_api_projects__project_id__tasks__task_id__runs__run_id__patch: { parameters: { query?: never; header?: never; path: { project_id: string; + task_id: string; + run_id: string; }; cookie?: never; }; @@ -1899,7 +3040,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Project-Output"]; + "application/json": components["schemas"]["TaskRun-Output"]; }; }; /** @description Validation Error */ @@ -1913,11 +3054,14 @@ export interface operations { }; }; }; - get_projects_api_projects_get: { + get_runs_api_projects__project_id__tasks__task_id__runs_get: { parameters: { query?: never; header?: never; - path?: never; + path: { + project_id: string; + task_id: string; + }; cookie?: never; }; requestBody?: never; @@ -1928,17 +3072,27 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Project-Output"][]; + "application/json": components["schemas"]["TaskRun-Output"][]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; }; }; }; }; - get_project_api_projects__project_id__get: { + get_runs_summary_api_projects__project_id__tasks__task_id__runs_summaries_get: { parameters: { query?: never; header?: never; path: { project_id: string; + task_id: string; }; cookie?: never; }; @@ -1950,7 +3104,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Project-Output"]; + "application/json": components["schemas"]["RunSummary"][]; }; }; /** @description Validation Error */ @@ -1964,16 +3118,21 @@ export interface operations { }; }; }; - delete_project_api_projects__project_id__delete: { + delete_runs_api_projects__project_id__tasks__task_id__runs_delete_post: { parameters: { query?: never; header?: never; path: { project_id: string; + task_id: string; }; cookie?: never; }; - requestBody?: never; + requestBody: { + content: { + "application/json": string[]; + }; + }; responses: { /** @description Successful Response */ 200: { @@ -1981,7 +3140,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": Record; + "application/json": unknown; }; }; /** @description Validation Error */ @@ -1995,16 +3154,21 @@ export interface operations { }; }; }; - import_project_api_import_project_post: { + run_task_api_projects__project_id__tasks__task_id__run_post: { parameters: { - query: { - project_path: string; - }; + query?: never; header?: never; - path?: never; + path: { + project_id: string; + task_id: string; + }; cookie?: never; }; - requestBody?: never; + requestBody: { + content: { + "application/json": components["schemas"]["RunTaskRequest"]; + }; + }; responses: { /** @description Successful Response */ 200: { @@ -2012,7 +3176,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Project-Output"]; + "application/json": components["schemas"]["TaskRun-Output"]; }; }; /** @description 
Validation Error */ @@ -2026,18 +3190,19 @@ export interface operations { }; }; }; - create_task_api_projects__project_id__task_post: { + edit_tags_api_projects__project_id__tasks__task_id__runs_edit_tags_post: { parameters: { query?: never; header?: never; path: { project_id: string; + task_id: string; }; cookie?: never; }; requestBody: { content: { - "application/json": Record; + "application/json": components["schemas"]["Body_edit_tags_api_projects__project_id__tasks__task_id__runs_edit_tags_post"]; }; }; responses: { @@ -2047,7 +3212,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Task"]; + "application/json": unknown; }; }; /** @description Validation Error */ @@ -2061,21 +3226,56 @@ export interface operations { }; }; }; - update_task_api_projects__project_id__task__task_id__patch: { + get_providers_models_api_providers_models_get: { parameters: { query?: never; header?: never; - path: { - project_id: string; - task_id: string; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["ProviderModels"]; + }; }; + }; + }; + get_available_models_api_available_models_get: { + parameters: { + query?: never; + header?: never; + path?: never; cookie?: never; }; - requestBody: { - content: { - "application/json": Record; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["AvailableModels"][]; + }; + }; + }; + }; + connect_ollama_api_api_provider_ollama_connect_get: { + parameters: { + query?: { + custom_ollama_url?: string | null; }; + header?: never; + path?: never; + cookie?: never; }; + requestBody?: never; responses: { /** @description Successful Response */ 200: { @@ -2083,7 +3283,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Task"]; + "application/json": components["schemas"]["OllamaConnection"]; }; }; /** @description Validation Error */ @@ -2097,13 +3297,15 @@ export interface operations { }; }; }; - get_tasks_api_projects__project_id__tasks_get: { + save_openai_compatible_providers_api_provider_openai_compatible_post: { parameters: { - query?: never; - header?: never; - path: { - project_id: string; + query: { + name: string; + base_url: string; + api_key: string; }; + header?: never; + path?: never; cookie?: never; }; requestBody?: never; @@ -2114,7 +3316,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Task"][]; + "application/json": unknown; }; }; /** @description Validation Error */ @@ -2128,14 +3330,13 @@ export interface operations { }; }; }; - get_task_api_projects__project_id__tasks__task_id__get: { + delete_openai_compatible_providers_api_provider_openai_compatible_delete: { parameters: { - query?: never; - header?: never; - path: { - project_id: string; - task_id: string; + query: { + name: string; }; + header?: never; + path?: never; cookie?: never; }; requestBody?: never; @@ -2146,7 +3347,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Task"]; + "application/json": unknown; }; }; /** @description Validation Error */ @@ -2160,19 +3361,16 @@ export interface operations { }; }; }; - 
create_prompt_api_projects__project_id__task__task_id__prompt_post: { + connect_api_key_api_provider_connect_api_key_post: { parameters: { query?: never; header?: never; - path: { - project_id: string; - task_id: string; - }; + path?: never; cookie?: never; }; requestBody: { content: { - "application/json": components["schemas"]["PromptCreateRequest"]; + "application/json": Record; }; }; responses: { @@ -2182,7 +3380,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Prompt"]; + "application/json": unknown; }; }; /** @description Validation Error */ @@ -2196,14 +3394,13 @@ export interface operations { }; }; }; - get_prompts_api_projects__project_id__task__task_id__prompts_get: { + disconnect_api_key_api_provider_disconnect_api_key_post: { parameters: { - query?: never; - header?: never; - path: { - project_id: string; - task_id: string; + query: { + provider_id: string; }; + header?: never; + path?: never; cookie?: never; }; requestBody?: never; @@ -2214,7 +3411,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["PromptResponse"]; + "application/json": unknown; }; }; /** @description Validation Error */ @@ -2228,14 +3425,14 @@ export interface operations { }; }; }; - get_run_api_projects__project_id__tasks__task_id__runs__run_id__get: { + generate_prompt_api_projects__project_id__task__task_id__gen_prompt__prompt_id__get: { parameters: { query?: never; header?: never; path: { project_id: string; task_id: string; - run_id: string; + prompt_id: string; }; cookie?: never; }; @@ -2247,7 +3444,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["TaskRun-Output"]; + "application/json": components["schemas"]["PromptApiResponse"]; }; }; /** @description Validation Error */ @@ -2261,7 +3458,7 @@ export interface operations { }; }; }; - delete_run_api_projects__project_id__tasks__task_id__runs__run_id__delete: { + run_repair_api_projects__project_id__tasks__task_id__runs__run_id__run_repair_post: { parameters: { query?: never; header?: never; @@ -2272,7 +3469,11 @@ export interface operations { }; cookie?: never; }; - requestBody?: never; + requestBody: { + content: { + "application/json": components["schemas"]["RepairTaskApiInput"]; + }; + }; responses: { /** @description Successful Response */ 200: { @@ -2280,7 +3481,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": unknown; + "application/json": components["schemas"]["TaskRun-Output"]; }; }; /** @description Validation Error */ @@ -2294,7 +3495,7 @@ export interface operations { }; }; }; - update_run_api_projects__project_id__tasks__task_id__runs__run_id__patch: { + post_repair_run_api_projects__project_id__tasks__task_id__runs__run_id__repair_post: { parameters: { query?: never; header?: never; @@ -2307,7 +3508,7 @@ export interface operations { }; requestBody: { content: { - "application/json": Record; + "application/json": components["schemas"]["RepairRunPost"]; }; }; responses: { @@ -2331,14 +3532,11 @@ export interface operations { }; }; }; - get_runs_api_projects__project_id__tasks__task_id__runs_get: { + read_settings_api_settings_get: { parameters: { query?: never; header?: never; - path: { - project_id: string; - task_id: string; - }; + path?: never; cookie?: never; }; requestBody?: never; @@ -2349,31 +3547,25 @@ export interface operations { [name: string]: unknown; }; content: { - 
"application/json": components["schemas"]["TaskRun-Output"][]; - }; - }; - /** @description Validation Error */ - 422: { - headers: { - [name: string]: unknown; - }; - content: { - "application/json": components["schemas"]["HTTPValidationError"]; + "application/json": Record; }; }; }; }; - get_runs_summary_api_projects__project_id__tasks__task_id__runs_summaries_get: { + update_settings_api_settings_post: { parameters: { query?: never; header?: never; - path: { - project_id: string; - task_id: string; - }; + path?: never; cookie?: never; }; - requestBody?: never; + requestBody: { + content: { + "application/json": { + [key: string]: number | string | boolean | unknown[] | null; + }; + }; + }; responses: { /** @description Successful Response */ 200: { @@ -2381,7 +3573,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["RunSummary"][]; + "application/json": unknown; }; }; /** @description Validation Error */ @@ -2395,21 +3587,16 @@ export interface operations { }; }; }; - delete_runs_api_projects__project_id__tasks__task_id__runs_delete_post: { + read_setting_item_api_settings__item_id__get: { parameters: { query?: never; header?: never; path: { - project_id: string; - task_id: string; + item_id: string; }; cookie?: never; }; - requestBody: { - content: { - "application/json": string[]; - }; - }; + requestBody?: never; responses: { /** @description Successful Response */ 200: { @@ -2431,7 +3618,7 @@ export interface operations { }; }; }; - run_task_api_projects__project_id__tasks__task_id__run_post: { + generate_categories_api_projects__project_id__tasks__task_id__generate_categories_post: { parameters: { query?: never; header?: never; @@ -2443,7 +3630,7 @@ export interface operations { }; requestBody: { content: { - "application/json": components["schemas"]["RunTaskRequest"]; + "application/json": components["schemas"]["DataGenCategoriesApiInput"]; }; }; responses: { @@ -2467,7 +3654,7 @@ export interface operations { }; }; }; - edit_tags_api_projects__project_id__tasks__task_id__runs_edit_tags_post: { + generate_samples_api_projects__project_id__tasks__task_id__generate_samples_post: { parameters: { query?: never; header?: never; @@ -2479,7 +3666,7 @@ export interface operations { }; requestBody: { content: { - "application/json": components["schemas"]["Body_edit_tags_api_projects__project_id__tasks__task_id__runs_edit_tags_post"]; + "application/json": components["schemas"]["DataGenSampleApiInput"]; }; }; responses: { @@ -2489,7 +3676,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": unknown; + "application/json": components["schemas"]["TaskRun-Output"]; }; }; /** @description Validation Error */ @@ -2503,14 +3690,23 @@ export interface operations { }; }; }; - get_providers_models_api_providers_models_get: { + save_sample_api_projects__project_id__tasks__task_id__save_sample_post: { parameters: { - query?: never; + query?: { + session_id?: string | null; + }; header?: never; - path?: never; + path: { + project_id: string; + task_id: string; + }; cookie?: never; }; - requestBody?: never; + requestBody: { + content: { + "application/json": components["schemas"]["DataGenSaveSamplesApiInput"]; + }; + }; responses: { /** @description Successful Response */ 200: { @@ -2518,38 +3714,28 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["ProviderModels"]; + "application/json": components["schemas"]["TaskRun-Output"]; }; 
}; - }; - }; - get_available_models_api_available_models_get: { - parameters: { - query?: never; - header?: never; - path?: never; - cookie?: never; - }; - requestBody?: never; - responses: { - /** @description Successful Response */ - 200: { + /** @description Validation Error */ + 422: { headers: { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["AvailableModels"][]; + "application/json": components["schemas"]["HTTPValidationError"]; }; }; }; }; - connect_ollama_api_api_provider_ollama_connect_get: { + dataset_splits_api_projects__project_id__tasks__task_id__dataset_splits_get: { parameters: { - query?: { - custom_ollama_url?: string | null; - }; + query?: never; header?: never; - path?: never; + path: { + project_id: string; + task_id: string; + }; cookie?: never; }; requestBody?: never; @@ -2560,7 +3746,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["OllamaConnection"]; + "application/json": components["schemas"]["DatasetSplit"][]; }; }; /** @description Validation Error */ @@ -2574,18 +3760,21 @@ export interface operations { }; }; }; - save_openai_compatible_providers_api_provider_openai_compatible_post: { + create_dataset_split_api_projects__project_id__tasks__task_id__dataset_splits_post: { parameters: { - query: { - name: string; - base_url: string; - api_key: string; - }; + query?: never; header?: never; - path?: never; + path: { + project_id: string; + task_id: string; + }; cookie?: never; }; - requestBody?: never; + requestBody: { + content: { + "application/json": components["schemas"]["CreateDatasetSplitRequest"]; + }; + }; responses: { /** @description Successful Response */ 200: { @@ -2593,7 +3782,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": unknown; + "application/json": components["schemas"]["DatasetSplit"]; }; }; /** @description Validation Error */ @@ -2607,13 +3796,16 @@ export interface operations { }; }; }; - delete_openai_compatible_providers_api_provider_openai_compatible_delete: { + finetunes_api_projects__project_id__tasks__task_id__finetunes_get: { parameters: { - query: { - name: string; + query?: { + update_status?: boolean; }; header?: never; - path?: never; + path: { + project_id: string; + task_id: string; + }; cookie?: never; }; requestBody?: never; @@ -2624,7 +3816,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": unknown; + "application/json": components["schemas"]["Finetune"][]; }; }; /** @description Validation Error */ @@ -2638,16 +3830,19 @@ export interface operations { }; }; }; - connect_api_key_api_provider_connect_api_key_post: { + create_finetune_api_projects__project_id__tasks__task_id__finetunes_post: { parameters: { query?: never; header?: never; - path?: never; + path: { + project_id: string; + task_id: string; + }; cookie?: never; }; requestBody: { content: { - "application/json": Record; + "application/json": components["schemas"]["CreateFinetuneRequest"]; }; }; responses: { @@ -2657,7 +3852,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": unknown; + "application/json": components["schemas"]["Finetune"]; }; }; /** @description Validation Error */ @@ -2671,13 +3866,15 @@ export interface operations { }; }; }; - disconnect_api_key_api_provider_disconnect_api_key_post: { + finetune_api_projects__project_id__tasks__task_id__finetunes__finetune_id__get: { parameters: { - query: { - provider_id: 
string; - }; + query?: never; header?: never; - path?: never; + path: { + project_id: string; + task_id: string; + finetune_id: string; + }; cookie?: never; }; requestBody?: never; @@ -2688,7 +3885,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": unknown; + "application/json": components["schemas"]["FinetuneWithStatus"]; }; }; /** @description Validation Error */ @@ -2702,14 +3899,32 @@ export interface operations { }; }; }; - generate_prompt_api_projects__project_id__task__task_id__gen_prompt__prompt_generator__get: { + finetune_providers_api_finetune_providers_get: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody?: never; + responses: { + /** @description Successful Response */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["FinetuneProvider"][]; + }; + }; + }; + }; + finetune_hyperparameters_api_finetune_hyperparameters__provider_id__get: { parameters: { query?: never; header?: never; path: { - project_id: string; - task_id: string; - prompt_generator: string; + provider_id: string; }; cookie?: never; }; @@ -2721,7 +3936,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["PromptApiResponse"]; + "application/json": components["schemas"]["FineTuneParameter"][]; }; }; /** @description Validation Error */ @@ -2735,22 +3950,24 @@ export interface operations { }; }; }; - run_repair_api_projects__project_id__tasks__task_id__runs__run_id__run_repair_post: { + download_dataset_jsonl_api_download_dataset_jsonl_get: { parameters: { - query?: never; - header?: never; - path: { + query: { project_id: string; task_id: string; - run_id: string; + dataset_id: string; + split_name: string; + format_type: string; + data_strategy: string; + system_message_generator?: string | null; + custom_system_message?: string | null; + custom_thinking_instructions?: string | null; }; + header?: never; + path?: never; cookie?: never; }; - requestBody: { - content: { - "application/json": components["schemas"]["RepairTaskApiInput"]; - }; - }; + requestBody?: never; responses: { /** @description Successful Response */ 200: { @@ -2758,7 +3975,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["TaskRun-Output"]; + "application/json": unknown; }; }; /** @description Validation Error */ @@ -2772,20 +3989,19 @@ export interface operations { }; }; }; - post_repair_run_api_projects__project_id__tasks__task_id__runs__run_id__repair_post: { + create_evaluator_api_projects__project_id__tasks__task_id__create_evaluator_post: { parameters: { query?: never; header?: never; path: { project_id: string; task_id: string; - run_id: string; }; cookie?: never; }; requestBody: { content: { - "application/json": components["schemas"]["RepairRunPost"]; + "application/json": components["schemas"]["CreateEvaluatorRequest"]; }; }; responses: { @@ -2795,7 +4011,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["TaskRun-Output"]; + "application/json": components["schemas"]["Eval"]; }; }; /** @description Validation Error */ @@ -2809,40 +4025,17 @@ export interface operations { }; }; }; - read_settings_api_settings_get: { + get_task_run_configs_api_projects__project_id__tasks__task_id__task_run_configs_get: { parameters: { query?: never; header?: never; - path?: never; - cookie?: 
never; - }; - requestBody?: never; - responses: { - /** @description Successful Response */ - 200: { - headers: { - [name: string]: unknown; - }; - content: { - "application/json": Record; - }; + path: { + project_id: string; + task_id: string; }; - }; - }; - update_settings_api_settings_post: { - parameters: { - query?: never; - header?: never; - path?: never; cookie?: never; }; - requestBody: { - content: { - "application/json": { - [key: string]: number | string | boolean | unknown[] | null; - }; - }; - }; + requestBody?: never; responses: { /** @description Successful Response */ 200: { @@ -2850,7 +4043,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": unknown; + "application/json": components["schemas"]["TaskRunConfig"][]; }; }; /** @description Validation Error */ @@ -2864,12 +4057,14 @@ export interface operations { }; }; }; - read_setting_item_api_settings__item_id__get: { + get_eval_api_projects__project_id__tasks__task_id__eval__eval_id__get: { parameters: { query?: never; header?: never; path: { - item_id: string; + project_id: string; + task_id: string; + eval_id: string; }; cookie?: never; }; @@ -2881,7 +4076,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": unknown; + "application/json": components["schemas"]["Eval"]; }; }; /** @description Validation Error */ @@ -2895,7 +4090,7 @@ export interface operations { }; }; }; - generate_categories_api_projects__project_id__tasks__task_id__generate_categories_post: { + get_evals_api_projects__project_id__tasks__task_id__evals_get: { parameters: { query?: never; header?: never; @@ -2905,11 +4100,7 @@ export interface operations { }; cookie?: never; }; - requestBody: { - content: { - "application/json": components["schemas"]["DataGenCategoriesApiInput"]; - }; - }; + requestBody?: never; responses: { /** @description Successful Response */ 200: { @@ -2917,7 +4108,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["TaskRun-Output"]; + "application/json": components["schemas"]["Eval"][]; }; }; /** @description Validation Error */ @@ -2931,21 +4122,18 @@ export interface operations { }; }; }; - generate_samples_api_projects__project_id__tasks__task_id__generate_samples_post: { + get_eval_configs_api_projects__project_id__tasks__task_id__eval__eval_id__eval_configs_get: { parameters: { query?: never; header?: never; path: { project_id: string; task_id: string; + eval_id: string; }; cookie?: never; }; - requestBody: { - content: { - "application/json": components["schemas"]["DataGenSampleApiInput"]; - }; - }; + requestBody?: never; responses: { /** @description Successful Response */ 200: { @@ -2953,7 +4141,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["TaskRun-Output"]; + "application/json": components["schemas"]["EvalConfig"][]; }; }; /** @description Validation Error */ @@ -2967,23 +4155,19 @@ export interface operations { }; }; }; - save_sample_api_projects__project_id__tasks__task_id__save_sample_post: { + get_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__get: { parameters: { - query?: { - session_id?: string | null; - }; + query?: never; header?: never; path: { project_id: string; task_id: string; + eval_id: string; + eval_config_id: string; }; cookie?: never; }; - requestBody: { - content: { - "application/json": 
components["schemas"]["DataGenSaveSamplesApiInput"]; - }; - }; + requestBody?: never; responses: { /** @description Successful Response */ 200: { @@ -2991,7 +4175,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["TaskRun-Output"]; + "application/json": components["schemas"]["EvalConfig"]; }; }; /** @description Validation Error */ @@ -3005,7 +4189,7 @@ export interface operations { }; }; }; - dataset_splits_api_projects__project_id__tasks__task_id__dataset_splits_get: { + create_task_run_config_api_projects__project_id__tasks__task_id__task_run_config_post: { parameters: { query?: never; header?: never; @@ -3015,7 +4199,11 @@ export interface operations { }; cookie?: never; }; - requestBody?: never; + requestBody: { + content: { + "application/json": components["schemas"]["CreateTaskRunConfigRequest"]; + }; + }; responses: { /** @description Successful Response */ 200: { @@ -3023,7 +4211,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["DatasetSplit"][]; + "application/json": components["schemas"]["TaskRunConfig"]; }; }; /** @description Validation Error */ @@ -3037,19 +4225,20 @@ export interface operations { }; }; }; - create_dataset_split_api_projects__project_id__tasks__task_id__dataset_splits_post: { + create_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__create_eval_config_post: { parameters: { query?: never; header?: never; path: { project_id: string; task_id: string; + eval_id: string; }; cookie?: never; }; requestBody: { content: { - "application/json": components["schemas"]["CreateDatasetSplitRequest"]; + "application/json": components["schemas"]["CreateEvalConfigRequest"]; }; }; responses: { @@ -3059,7 +4248,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["DatasetSplit"]; + "application/json": components["schemas"]["EvalConfig"]; }; }; /** @description Validation Error */ @@ -3073,15 +4262,18 @@ export interface operations { }; }; }; - finetunes_api_projects__project_id__tasks__task_id__finetunes_get: { + run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_task_run_eval_get: { parameters: { query?: { - update_status?: boolean; + run_config_ids?: string[]; + all_run_configs?: boolean; }; header?: never; path: { project_id: string; task_id: string; + eval_id: string; + eval_config_id: string; }; cookie?: never; }; @@ -3093,7 +4285,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Finetune"][]; + "application/json": unknown; }; }; /** @description Validation Error */ @@ -3107,21 +4299,19 @@ export interface operations { }; }; }; - create_finetune_api_projects__project_id__tasks__task_id__finetunes_post: { + set_default_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__set_current_eval_config__eval_config_id__post: { parameters: { query?: never; header?: never; path: { project_id: string; task_id: string; + eval_id: string; + eval_config_id: string; }; cookie?: never; }; - requestBody: { - content: { - "application/json": components["schemas"]["CreateFinetuneRequest"]; - }; - }; + requestBody?: never; responses: { /** @description Successful Response */ 200: { @@ -3129,7 +4319,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["Finetune"]; + 
"application/json": components["schemas"]["Eval"]; }; }; /** @description Validation Error */ @@ -3143,14 +4333,14 @@ export interface operations { }; }; }; - finetune_api_projects__project_id__tasks__task_id__finetunes__finetune_id__get: { + run_eval_config_eval_api_projects__project_id__tasks__task_id__eval__eval_id__run_eval_config_eval_get: { parameters: { query?: never; header?: never; path: { project_id: string; task_id: string; - finetune_id: string; + eval_id: string; }; cookie?: never; }; @@ -3162,7 +4352,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["FinetuneWithStatus"]; + "application/json": unknown; }; }; /** @description Validation Error */ @@ -3176,11 +4366,17 @@ export interface operations { }; }; }; - finetune_providers_api_finetune_providers_get: { + get_eval_run_results_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_config__run_config_id__results_get: { parameters: { query?: never; header?: never; - path?: never; + path: { + project_id: string; + task_id: string; + eval_id: string; + eval_config_id: string; + run_config_id: string; + }; cookie?: never; }; requestBody?: never; @@ -3191,17 +4387,29 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["FinetuneProvider"][]; + "application/json": components["schemas"]["EvalRunResult"]; + }; + }; + /** @description Validation Error */ + 422: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["HTTPValidationError"]; }; }; }; }; - finetune_hyperparameters_api_finetune_hyperparameters__provider_id__get: { + get_eval_config_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__score_summary_get: { parameters: { query?: never; header?: never; path: { - provider_id: string; + project_id: string; + task_id: string; + eval_id: string; + eval_config_id: string; }; cookie?: never; }; @@ -3213,7 +4421,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": components["schemas"]["FineTuneParameter"][]; + "application/json": components["schemas"]["EvalResultSummary"]; }; }; /** @description Validation Error */ @@ -3227,21 +4435,15 @@ export interface operations { }; }; }; - download_dataset_jsonl_api_download_dataset_jsonl_get: { + get_eval_configs_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_configs_score_summary_get: { parameters: { - query: { + query?: never; + header?: never; + path: { project_id: string; task_id: string; - dataset_id: string; - split_name: string; - format_type: string; - data_strategy: string; - system_message_generator?: string | null; - custom_system_message?: string | null; - custom_thinking_instructions?: string | null; + eval_id: string; }; - header?: never; - path?: never; cookie?: never; }; requestBody?: never; @@ -3252,7 +4454,7 @@ export interface operations { [name: string]: unknown; }; content: { - "application/json": unknown; + "application/json": components["schemas"]["EvalConfigCompareSummary"]; }; }; /** @description Validation Error */ diff --git a/app/web_ui/src/lib/stores.ts b/app/web_ui/src/lib/stores.ts index dbd4d8f6..a86dbe25 100644 --- a/app/web_ui/src/lib/stores.ts +++ b/app/web_ui/src/lib/stores.ts @@ -119,6 +119,27 @@ function localStorageStore(key: string, initialValue: T) { return store } +export async function load_task( + project_id: string, + 
task_id: string, +): Promise { + const { + data, // only present if 2XX response + error, // only present if 4XX or 5XX response + } = await client.GET("/api/projects/{project_id}/tasks/{task_id}", { + params: { + path: { + project_id: project_id, + task_id: task_id, + }, + }, + }) + if (error) { + throw error + } + return data +} + export async function load_current_task(project: Project | null) { let task: Task | null = null try { @@ -126,21 +147,7 @@ export async function load_current_task(project: Project | null) { if (!project || !project?.id || !task_id) { return } - const { - data, // only present if 2XX response - error, // only present if 4XX or 5XX response - } = await client.GET("/api/projects/{project_id}/tasks/{task_id}", { - params: { - path: { - project_id: project.id, - task_id: task_id, - }, - }, - }) - if (error) { - throw error - } - task = data + task = await load_task(project.id, task_id) // Load the current task's prompts after 50ms, as it's not the most critical data setTimeout(() => { @@ -222,6 +229,29 @@ export function provider_name_from_id(provider_id: string): string { return provider?.provider_name || provider_id } +export function prompt_name_from_id(prompt_id: string): string { + // Attempt to lookup a nice name for the prompt. First from named prompts, then from generators + // Special case for fine-tuned prompts + let prompt_name: string | undefined = undefined + if (prompt_id && prompt_id.startsWith("fine_tune_prompt::")) { + prompt_name = "Fine-Tune Prompt" + } + if (!prompt_name) { + prompt_name = get(current_task_prompts)?.prompts.find( + (prompt) => prompt.id === prompt_id, + )?.name + } + if (!prompt_name) { + prompt_name = get(current_task_prompts)?.generators.find( + (generator) => generator.id === prompt_id, + )?.name + } + if (!prompt_name) { + prompt_name = prompt_id + } + return prompt_name +} + // Available prompts for the current export async function load_available_prompts() { const project = get(current_project) diff --git a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts index e5f98175..8419f6d7 100644 --- a/app/web_ui/src/lib/types.ts +++ b/app/web_ui/src/lib/types.ts @@ -6,6 +6,7 @@ export type Task = components["schemas"]["Task"] export type TaskRun = components["schemas"]["TaskRun-Input"] export type TaskRequirement = components["schemas"]["TaskRequirement"] export type TaskOutputRating = components["schemas"]["TaskOutputRating-Output"] +export type TaskOutputRatingType = components["schemas"]["TaskOutputRatingType"] export type RequirementRating = components["schemas"]["RequirementRating"] export type RatingType = components["schemas"]["TaskOutputRatingType"] export type AvailableModels = components["schemas"]["AvailableModels"] @@ -19,3 +20,14 @@ export type OllamaConnection = components["schemas"]["OllamaConnection"] export type RunSummary = components["schemas"]["RunSummary"] export type PromptResponse = components["schemas"]["PromptResponse"] export type FinetuneDataStrategy = components["schemas"]["FinetuneDataStrategy"] +export type EvalOutputScore = components["schemas"]["EvalOutputScore"] +export type EvalTemplateId = components["schemas"]["EvalTemplateId"] +export type Eval = components["schemas"]["Eval"] +export type EvalConfigType = components["schemas"]["EvalConfigType"] +export type EvalConfig = components["schemas"]["EvalConfig"] +export type TaskRunConfig = components["schemas"]["TaskRunConfig"] +export type EvalResultSummary = components["schemas"]["EvalResultSummary"] +export type EvalRunResult = 
components["schemas"]["EvalRunResult"] +export type EvalConfigCompareSummary = + components["schemas"]["EvalConfigCompareSummary"] +export type EvalRun = components["schemas"]["EvalRun"] diff --git a/app/web_ui/src/lib/ui/dialog.svelte b/app/web_ui/src/lib/ui/dialog.svelte index 9645b6f0..bda972e9 100644 --- a/app/web_ui/src/lib/ui/dialog.svelte +++ b/app/web_ui/src/lib/ui/dialog.svelte @@ -2,6 +2,7 @@ import { KilnError, createKilnError } from "$lib/utils/error_handlers" export let title: string + export let blur_background: boolean = false const id: string = "dialog-" + Math.random().toString(36) type ActionButton = { label: string @@ -9,6 +10,8 @@ asyncAction?: () => Promise action?: () => boolean isCancel?: boolean + isPrimary?: boolean + isError?: boolean disabled?: boolean } export let action_buttons: ActionButton[] = [] @@ -91,7 +94,10 @@ {:else} diff --git a/app/web_ui/src/lib/ui/info_tooltip.svelte b/app/web_ui/src/lib/ui/info_tooltip.svelte index 6b800a64..c4f7b1cf 100644 --- a/app/web_ui/src/lib/ui/info_tooltip.svelte +++ b/app/web_ui/src/lib/ui/info_tooltip.svelte @@ -2,7 +2,10 @@ export let tooltip_text: string - + {/if} + { + get_score_summary() + }} + /> + + + + + {#if show_incomplete_warning(score_summary)} +
+ +
+ {/if} + +
+ + + + + {#each evaluator.output_scores as output_score} + + {/each} + + + + {#each task_run_configs || [] as task_run_config} + {@const percent_complete = + score_summary?.run_config_percent_complete?.[ + "" + task_run_config.id + ]} + { + goto( + `/evals/${project_id}/${task_id}/${eval_id}/${current_eval_config_id}/${task_run_config.id}/run_result`, + ) + }} + > + + {#each evaluator.output_scores as output_score} + {@const score = + score_summary?.results?.["" + task_run_config.id]?.[ + string_to_json_key(output_score.name) + ]?.mean_score} + + {/each} + + {/each} + +
+
Run Method
+
How task output is generated
+
+ {output_score.name} + +
+
+ {task_run_config.name} +
+
+ {model_name( + task_run_config?.run_config_properties?.model_name, + $model_info, + )} +
+
+ {provider_name_from_id( + task_run_config?.run_config_properties + ?.model_provider_name, + )} +
+
+ Prompt Name: + {task_run_config.prompt?.name || + prompt_name_from_id( + task_run_config?.run_config_properties?.prompt_id, + )} +
+ {#if task_run_config?.prompt?.generator_id && task_run_config?.run_config_properties?.prompt_id?.startsWith("task_run_config::")} + +
+ Prompt Source: {prompt_name_from_id( + task_run_config?.prompt?.generator_id, + )} +
+ {/if} + {#if percent_complete} +
+ {(percent_complete * 100.0).toFixed(1)}% complete +
+ {:else if score_summary} + +
0% complete
+ {/if} +
+ {score != null ? score.toFixed(2) : "unknown"} +
+
+ {:else} +
Compare Run Methods
+
+ Find the best method of running your task by comparing various prompts, + models, fine-tunes, and more. Add one or more task run methods to get + started. +
+ + + {/if} + + {/if} + + + +

+ Define a method of running this task (model+prompt). +

+

+ Your evaluator can compare multiple run methods to find which one produces + the highest scores on your eval dataset. +
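The run methods compared on this page map to TaskRunConfig objects in the core library. A rough Python sketch, not part of this patch, with field names taken from their use elsewhere in this diff; the model id, prompt id, and the pre-existing task object are assumptions for illustration:

    from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig

    # A "run method" pairs a model with a prompt for the task being evaluated.
    run_config = TaskRunConfig(
        name="GPT-4o with simple prompt",  # hypothetical name
        parent=task,  # assumes an existing kiln_ai.datamodel.Task instance
        run_config_properties=RunConfigProperties(
            model_name="gpt_4o",                # assumed model id
            model_provider_name="openai",       # assumed provider id
            prompt_id="simple_prompt_builder",  # prompt generator id used elsewhere in this diff
        ),
    )
    run_config.save_to_file()  # persist it so the table above can list and run it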

+
+ + + {#if add_task_config_error} +
+ {add_task_config_error.getMessage() || "An unknown error occurred"} +
+ {/if} +
+
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.ts b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.ts new file mode 100644 index 00000000..9786e09d --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.ts @@ -0,0 +1 @@ +export const prerender = false diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.svelte new file mode 100644 index 00000000..8d85f241 --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.svelte @@ -0,0 +1,301 @@ + + + + {#if results_loading} +
+
+
+ {:else if results_error} +
+
Error Loading Eval Results
+
+ {results_error.getMessage() || "An unknown error occurred"} +
+
+ {:else if results && results.results.length === 0} +
+
Eval Results Empty
+
+ No results found for this run method. +
+
+ {:else if results} +
+
+
Task Run Method
+
+ How the task outputs were generated. +
+
+ {#each Object.entries(get_run_config_properties(results.run_config, results.eval)) as [prop_name, prop_value]} +
{prop_name}
+
+ {prop_value} +
+ {/each} +
+
+
+
Evaluation Method
+
+ How the task outputs were evaluated. +
+
+ {#each Object.entries(get_eval_properties(results.eval, results.eval_config)) as [prop_name, prop_value]} +
{prop_name}
+
+ {prop_value} +
+ {/each} +
+
+
+
+ + + + + + {#each results.eval.output_scores as score} + + {/each} + + + + {#each results.results as result} + + + + {#each results.eval.output_scores as score} + {@const score_value = + result.scores[string_to_json_key(score.name)]} + + {/each} + + {/each} + +
Input & OutputThinking + {score.name} + +
+
Input:
+
+ {result.input} +
+
Output:
+
+ {result.output} +
+
+ {#if result.intermediate_outputs?.reasoning || result.intermediate_outputs?.chain_of_thought} +
+
+ {result.intermediate_outputs?.reasoning || + result.intermediate_outputs?.chain_of_thought || + "N/A"} +
+
+
+ +
+
+
+
+ {:else} + N/A + {/if} +
+ {score_value ? score_value.toFixed(2) : "N/A"} +
+
+ {/if} +
+ + { + window.history.back() + return true + }, + }, + ]} +> +
+ +
+ Viewing these evaluation results may lead to data leakage - a fundamental + issue in machine learning where information from your test set + inadvertently influences your development process. When you examine + specific examples, you're likely to optimize for those particular cases + rather than developing solutions that generalize well to unseen data. +
+
+ Use our "Run" screen or fresh synthetic dataset generation if you want to + explore what type of content a run method is generating. +
+
+
+ + +
+ {displayed_result?.intermediate_outputs?.reasoning || + displayed_result?.intermediate_outputs?.chain_of_thought || + "N/A"} +
+
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.ts b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.ts new file mode 100644 index 00000000..9786e09d --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/[eval_config_id]/[run_config_id]/run_result/+page.ts @@ -0,0 +1 @@ +export const prerender = false diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte new file mode 100644 index 00000000..54c9fa4f --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte @@ -0,0 +1,374 @@ + + +
+ + {#if loading} +
+
+
+ {:else if loading_error} +
+
Error Loading Task Information
+
+ {loading_error?.getMessage() || "An unknown error occurred"} +
+
+ {:else} + +
Step 1: Select Evaluator Algorithm
+ +
+ {#each evaluator_algorithms as evaluator} + + {/each} +
+ + {#if selected_algo} +
+
+ Step 2: Select Eval Model +
+
+ Specify which model will be used to run the evaluation. This is + not necessarily the model that will be used to run the task. +
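In the datamodel the judge model is stored on the eval config itself, separate from any task run config. A hedged sketch, not part of this patch; field names follow their use elsewhere in this diff, and other constructor details are assumptions:

    from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType

    eval_config = EvalConfig(
        name="GPT-4o judge",                      # hypothetical name
        config_type=EvalConfigType.llm_as_judge,  # or EvalConfigType.g_eval
        model_name="gpt_4o",                      # model used to judge outputs (assumed id)
        model_provider="openai",                  # provider of the judge model (assumed id)
        properties={
            "task_description": "Summarize a news article in one paragraph.",  # example text
            "eval_steps": ["Check the summary is faithful to the article."],   # example step
        },
    )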
+
+ + + {/if} + + {#if selected_algo && combined_model_name} +
+
+ Step 3: Task Description +
+
+
+ Include a short description of what this task does. The + evaluator will use this for context. Keep it short, ideally one + sentence. Include requirements for the eval below, not in this + description. +
+
+
+ + +
+
+ Step 4: Evaluation Instructions +
+
+ This is a list of instructions to be used by the evaluator's + model. It will 'think' through each of these steps in order before + generating final scores. +
+ {#if evaluator?.template} +
+ We've pre-populated the evaluation steps for you based on the + template you selected ({evaluator.template}). Feel free to edit. +
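These steps feed the judge model's chain of thought. A small Python sketch of how GEvalTask (later in this diff) numbers them into thinking instructions; the example steps are illustrative, not from the patch:

    eval_steps = [
        "Check whether the output addresses the input.",
        "Check whether the output follows the task instructions.",
    ]
    cot_instructions = (
        "First, think step by step about the model's performance "
        "following these evaluation steps:\n\n"
    )
    for i, step in enumerate(eval_steps):
        cot_instructions += f"{i + 1}) {step}\n"
    # cot_instructions now lists the steps as "1) ...", "2) ..." for the judge to follow in order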
+ {/if} +
+ + + + + {/if} +
+ {/if} +
+
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.ts b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.ts new file mode 100644 index 00000000..9786e09d --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.ts @@ -0,0 +1 @@ +export const prerender = false diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte new file mode 100644 index 00000000..201f001e --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.svelte @@ -0,0 +1,618 @@ + + + { + score_legend_dialog?.show() + }, + }, + { + label: "Add Eval Method", + href: `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}/create_eval_config?next_page=eval_configs`, + }, + ]} +> + {#if loading} +
+
+
+ {:else if error} +
+
Error Loading
+
+ {error.getMessage() || "An unknown error occurred"} +
+
+ {:else if evaluator} +
+
+
Evaluator Properties
+
+ {#each get_eval_properties(evaluator, score_summary) as property} +
{property.name}
+
+ {property.value} +
+ {/each} +
+ {#if score_summary && score_summary.dataset_size > 0 && score_summary.dataset_size < 25} +
+ +
+ {/if} +
+
+
+ {#if eval_configs?.length} +
+
+
Correlation to Human Ratings
+
+ Each score in this table is a measure of how well the eval method + correlates to human ratings, using the selected scoring metric. +
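For example, one of the metrics offered here is mean absolute error between the human rating and the eval method's score for the same dataset item. A quick illustration with made-up numbers:

    human_ratings = [5, 4, 2]
    eval_scores = [4.5, 3.0, 2.5]
    mae = sum(abs(h - e) for h, e in zip(human_ratings, eval_scores)) / len(human_ratings)
    print(round(mae, 3))  # (0.5 + 1.0 + 0.5) / 3 = 0.667; lower means closer to human ratings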
+ {#if score_summary_error} +
+ {score_summary_error.getMessage() || + "An unknown error occurred fetching scores."} +
+ {/if} +
+
+ +
+ { + get_score_summary() + }} + /> +
+
+
+ + + + {#if incomplete_warning(score_summary).length} +
+ +
    + {#each incomplete_warning(score_summary) as warning} +
  • {warning}
  • + {/each} +
+
+ {/if} + +
+ + + + + + {#each evaluator.output_scores as output_score} + + {/each} + + + + {#each eval_configs || [] as eval_config} + {@const percent_complete = + score_summary?.eval_config_percent_complete?.[ + "" + eval_config.id + ]} + + + + {#each evaluator.output_scores as output_score} + {@const scores = + score_summary?.results?.["" + eval_config.id]?.[ + string_to_json_key(output_score.name) + ]} + + {/each} + + {/each} + +
+
Eval Method
+
How task output is evaluated
+
Eval Instructions + {output_score.name} + + + +
+
+ {eval_config.name} +
+
+ {eval_config_to_ui_name(eval_config.config_type)} +
+
+ {model_name(eval_config?.model_name, $model_info)} +
+
+ {provider_name_from_id(eval_config?.model_provider)} +
+ {#if percent_complete} +
+ {(percent_complete * 100.0).toFixed(1)}% complete +
+ {:else if score_summary} + +
0% complete
+ {/if} + {#if eval_config.id == evaluator.current_config_id} +
Default
+ {:else} + + {/if} +
+
+
+ +
+
+
+ +
+
+
+
+
+ {#if scores} + {#if score_type === "mae"} + {scores.mean_absolute_error.toFixed(2)} + {:else if score_type === "mse"} + {scores.mean_squared_error.toFixed(2)} + {:else if score_type === "norm_mse"} + {scores.mean_normalized_squared_error.toFixed(3)} + {:else if score_type === "norm_mae"} + {scores.mean_normalized_absolute_error.toFixed(3)} + {:else if score_type === "spearman"} + {#if scores.spearman_correlation} + {scores.spearman_correlation.toFixed(3)} + {:else} + N/A + {/if} + {:else if score_type === "pearson"} + {#if scores.pearson_correlation} + {scores.pearson_correlation.toFixed(3)} + {:else} + N/A + {/if} + {:else if score_type === "kendalltau"} + {#if scores.kendalltau_correlation} + {scores.kendalltau_correlation.toFixed(3)} + {:else} + N/A + {/if} + {/if} + {:else} + None + + {/if} +
+
+ {:else} + + {/if} +
+ {/if} +
+ + + + + + +
+ Each score measures the correlation between human ratings and the automated + eval method's scores. Use these scores to find the eval method which best + matches human ratings, and set it as your default eval method. +
+
+
Quick Start
+
+ Add a variety of eval methods with different options (model, algorithm, + instructions). Then click 'Run Eval' to generate scores from each eval + method on your eval method dataset. +
+
+ We suggest you use Kendall's Tau correlation scores to compare results. + Kendall's Tau scores range from -1.0 to 1.0. Higher values indicate higher + correlation between the human ratings and the automated eval method's + scores. The absolute value of Kendall's Tau scores will vary depending on + how subjective your task is. +
+
+ Finally, set the eval method with the highest Kendall's Tau score as your + default eval method. +
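A hedged sketch of what the Kendall's Tau metric computes, assuming scipy is available; the ratings are made up for illustration:

    from scipy import stats

    human_ratings = [5, 4, 3, 2, 1]
    eval_scores = [4.6, 4.1, 2.9, 2.2, 1.3]

    tau, p_value = stats.kendalltau(human_ratings, eval_scores)
    print(round(tau, 3))  # 1.0 here, since the two rankings agree perfectly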
+ +
Detailed Instructions
+
+ Read the docs for more information, a detailed walkthrough, and technical details about + each scoring metric. +
+
+
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.ts b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.ts new file mode 100644 index 00000000..9786e09d --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/+page.ts @@ -0,0 +1 @@ +export const prerender = false diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/eval_config_instruction.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/eval_config_instruction.svelte new file mode 100644 index 00000000..86a92616 --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/eval_configs/eval_config_instruction.svelte @@ -0,0 +1,36 @@ + + +{#if eval_config} + {@const eval_steps = get_eval_steps(eval_config)} +
+
Task Description:
+ {eval_config.properties["task_description"] || "No description provided."} +
+ {#if eval_steps} +
+
Evaluation Steps:
+
    + {#each eval_steps as step} +
  1. + + {step} + +
  2. + {/each} +
+
+ {/if} +{/if} diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/output_type_table_preview.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/output_type_table_preview.svelte new file mode 100644 index 00000000..a6dd2500 --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/output_type_table_preview.svelte @@ -0,0 +1,29 @@ + + +
+ {#if output_score_type === "five_star"} + 1 to 5 + + + + {:else if output_score_type === "pass_fail"} + pass/fail + + + + {:else if output_score_type === "pass_fail_critical"} + pass/fail/critical + + + + {:else} + {output_score_type} + {/if} +
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/run_eval.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/run_eval.svelte new file mode 100644 index 00000000..ebfdd0c8 --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/run_eval.svelte @@ -0,0 +1,188 @@ + + +{#if eval_state === "not_started"} + +{:else} + +{/if} + + +
+ {#if eval_state === "complete" && eval_complete_count == 0} +
No Data to Evaluate
+
+
+ If you want to add more data to your eval, + read the docs for instructions. +
+
+ {:else if eval_state === "complete"} +
Eval Complete 🎉
+ {:else if eval_state === "complete_with_errors"} +
Eval Complete with Errors
+ {:else if eval_state === "running"} +
+
Running...
+ {/if} +
+ {#if eval_total_count > 0} +
+ {eval_complete_count + eval_error_count} of {eval_total_count} +
+ {/if} + {#if eval_error_count > 0} +
+ {eval_error_count} error{eval_error_count === 1 ? "" : "s"} +
+ {/if} + {#if eval_run_error} +
+ {eval_run_error.getMessage() || "An unknown error occurred"} +
+ {/if} +
+
+
+ + +
+
Run this eval with the selected configuration?
+
Don't close this page if you want to monitor progress.
+ +
+
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte new file mode 100644 index 00000000..0109e350 --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte @@ -0,0 +1,337 @@ + + +
+ + {#if loading} +
+
+
+ {:else if loading_error} +
+
Error Loading Task Information
+
+ {loading_error?.getMessage() || "An unknown error occurred"} +
+
+ {:else if !selected_template} + + {:else} + 0 && output_scores[0].name)) + )} + > +
Part 1: Evaluator Details
+ + + +
+
+ Part 2: Evaluator Output Scores +
+
+ Define the scores that the evaluator will output. +
+ {#if selected_template !== "none"} + + {/if} +
+ + +
+
+
+ +
+
+ +
+
+
+ +
+
+
+ +
+
+ Part 3: Task Evaluation Dataset +
+
+ Specify which part of your dataset is used when evaluating + different methods of running your task (various prompts, models, + fine-tunes, etc.). +
+
+ + + {#if eval_dataset === "custom_tag"} + + {/if} + +
+
+ Part 4: Dataset to Compare Evaluation Methods +
+
+ Specify which part of your dataset is used when trying to find + the best evaluation method for this task. You'll rate these dataset + items, so we can compare the evaluator's ratings to your human + preferences. +
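In the core datamodel both dataset choices become tag-based filter ids on the Eval. A rough sketch mirroring the Eval construction in the tests later in this diff; the tag names are assumptions:

    from kiln_ai.datamodel.eval import Eval, EvalOutputScore
    from kiln_ai.datamodel.task import TaskOutputRatingType

    eval = Eval(
        name="Example Eval",
        eval_set_filter_id="tag::eval_set",    # items used to compare run methods (assumed tag)
        eval_configs_filter_id="tag::golden",  # human-rated items used to compare eval methods (assumed tag)
        output_scores=[
            EvalOutputScore(
                name="Overall Rating",
                instruction="The overall rating for the task output",
                type=TaskOutputRatingType.five_star,
            ),
        ],
    )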
+
+ + + {#if config_dataset === "custom_tag"} + + {/if} +
+ {/if} +
+
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.ts b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.ts new file mode 100644 index 00000000..9786e09d --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.ts @@ -0,0 +1 @@ +export const prerender = false diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/eval_template.ts b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/eval_template.ts new file mode 100644 index 00000000..ac7f8d8f --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/eval_template.ts @@ -0,0 +1,9 @@ +import type { EvalOutputScore, EvalTemplateId } from "$lib/types" + +export type EvalTemplateResult = { + // Server IDs are EvalTemplateId. We have a custom "none" value for the UI. + template_id: EvalTemplateId | "none" + name: string + description: string + output_scores: EvalOutputScore[] +} diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/select_eval_template.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/select_eval_template.svelte new file mode 100644 index 00000000..6383c335 --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/select_eval_template.svelte @@ -0,0 +1,223 @@ + + +
+
+ Select Evaluator Template +
+ {#each evaluator_template_descriptions as template_description} + + {/each} +
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/empty_eval.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/empty_eval.svelte new file mode 100644 index 00000000..53eb12e9 --- /dev/null +++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/empty_eval.svelte @@ -0,0 +1,74 @@ + + +
+
+
+ + + + + + + + +
+
+ Improve Quality and Move Faster with Evaluations +
+
Create powerful evaluators using LLMs to judge performance.
+
+ Quickly compare many approaches to find what works best for your task. +
+
+ Ensure quality over time, backtesting prior bugs and benchmarking new + approaches. +
+ + Create an Evaluator + + + Evals Guide + +
+
diff --git a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/+page.svelte b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/+page.svelte index b5c5adcb..3a720aa3 100644 --- a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/+page.svelte @@ -12,7 +12,7 @@ $: project_id = $page.params.project_id $: task_id = $page.params.task_id - $: is_empty = !!finetunes && finetunes.length == 0 + $: is_empty = !finetunes || finetunes.length == 0 let finetunes: Finetune[] | null = null let finetunes_error: KilnError | null = null diff --git a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte index 1e7100aa..40441dac 100644 --- a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte +++ b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/create_finetune/+page.svelte @@ -3,7 +3,7 @@ import FormContainer from "$lib/utils/form_container.svelte" import FormElement from "$lib/utils/form_element.svelte" import { page } from "$app/stores" - import { client } from "$lib/api_client" + import { client, base_url } from "$lib/api_client" import { KilnError, createKilnError } from "$lib/utils/error_handlers" import { onMount } from "svelte" import { formatDate } from "$lib/utils/formatters" @@ -31,7 +31,7 @@ let finetune_custom_system_prompt = "" let finetune_custom_thinking_instructions = "Think step by step, explaining your reasoning." - let system_prompt_method = "basic" + let system_prompt_method = "simple_prompt_builder" $: project_id = $page.params.project_id $: task_id = $page.params.task_id @@ -298,8 +298,7 @@ body: { // @ts-expect-error types are validated by the server dataset_split_type: new_dataset_split, - // @ts-expect-error types are validated by the server - filter_type: new_dataset_filter, + filter_id: new_dataset_filter, }, }, ) @@ -474,9 +473,7 @@ .map(([key, value]) => `${key}=${encodeURIComponent(value || "")}`) .join("&") - window.open( - "http://localhost:8757/api/download_dataset_jsonl?" + query_string, - ) + window.open(base_url + "/api/download_dataset_jsonl?" + query_string) } diff --git a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/empty_finetune.svelte b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/empty_finetune.svelte index 9ef51548..9779c733 100644 --- a/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/empty_finetune.svelte +++ b/app/web_ui/src/routes/(app)/fine_tune/[project_id]/[task_id]/empty_finetune.svelte @@ -76,16 +76,17 @@
- Fine tuning learns from your dataset to create custom models. + Fine-Tuning Learns from Your Dataset to Create Custom Models
- Fine tunes can be faster, cheaper and more accurate than standard models. + Fine-tuned models can be faster, cheaper and more accurate than standard + models.
- Create Fine-Tune + Create a Fine-Tune 0 + ? human_guidance + : undefined const { error: post_error, data, @@ -308,6 +313,7 @@ output_provider: provider, prompt_method, topic_path: topic_path || [], + human_guidance: save_sample_guidance, }, }, ) @@ -485,6 +491,18 @@ {/if} + {#if guidance_enabled && human_guidance.length > 0} + {#if prompt_method.includes("::")} + + {:else} + + {/if} + {/if} Saved Prompts {#if $current_task_prompts.prompts.length > 0}
- + + @@ -94,10 +94,32 @@ `/prompts/${project_id}/${task_id}/saved/${prompt.id}`, )} > - + + diff --git a/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/create/+page.svelte b/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/create/+page.svelte index 303fcead..5aa9fead 100644 --- a/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/create/+page.svelte +++ b/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/create/+page.svelte @@ -13,6 +13,7 @@ $: task_name = $current_task?.id == task_id ? $current_task?.name : "unknown" let prompt_name = "" + let prompt_description = "" let prompt = "" let is_chain_of_thought = false let chain_of_thought_instructions = @@ -35,6 +36,7 @@ }, body: { name: prompt_name, + description: prompt_description, prompt: prompt, chain_of_thought_instructions: is_chain_of_thought ? chain_of_thought_instructions @@ -51,7 +53,7 @@ // Success! Reload then navigate to the new prompt await load_available_prompts() - goto(`/prompts/${project_id}/${task_id}/saved/${data.id}`) + goto(`/prompts/${project_id}/${task_id}/saved/id::${data.id}`) } catch (e) { create_error = createKilnError(e) } finally { @@ -77,6 +79,13 @@ max_length={60} /> + + edit the task instructions or requirements, or add more data to your dataset by running the task, or add ratings and repairs to your diff --git a/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/saved/[prompt_id]/+page.svelte b/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/saved/[prompt_id]/+page.svelte index 4ac7b152..55d74481 100644 --- a/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/saved/[prompt_id]/+page.svelte +++ b/app/web_ui/src/routes/(app)/prompts/[project_id]/[task_id]/saved/[prompt_id]/+page.svelte @@ -1,6 +1,10 @@ @@ -29,9 +38,8 @@
{#if !$current_task_prompts}
@@ -55,14 +63,16 @@ {/if}
-
+
Details
{#each Object.entries(prompt_props) as [key, value]}
{key}
-
+
{value}
{/each} diff --git a/app/web_ui/src/routes/(app)/run/+page.svelte b/app/web_ui/src/routes/(app)/run/+page.svelte index e0c3c57e..c7324078 100644 --- a/app/web_ui/src/routes/(app)/run/+page.svelte +++ b/app/web_ui/src/routes/(app)/run/+page.svelte @@ -20,7 +20,7 @@ let input_form: RunInputForm - let prompt_method = "basic" + let prompt_method = "simple_prompt_builder" let model: string = $ui_state.selected_model $: model_name = model ? model.split("/").slice(1).join("/") : "" @@ -107,7 +107,7 @@ } else { if (prompt_method == "custom") { // Reset to basic, since custom is no longer available - prompt_method = "basic" + prompt_method = "simple_prompt_builder" } } } diff --git a/app/web_ui/src/routes/(app)/run/available_models_dropdown.svelte b/app/web_ui/src/routes/(app)/run/available_models_dropdown.svelte index 4350eab7..4a32d9a8 100644 --- a/app/web_ui/src/routes/(app)/run/available_models_dropdown.svelte +++ b/app/web_ui/src/routes/(app)/run/available_models_dropdown.svelte @@ -12,15 +12,28 @@ export let model: string = $ui_state.selected_model export let requires_structured_output: boolean = false export let requires_data_gen: boolean = false + export let requires_logprobs: boolean = false export let error_message: string | null = null $: $ui_state.selected_model = model $: model_options = format_model_options( $available_models || {}, requires_structured_output, requires_data_gen, + requires_logprobs, $ui_state.current_task_id, ) + // Export the parsed model name and provider name + export let model_name: string | null = null + export let provider_name: string | null = null + $: get_model_provider(model) + function get_model_provider(model_provider: string) { + model_name = model_provider + ? model_provider.split("/").slice(1).join("/") + : null + provider_name = model_provider ? model_provider.split("/")[0] : null + } + onMount(async () => { await load_available_models() }) @@ -31,6 +44,7 @@ providers: AvailableModels[], structured_output: boolean, requires_data_gen: boolean, + requires_logprobs: boolean, current_task_id: string | null, ): [string, [unknown, string][]][] { let options = [] @@ -63,6 +77,10 @@ unsupported_models.push([id, long_label]) continue } + if (requires_logprobs && !model.supports_logprobs) { + unsupported_models.push([id, long_label]) + continue + } model_list.push([id, model.name]) } if (model_list.length > 0) { @@ -75,9 +93,14 @@ } if (unsupported_models.length > 0) { - const not_recommended_label = requires_data_gen - ? 
"Not Recommended - Data Gen Not Supported" - : "Not Recommended - Structured Output Fails" + let not_recommended_label = "Not Recommended" + if (requires_data_gen) { + not_recommended_label = "Not Recommended - Data Gen Not Supported" + } else if (requires_structured_output) { + not_recommended_label = "Not Recommended - Structured Output Fails" + } else if (requires_logprobs) { + not_recommended_label = "Not Recommended - Logprobs Not Supported" + } options.push([not_recommended_label, unsupported_models]) } @@ -118,6 +141,10 @@ + {:else if requires_logprobs} + {:else if requires_structured_output} 0) { grouped_options.push(["Saved Prompts", static_prompts]) diff --git a/app/web_ui/src/routes/(app)/run/tag_dropdown.svelte b/app/web_ui/src/routes/(app)/run/tag_dropdown.svelte index 12cc63a2..6d2426c7 100644 --- a/app/web_ui/src/routes/(app)/run/tag_dropdown.svelte +++ b/app/web_ui/src/routes/(app)/run/tag_dropdown.svelte @@ -43,6 +43,7 @@ + diff --git a/app/web_ui/src/routes/(fullscreen)/setup/(setup)/connect_providers/connect_providers.svelte b/app/web_ui/src/routes/(fullscreen)/setup/(setup)/connect_providers/connect_providers.svelte index bd490638..b4ac6f9b 100644 --- a/app/web_ui/src/routes/(fullscreen)/setup/(setup)/connect_providers/connect_providers.svelte +++ b/app/web_ui/src/routes/(fullscreen)/setup/(setup)/connect_providers/connect_providers.svelte @@ -5,7 +5,7 @@ import FormElement from "$lib/utils/form_element.svelte" import FormContainer from "$lib/utils/form_container.svelte" import { KilnError, createKilnError } from "$lib/utils/error_handlers" - import { client } from "$lib/api_client" + import { client, base_url } from "$lib/api_client" type Provider = { name: string @@ -309,19 +309,16 @@ api_key_submitting = true try { const provider_id = api_key_provider ? 
api_key_provider.id : "" - let res = await fetch( - "http://localhost:8757/api/provider/connect_api_key", - { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ - provider: provider_id, - key_data: apiKeyData, - }), + let res = await fetch(base_url + "/api/provider/connect_api_key", { + method: "POST", + headers: { + "Content-Type": "application/json", }, - ) + body: JSON.stringify({ + provider: provider_id, + key_data: apiKeyData, + }), + }) let data = await res.json() if (res.status !== 200) { @@ -354,7 +351,7 @@ let custom_openai_compatible_providers: CustomOpenAICompatibleProvider[] = [] const check_existing_providers = async () => { try { - let res = await fetch("http://localhost:8757/api/settings") + let res = await fetch(base_url + "/api/settings") let data = await res.json() if (data["open_ai_api_key"]) { status.openai.connected = true diff --git a/libs/core/kiln_ai/adapters/adapter_registry.py b/libs/core/kiln_ai/adapters/adapter_registry.py index aea617af..ccdf7139 100644 --- a/libs/core/kiln_ai/adapters/adapter_registry.py +++ b/libs/core/kiln_ai/adapters/adapter_registry.py @@ -2,14 +2,14 @@ from kiln_ai import datamodel from kiln_ai.adapters.ml_model_list import ModelProviderName -from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter +from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, BaseAdapter from kiln_ai.adapters.model_adapters.langchain_adapters import LangchainAdapter from kiln_ai.adapters.model_adapters.openai_model_adapter import ( OpenAICompatibleAdapter, OpenAICompatibleConfig, ) -from kiln_ai.adapters.prompt_builders import BasePromptBuilder from kiln_ai.adapters.provider_tools import core_provider, openai_compatible_config +from kiln_ai.datamodel import PromptId from kiln_ai.utils.config import Config from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error @@ -18,8 +18,8 @@ def adapter_for_task( kiln_task: datamodel.Task, model_name: str, provider: ModelProviderName, - prompt_builder: BasePromptBuilder | None = None, - tags: list[str] | None = None, + prompt_id: PromptId | None = None, + base_adapter_config: AdapterConfig | None = None, ) -> BaseAdapter: # Get the provider to run. 
For things like the fine-tune provider, we want to run the underlying provider core_provider_name = core_provider(model_name, provider) @@ -40,8 +40,8 @@ def adapter_for_task( "X-Title": "KilnAI", }, ), - prompt_builder=prompt_builder, - tags=tags, + prompt_id=prompt_id, + base_adapter_config=base_adapter_config, ) case ModelProviderName.openai: return OpenAICompatibleAdapter( @@ -51,16 +51,16 @@ def adapter_for_task( model_name=model_name, provider_name=provider, ), - prompt_builder=prompt_builder, - tags=tags, + prompt_id=prompt_id, + base_adapter_config=base_adapter_config, ) case ModelProviderName.openai_compatible: config = openai_compatible_config(model_name) return OpenAICompatibleAdapter( kiln_task=kiln_task, config=config, - prompt_builder=prompt_builder, - tags=tags, + prompt_id=prompt_id, + base_adapter_config=base_adapter_config, ) # Use LangchainAdapter for the rest case ModelProviderName.groq: @@ -88,6 +88,6 @@ def adapter_for_task( kiln_task, model_name=model_name, provider=provider, - prompt_builder=prompt_builder, - tags=tags, + prompt_id=prompt_id, + base_adapter_config=base_adapter_config, ) diff --git a/libs/core/kiln_ai/adapters/data_gen/data_gen_task.py b/libs/core/kiln_ai/adapters/data_gen/data_gen_task.py index ddeadd0f..1bb5620a 100644 --- a/libs/core/kiln_ai/adapters/data_gen/data_gen_task.py +++ b/libs/core/kiln_ai/adapters/data_gen/data_gen_task.py @@ -183,3 +183,21 @@ def __init__(self, target_task: Task, num_samples: int = 8): input_json_schema=json.dumps(DataGenSampleTaskInput.model_json_schema()), output_json_schema=list_json_schema_for_task(target_task), ) + + +def wrap_task_with_guidance(original_instruction: str, guidance: str) -> str: + """Wrap the original instruction with human guidance. + + Args: + original_instruction: The original instruction to wrap + guidance: The human guidance to wrap the instruction with + """ + return f"""{original_instruction} + +# Special Instructions + +The above instructions are the original instructions for this task. For this execution, we've been given additional instructions. Follow both, but prioritize the additional instructions when they conflict. The additional instructions are: + +{guidance} + +""" diff --git a/libs/core/kiln_ai/adapters/eval/base_eval.py b/libs/core/kiln_ai/adapters/eval/base_eval.py new file mode 100644 index 00000000..8c1dcd09 --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/base_eval.py @@ -0,0 +1,164 @@ +import json +from abc import abstractmethod +from typing import Dict + +from kiln_ai.adapters.adapter_registry import adapter_for_task +from kiln_ai.adapters.ml_model_list import ModelProviderName +from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig +from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalScores +from kiln_ai.datamodel.json_schema import validate_schema +from kiln_ai.datamodel.task import RunConfig, TaskOutputRatingType, TaskRun +from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error + + +class BaseEval: + """ + Base class for all evals/evaluators. + + Should be subclassed, and the run_eval method implemented. 
+ """ + + def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None): + self.eval_config = eval_config + eval = eval_config.parent_eval() + if not eval: + raise ValueError("Eval config must have a parent eval") + self.eval = eval + task = self.eval.parent_task() + if not task: + raise ValueError("Eval must have a parent task") + self.target_task = task + self.score_schema = BaseEval.build_score_schema(eval, allow_float_scores=True) + self.run_config = run_config + + def model_and_provider(self) -> tuple[str, ModelProviderName]: + model_name = self.eval_config.model_name + provider = self.eval_config.model_provider + if ( + not model_name + or not provider + or not isinstance(model_name, str) + or not isinstance(provider, str) + or provider not in ModelProviderName.__members__ + ): + raise ValueError( + "Model name and provider must be set in the eval config model properties" + ) + + return model_name, ModelProviderName(provider) + + async def run_task_and_eval( + self, input: str + ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]: + """ + Runs the task on the provided run_config to generate fresh output, then runs the eval on that output. + """ + if self.run_config is None: + raise ValueError("Run config is required for run_task_and_eval") + + run_adapter = adapter_for_task( + self.target_task, + self.run_config.model_name, + ModelProviderName(self.run_config.model_provider_name), + base_adapter_config=AdapterConfig(allow_saving=False), + ) + + # Parse structured input if needed + parsed_input = input + if self.target_task.output_json_schema is not None: + parsed_input = json.loads(input) + + # we don't save by default here. We'll save manually after validating the output + run_output = await run_adapter.invoke(parsed_input) + + eval_output, intermediate_outputs = await self.run_eval(run_output) + validate_schema(eval_output, self.score_schema) + + return run_output, eval_output, intermediate_outputs + + @abstractmethod + async def run_eval( + self, task_run: TaskRun + ) -> tuple[EvalScores, Dict[str, str] | None]: + """ + Runs the eval on the given task run. + + Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs (eval thinking). + """ + pass + + @classmethod + def build_score_schema(cls, eval: Eval, allow_float_scores: bool = False) -> str: + """ + Build a JSON schema for the scoring output of the task requirements + + We allow 2 modes: allow_float_scores=True and allow_float_scores=False. + + allow_float_scores=False is used for the call to the model, and forces the model into selecting into discrete rating options (int 1-5, pass-fail, etc). + allow_float_scores=True is used for final score output (for example, after we take a g-eval weighting of the model's logprobs). A pass/fail rating might return 0.75 for likely pass (as opposed to 0.99 for near certain pass), or a 1-5 score might return 3.75. + """ + + # Note: python maintains order, which is good as we want the user defined order, and overall last + properties = {} + for output_score in eval.output_scores: + output_score_json_key = output_score.json_key() + + if len(output_score_json_key) == 0: + raise ValueError( + f"Invalid output score name: {output_score.name}. Can not be used as JSON schema key." 
+ ) + property: dict[str, str | int | float | list[str] | list[int]] = { + "title": output_score.name, + } + match output_score.type: + case TaskOutputRatingType.five_star: + if allow_float_scores: + property["type"] = "number" + property["minimum"] = 1 + property["maximum"] = 5 + else: + property["enum"] = [1, 2, 3, 4, 5] + + property["description"] = ( + f"{output_score.instruction}\n\nThe rating should be between 1 and 5, with 1 being the worst and 5 being the best." + ) + case TaskOutputRatingType.pass_fail: + if allow_float_scores: + property["type"] = "number" + property["minimum"] = 0 + property["maximum"] = 1 + property["description"] = ( + f"{output_score.instruction}\n\nThe rating should be between 0 and 1, with 0 being a failure and 1 being a pass." + ) + else: + property["enum"] = ["pass", "fail"] + property["description"] = ( + f"{output_score.instruction}\n\nThe rating should be either 'pass' or 'fail'." + ) + case TaskOutputRatingType.pass_fail_critical: + if allow_float_scores: + property["type"] = "number" + property["minimum"] = -1 + property["maximum"] = 1 + property["description"] = ( + f"{output_score.instruction}\n\nThe rating should be between -1 and 1, with 1 being a pass, 0 being a failure, and -1 being a critical failure (very severe failure)." + ) + else: + property["enum"] = ["pass", "fail", "critical"] + property["description"] = ( + f"{output_score.instruction}\n\nThe rating should be either 'pass', 'fail', or 'critical' where critical a very severe failure." + ) + case TaskOutputRatingType.custom: + # Skip custom rating types in evals + continue + case _: + raise_exhaustive_enum_error(output_score.type) + + properties[output_score_json_key] = property + + schema = { + "type": "object", + "properties": properties, + "required": list(properties.keys()), + } + return json.dumps(schema, ensure_ascii=False) diff --git a/libs/core/kiln_ai/adapters/eval/eval_runner.py b/libs/core/kiln_ai/adapters/eval/eval_runner.py new file mode 100644 index 00000000..d82593c9 --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/eval_runner.py @@ -0,0 +1,267 @@ +import asyncio +import logging +from dataclasses import dataclass +from typing import AsyncGenerator, Dict, List, Literal, Set + +from kiln_ai.adapters.eval.base_eval import BaseEval +from kiln_ai.adapters.eval.registry import eval_adapter_from_type +from kiln_ai.datamodel.basemodel import ID_TYPE +from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id +from kiln_ai.datamodel.eval import EvalConfig, EvalRun, EvalScores +from kiln_ai.datamodel.task import TaskRunConfig +from kiln_ai.datamodel.task_run import TaskRun + +logger = logging.getLogger(__name__) + + +@dataclass +class EvalJob: + item: TaskRun + type: Literal["task_run_eval", "eval_config_eval"] + # If type == "task_run_eval", both of these should be set. If type == "eval_config_eval", only eval_config should be set. + eval_config: EvalConfig + task_run_config: TaskRunConfig | None = None + + +@dataclass +class EvalProgress: + complete: int | None = None + total: int | None = None + errors: int | None = None + + +class EvalRunner: + """ + Runs an eval. Async execution is supported to make it faster when using remote/fast model providers. + + Can run an eval in 2 modes: + 1) eval_config_eval: evaluate an eval config using existing dataset items. + 2) task_run_eval: evaluate a range of task run configs, generating new run output using existing dataset item input. 
+ """ + + def __init__( + self, + eval_configs: List[EvalConfig], + run_configs: List[TaskRunConfig] | None, + eval_run_type: Literal["eval_config_eval", "task_run_eval"], + ): + if len(eval_configs) == 0: + raise ValueError("Eval runner requires at least one eval config") + target_eval = eval_configs[0].parent_eval() + if target_eval is None: + raise ValueError("Eval config requires a parent eval") + for eval_config in eval_configs: + parent_eval = eval_config.parent_eval() + if parent_eval is None: + raise ValueError("Eval config requires a parent eval") + if parent_eval.id != target_eval.id: + raise ValueError("All eval configs must have the same parent eval") + + target_task = target_eval.parent_task() + if target_task is None: + raise ValueError("Eval config requires a (grand)parent task") + + # Check that run_configs is compatible + if eval_run_type == "task_run_eval": + if run_configs is None or len(run_configs) == 0: + raise ValueError("Task run eval requires run configs") + for run_config in run_configs: + parent_task = run_config.parent_task() + if parent_task is None: + raise ValueError("All run configs must have a parent task") + if parent_task.id != target_task.id: + raise ValueError( + "Run config is not for the same task as the eval configs" + ) + else: + if run_configs is not None: + raise ValueError("Mode 'eval_config_eval' does not support run configs") + + self.eval_run_type = eval_run_type + self.eval_configs = eval_configs + self.run_configs = run_configs + self.task = target_task + self.eval = target_eval + + def collect_tasks(self) -> List[EvalJob]: + if self.eval_run_type == "eval_config_eval": + return self.collect_tasks_for_eval_config_eval() + else: + return self.collect_tasks_for_task_run_eval() + + def collect_tasks_for_eval_config_eval(self) -> List[EvalJob]: + """ + Collect all jobs for this run, excluding any that have already been run. + + This variant is used for mode "eval_config_eval", using existing dataset run data (input/output). + + The tasks: + - should be in the eval config set filter + - should not have already been run for this eval config + dataset item pair + """ + filter = dataset_filter_from_id(self.eval.eval_configs_filter_id) + + # already_run[eval_config_id][dataset_id] + already_run: Dict[ID_TYPE, Set[ID_TYPE]] = {} + for eval_config in self.eval_configs: + already_run[eval_config.id] = set() + for run in eval_config.runs(readonly=True): + already_run[eval_config.id].add(run.dataset_id) + + return [ + EvalJob( + item=task_run, + eval_config=eval_config, + type="eval_config_eval", + ) + for task_run in self.task.runs(readonly=True) + if filter(task_run) + for eval_config in self.eval_configs + if task_run.id not in already_run[eval_config.id] + ] + + def collect_tasks_for_task_run_eval(self) -> List[EvalJob]: + """ + Collect all jobs for this run, excluding any that have already been run. + + This variant is used for mode "task_run_eval", generating new run output using existing dataset item input. 
+ + The tasks: + - should be in the eval set filter + - should not have already been run for this eval config + run config + dataset item + """ + filter = dataset_filter_from_id(self.eval.eval_set_filter_id) + + # already_run[eval_config_id][run_config_id][dataset_id] + already_run: Dict[ID_TYPE, Dict[ID_TYPE, Set[ID_TYPE]]] = {} + for eval_config in self.eval_configs: + already_run[eval_config.id] = {} + for run_config in self.run_configs or []: + already_run[eval_config.id][run_config.id] = set() + for run in eval_config.runs(readonly=True): + if run.task_run_config_id is not None: + already_run[eval_config.id][run.task_run_config_id].add( + run.dataset_id + ) + + return [ + EvalJob( + item=task_run, + task_run_config=run_config, + type="task_run_eval", + eval_config=eval_config, + ) + for task_run in self.task.runs(readonly=True) + if filter(task_run) + for eval_config in self.eval_configs + for run_config in self.run_configs or [] + if task_run.id not in already_run[eval_config.id][run_config.id] + ] + + async def run(self, concurrency: int = 25) -> AsyncGenerator[EvalProgress, None]: + """ + Runs the configured eval run with parallel workers and yields progress updates. + """ + jobs = self.collect_tasks() + + complete = 0 + errors = 0 + total = len(jobs) + + # Send initial status + yield EvalProgress(complete=complete, total=total, errors=errors) + + worker_queue: asyncio.Queue[EvalJob] = asyncio.Queue() + for job in jobs: + worker_queue.put_nowait(job) + + # simple status queue to return progress. True=success, False=error + status_queue: asyncio.Queue[bool] = asyncio.Queue() + + workers = [] + for i in range(concurrency): + task = asyncio.create_task(self.run_worker(worker_queue, status_queue)) + workers.append(task) + + # Send status updates until workers are done, and they are all sent + while not status_queue.empty() or not all(worker.done() for worker in workers): + try: + # Use timeout to prevent hanging if all workers complete + # between our while condition check and get() + success = await asyncio.wait_for(status_queue.get(), timeout=0.1) + if success: + complete += 1 + else: + errors += 1 + + yield EvalProgress(complete=complete, total=total, errors=errors) + except asyncio.TimeoutError: + # Timeout is expected, just continue to recheck worker status + # Don't love this but beats sentinels for reliability + continue + + # These are redundant, but keeping them will catch async errors + await asyncio.gather(*workers) + await worker_queue.join() + + async def run_worker( + self, worker_queue: asyncio.Queue[EvalJob], status_queue: asyncio.Queue[bool] + ): + while True: + try: + job = worker_queue.get_nowait() + except asyncio.QueueEmpty: + # worker can end when the queue is empty + break + try: + success = await self.run_job(job) + await status_queue.put(success) + finally: + # Always mark the dequeued task as done, even on exceptions + worker_queue.task_done() + + async def run_job(self, job: EvalJob) -> bool: + try: + # Create the evaluator for this eval config/run config pair + evaluator = eval_adapter_from_type(job.eval_config.config_type)( + job.eval_config, + job.task_run_config.run_config() if job.task_run_config else None, + ) + if not isinstance(evaluator, BaseEval): + raise ValueError("Not able to create evaluator from eval config") + + task_output: str | None = None + scores: EvalScores | None = None + intermediate_outputs: Dict[str, str] | None = None + if job.type == "eval_config_eval": + # Eval config eval, we use the saved input from the task run, not invoking 
the task again + scores, intermediate_outputs = await evaluator.run_eval(job.item) + task_output = job.item.output.output + else: + # Task run eval, we invoke the task again to get a fresh output + ( + result_task_run, + scores, + intermediate_outputs, + ) = await evaluator.run_task_and_eval(job.item.input) + task_output = result_task_run.output.output + + # Save the job result + eval_run = EvalRun( + parent=job.eval_config, + task_run_config_id=job.task_run_config.id + if job.task_run_config + else None, + dataset_id=job.item.id, + eval_config_eval=job.type == "eval_config_eval", + scores=scores, + input=job.item.input, + output=task_output, + intermediate_outputs=intermediate_outputs, + ) + eval_run.save_to_file() + + return True + except Exception as e: + logger.error(f"Error running eval job for dataset item {job.item.id}: {e}") + return False diff --git a/libs/core/kiln_ai/adapters/eval/g_eval.py b/libs/core/kiln_ai/adapters/eval/g_eval.py new file mode 100644 index 00000000..83112b12 --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/g_eval.py @@ -0,0 +1,367 @@ +import math +from typing import Dict, List, Tuple + +from kiln_ai.adapters.adapter_registry import adapter_for_task +from kiln_ai.adapters.eval.base_eval import BaseEval +from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig, RunOutput +from kiln_ai.adapters.prompt_builders import PromptGenerators +from kiln_ai.datamodel import Project, Task, TaskRun +from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType, EvalScores +from kiln_ai.datamodel.task import RunConfig +from openai.types.chat import ChatCompletionTokenLogprob + +# all the tokens we score for, and their float scores. +TOKEN_TO_SCORE_MAP: Dict[str, float] = { + "1": 1.0, + "2": 2.0, + "3": 3.0, + "4": 4.0, + "5": 5.0, + "pass": 1.0, + "fail": 0.0, + "critical": -1.0, +} + + +class GEvalTask(Task, parent_of={}): + """ + Kiln task for executing a G-Eval. Can be run on any Kiln adapter which supports logprobs. + + Note G-Eval implements both G-Eval and LLM as Judge as they are very similar. + """ + + def __init__(self, eval_config: EvalConfig): + tmp_project = Project(name="GEval") + + # Build a simple LLM as Judge system instruction + system_instruction = "Your job to evaluate a model's performance on a task. Blocks will be marked with tags.\n" + # Optionally add a short task description + task_description = eval_config.properties.get("task_description", None) + if task_description: + system_instruction += f"\nThe task the model was given is as follows:\n\n{task_description}\n\n" + + # Build the COT eval instructions + cot_instructions = "First, think step by step about the model's performance following these evaluation steps:\n\n" + steps = eval_config.properties.get("eval_steps", None) + if not steps or not isinstance(steps, list): + raise ValueError("eval_steps must be a list") + for i, step in enumerate(steps): + cot_instructions += f"{i + 1}) {step}\n" + + eval = eval_config.parent_eval() + if not eval: + raise ValueError("Eval config must have a parent eval") + + # Build the output schema from the eval's target output scores. 
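        # Editor's note, not part of the original patch: for a single five_star score named
        # "Overall Rating", build_score_schema(eval, allow_float_scores=False) yields roughly:
        #   {
        #     "type": "object",
        #     "properties": {
        #       "overall_rating": {
        #         "title": "Overall Rating",
        #         "enum": [1, 2, 3, 4, 5],
        #         "description": "...The rating should be between 1 and 5..."
        #       }
        #     },
        #     "required": ["overall_rating"]
        #   }
        # With allow_float_scores=True the enum is replaced by {"type": "number", "minimum": 1, "maximum": 5}.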
+ # We restrict the LLM's output scoring schema to discrete scores (pass/fail/critical/1-5) - allow_float_scores=False + # However, the final scores from the evaluator can be a float (see later logprob calculation, which requires discrete token outputs) + output_schema = BaseEval.build_score_schema(eval, allow_float_scores=False) + + super().__init__( + name="GEval Task", + parent=tmp_project, + instruction=system_instruction, + thinking_instruction=cot_instructions, + output_json_schema=output_schema, + ) + + +class GEval(BaseEval): + """ + A evaluator which implements G-Eval and LLM as Judge. + + G-Eval is a method of evaluating the quality of a model's output. It is a weighted average of the scores of the tokens in the output. The weights are the log probabilities of the tokens in the output. https://arxiv.org/abs/2303.16634 + + LLM as Judge is a method of evaluating the quality of a model's output. It simply asks the LLM to score, and uses the returned output (no logprobs needed). Also called direct evaluation. + + @misc{liu2023gevalnlgevaluationusing, + title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, + author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, + year={2023}, + eprint={2303.16634}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2303.16634}, + } + """ + + def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None): + if ( + eval_config.config_type != EvalConfigType.g_eval + and eval_config.config_type != EvalConfigType.llm_as_judge + ): + raise ValueError( + f"GEval must be initialized with a GEval or LLM as Judge config_type. Got {eval_config.config_type}" + ) + + super().__init__(eval_config, run_config) + + self.geval_task = GEvalTask(eval_config) + + async def run_eval( + self, task_run: TaskRun + ) -> tuple[EvalScores, Dict[str, str] | None]: + """ + Run this eval on the given task run. + """ + + model_name, provider = self.model_and_provider() + + # Only fetch logprobs for G-Eval + # There are at most 5 valid rating tokens per rating type (five_star being largest), so 10 is more than enough to get to the very very unlikely + top_logprobs = ( + 10 if self.eval_config.config_type == EvalConfigType.g_eval else None + ) + + adapter = adapter_for_task( + self.geval_task, + model_name, + provider, + # We always use Simple COT for G-Eval and LLM as Judge + prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT, + base_adapter_config=AdapterConfig( + # Don't save this run into the task_runs. It will be saved into an eval_run where it belongs + allow_saving=False, + top_logprobs=top_logprobs, + ), + ) + + input = f"""The model was given the following input for the task: + +{task_run.input} + + +The model produced the following output for the task: + +{task_run.output} + +""" + + # We don't need the run, but invoke_returning_run_output() runs validations for us over _run() + _, run_output = await adapter.invoke_returning_run_output(input) + + if self.eval_config.config_type == EvalConfigType.llm_as_judge: + return self.build_llm_as_judge_score( + run_output + ), run_output.intermediate_outputs + else: + return self.build_g_eval_score(run_output), run_output.intermediate_outputs + + def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores: + """ + Build the LLM as Judge score for the given run and run output. 
+ """ + # Convert the output format we asked for (discreet values) to our float scores + scores: EvalScores = {} + if not isinstance(run_output.output, dict): + raise ValueError("LLM as Judge output must be a dictionary") + + for metric, score in run_output.output.items(): + token_score = self.score_from_token_string(f"{score}") + if token_score is None: + raise ValueError( + f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema." + ) + scores[metric] = token_score + return scores + + def build_g_eval_score(self, run_output: RunOutput) -> EvalScores: + """ + Build the G-Eval score for the given run and run output. + + We create a weighted average of each rating using the logprobs. + + @misc{liu2023gevalnlgevaluationusing, + title={G-Eval: NLG Evaluation using GPT-4 with Better Human Alignment}, + author={Yang Liu and Dan Iter and Yichong Xu and Shuohang Wang and Ruochen Xu and Chenguang Zhu}, + year={2023}, + eprint={2303.16634}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2303.16634}, + } + """ + # We use structured output + outputs = run_output.output + assert isinstance(outputs, dict) + + # Build raw string output from the logprobs, which is easier to work with than Dict for the next bit + raw_output = self.raw_output_from_logprobs(run_output) + + # find the offset the start of each metric in the raw output json + metrics: List[str] = list(outputs.keys()) + metric_offsets = self.metric_offsets(raw_output, metrics) + + final_scores: EvalScores = {} + for metric in metrics: + score = self.g_eval_single_metric( + run_output, metric, metric_offsets, raw_output + ) + if score is None: + raise ValueError( + f"No score found for metric: {metric}. The LLM failed to follow the scoring rubric/instructions/schema." + ) + final_scores[metric] = score + + return final_scores + + def g_eval_single_metric( + self, + run_output: RunOutput, + metric: str, + metric_offsets: Dict[str, int], + raw_output: str, + ) -> float | None: + """ + Run the G-Eval for a single metric. + + Scan the logprobs for the metric and return the weighted score of the rating token. + """ + + start_offset, end_offset = self.token_search_range( + raw_output, metric, metric_offsets + ) + + offset = 0 + + if ( + run_output.output_logprobs is None + or run_output.output_logprobs.content is None + ): + raise RuntimeError( + "No logprobs found for output - can not calculate g-eval" + ) + + # scan the tokens in the range, looking for the rating token + for _, chat_logprob in enumerate(run_output.output_logprobs.content): + if offset >= end_offset: + break + if offset >= start_offset: + score = self.rating_token_to_score(chat_logprob) + if score is not None: + return score + offset += len(chat_logprob.token) + + return None + + def raw_output_from_logprobs(self, run_output: RunOutput) -> str: + """ + Build the raw output string from the logprobs. Generate from logprobs so it's guaranteed to match the logprobs offsets + """ + if ( + run_output.output_logprobs is None + or run_output.output_logprobs.content is None + ): + raise RuntimeError( + "No logprobs found for output - can not calculate g-eval" + ) + + raw = "" + for chat_logprob in run_output.output_logprobs.content: + raw += chat_logprob.token + return raw + + def token_search_range( + self, raw_output: str, metric: str, metric_offsets: Dict[str, int] + ) -> Tuple[int, int]: + """ + Find the start and end offsets of the metric in the raw output. 
+
+        Start searching after the end of the target metric json entry ("overall_rating":), and before the start of the next metric ("some_other_score").
+        """
+        start_offset = metric_offsets[metric] + len(metric)
+
+        # Find the lowest end offset that is greater than the start offset
+        end_offset = len(raw_output)
+        for v in list(metric_offsets.values()):
+            if v < end_offset and v > start_offset:
+                end_offset = v
+
+        return start_offset, end_offset
+
+    def rating_token_to_score(
+        self, token_logprob: ChatCompletionTokenLogprob
+    ) -> float | None:
+        """
+        Convert a rating token to a score using a weighted average of the top logprobs.
+
+        Only includes tokens that have valid scores.
+
+        Some cleanup for upper case, whitespace and quotes. LLMs aren't always consistent.
+        """
+        primary_token_score = self.score_from_token_string(token_logprob.token)
+        # check this is a real rating token, it could just be the ": ", "," or whitespace
+        # compare to None explicitly: a valid "fail" rating maps to the score 0.0
+        if primary_token_score is None:
+            return None
+
+        total_score = 0.0
+        total_probability = 0.0
+
+        # Process all valid scoring tokens
+        for top_logprob in token_logprob.top_logprobs:
+            token_score = self.score_from_token_string(top_logprob.token)
+            if token_score is not None:
+                # Convert logprob to probability
+                probability = math.exp(top_logprob.logprob)
+                total_score += token_score * probability
+                total_probability += probability
+
+        if total_probability <= 0.0:
+            raise RuntimeError(
+                f"No valid scoring tokens found for {token_logprob.token}. This should never happen. Please file a bug if you see this."
+            )
+
+        # Normalize by the total probability of valid tokens (the LLM may have wanted to generate other non-rating tokens; these shouldn't lower the score of the rating tokens)
+        weighted_score = total_score / total_probability
+
+        return weighted_score
+
+    def score_from_token_string(self, token: str) -> float | None:
+        if token in TOKEN_TO_SCORE_MAP:
+            return TOKEN_TO_SCORE_MAP[token]
+
+        # handle more token variations like '"1"' and '"pass"' and ' paSS' and 'PASS'
+        unquoted_token = token.strip().strip('"').lower()
+        if unquoted_token in TOKEN_TO_SCORE_MAP:
+            return TOKEN_TO_SCORE_MAP[unquoted_token]
+
+        # handle numeric tokens like "1.0"
+        try:
+            float_value = float(token)
+            if float_value.is_integer():
+                str_token = str(int(float_value))
+                if str_token in TOKEN_TO_SCORE_MAP:
+                    return TOKEN_TO_SCORE_MAP[str_token]
+        except ValueError:
+            pass
+
+        return None
+
+    def metric_offsets(self, raw_output: str, metrics: List[str]) -> Dict[str, int]:
+        """
+        Find the offset to the start of each metric in the raw output json.
+
+        For the example json `{"overall_rating": 1}` it should return:
+        {
+            "overall_rating": 1  # the quoted metric name starts 1 character into the json string
+        }
+        """
+        metric_offsets: Dict[str, int] = {}
+        for metric in metrics:
+            # the quoted metric name is expected in the json, e.g. `{"overall_rating": 1}` has it at offset 1
+            metric_name = f'"{metric}"'
+
+            # we expect it exactly once
+            count = raw_output.count(metric_name)
+            if count != 1:
+                raise ValueError(
+                    f"Metric {metric} should appear exactly once in the output. 
Found {count} times" + ) + + offset = raw_output.find(metric_name) + if offset == -1: + raise ValueError(f"Metric {metric} not found in raw output") + metric_offsets[metric] = offset + return metric_offsets diff --git a/libs/core/kiln_ai/adapters/eval/registry.py b/libs/core/kiln_ai/adapters/eval/registry.py new file mode 100644 index 00000000..b4b6722e --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/registry.py @@ -0,0 +1,16 @@ +from kiln_ai.adapters.eval.base_eval import BaseEval +from kiln_ai.adapters.eval.g_eval import GEval +from kiln_ai.datamodel.eval import EvalConfigType +from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error + + +def eval_adapter_from_type(eval_config_type: EvalConfigType) -> type[BaseEval]: + match eval_config_type: + case EvalConfigType.g_eval: + return GEval + case EvalConfigType.llm_as_judge: + # Also implemented by GEval + return GEval + case _: + # type checking will catch missing cases + raise_exhaustive_enum_error(eval_config_type) diff --git a/libs/core/kiln_ai/adapters/eval/test_base_eval.py b/libs/core/kiln_ai/adapters/eval/test_base_eval.py new file mode 100644 index 00000000..93f9a8cc --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/test_base_eval.py @@ -0,0 +1,324 @@ +import json + +import pytest +from kiln_ai.adapters.eval.base_eval import BaseEval +from kiln_ai.datamodel import BasePrompt, DataSource, DataSourceType +from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalOutputScore +from kiln_ai.datamodel.task import ( + RunConfigProperties, + Task, + TaskOutputRatingType, + TaskRequirement, + TaskRunConfig, +) + + +def test_score_schema_five_star(): + # Create an eval with a five-star score + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="Quality Score", + instruction="Rate the quality", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore( + name="Overall Rating", + instruction="The overall rating for the task output", + type=TaskOutputRatingType.five_star, + ), + ], + ) + + schema_str = BaseEval.build_score_schema(eval) + schema = json.loads(schema_str) + + # Check basic schema structure + assert schema["type"] == "object" + assert schema["required"] == ["quality_score", "overall_rating"] + + # Check score property, and that it's an enum of 1-5 + score_prop = schema["properties"]["quality_score"] + assert score_prop["enum"] == [1, 2, 3, 4, 5] + assert "Quality Score" in score_prop["title"] + assert "Rate the quality" in score_prop["description"] + assert "between 1 and 5" in score_prop["description"] + + # Check overall rating property, and that it's an enum of 1-5 + assert "overall_rating" in schema["properties"] + overall = schema["properties"]["overall_rating"] + assert overall["enum"] == [1, 2, 3, 4, 5] + assert "Overall Rating" in overall["title"] + assert "The overall rating for the task output" in overall["description"] + assert "between 1 and 5" in overall["description"] + + +def test_score_schema_five_star_float(): + # Create an eval with a five-star score + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="Quality Score", + instruction="Rate the quality", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore( + name="Overall Rating", + instruction="The overall rating for the task output", + type=TaskOutputRatingType.five_star, + ), + ], + ) + + schema_str = BaseEval.build_score_schema(eval, 
allow_float_scores=True) + schema = json.loads(schema_str) + + # Check basic schema structure + assert schema["type"] == "object" + assert schema["required"] == ["quality_score", "overall_rating"] + + # Check score property + score_prop = schema["properties"]["quality_score"] + assert score_prop["type"] == "number" + assert score_prop["minimum"] == 1 + assert score_prop["maximum"] == 5 + assert "Quality Score" in score_prop["title"] + assert "Rate the quality" in score_prop["description"] + assert "between 1 and 5" in score_prop["description"] + + # Check overall rating property + assert "overall_rating" in schema["properties"] + overall = schema["properties"]["overall_rating"] + assert overall["type"] == "number" + assert overall["minimum"] == 1 + assert overall["maximum"] == 5 + assert "Overall Rating" in overall["title"] + assert "The overall rating for the task output" in overall["description"] + assert "between 1 and 5" in overall["description"] + + +def test_score_schema_pass_fail(): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="Pass Fail Test", + instruction="Check if it passes", + type=TaskOutputRatingType.pass_fail, + ), + EvalOutputScore( + name="Overall Rating", + instruction="The overall rating for the task output", + type=TaskOutputRatingType.five_star, + ), + ], + ) + + schema_str = BaseEval.build_score_schema(eval) + schema = json.loads(schema_str) + + score_prop = schema["properties"]["pass_fail_test"] + assert score_prop["enum"] == ["pass", "fail"] + assert "Pass Fail Test" in score_prop["title"] + assert "Check if it passes" in score_prop["description"] + assert "'pass' or 'fail'" in score_prop["description"] + + assert schema["properties"]["overall_rating"] is not None + + # Now check that we can allow float scores with the proper float structure + schema_str = BaseEval.build_score_schema(eval, allow_float_scores=True) + schema = json.loads(schema_str) + + score_prop = schema["properties"]["pass_fail_test"] + assert score_prop["type"] == "number" + assert score_prop["minimum"] == 0 + assert score_prop["maximum"] == 1 + assert ( + "between 0 and 1, with 0 being a failure and 1 being a pass" + in score_prop["description"] + ) + + +def test_score_schema_pass_fail_critical(): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="Critical Test", + instruction="Check for critical issues", + type=TaskOutputRatingType.pass_fail_critical, + ), + EvalOutputScore( + name="Overall Rating", + instruction="The overall rating for the task output", + type=TaskOutputRatingType.five_star, + ), + ], + ) + + schema_str = BaseEval.build_score_schema(eval) + schema = json.loads(schema_str) + + score_prop = schema["properties"]["critical_test"] + assert "enum" in score_prop + assert score_prop["enum"] == ["pass", "fail", "critical"] + assert "'pass', 'fail', or 'critical'" in score_prop["description"] + + assert schema["properties"]["overall_rating"] is not None + + # Now check that we can allow float scores with the proper float structure + schema_str = BaseEval.build_score_schema(eval, allow_float_scores=True) + schema = json.loads(schema_str) + + score_prop = schema["properties"]["critical_test"] + assert score_prop["type"] == "number" + assert score_prop["minimum"] == -1 + assert score_prop["maximum"] == 1 + assert "between -1 and 1, with 1 being a pass" in score_prop["description"] + + 
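The three schema tests above pin down the contract of `BaseEval.build_score_schema` for each rating type. As a quick standalone illustration (an editor's sketch distilled from these tests, not part of the patch; the `demo_eval` name is made up, while the imports, constructor arguments, and expected values mirror the fixtures and assertions in this file), the snippet below builds a similar Eval and prints the discrete vs. float forms of the generated schema:

# Editor's illustrative sketch, not part of the patch.
import json

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.eval import Eval, EvalOutputScore
from kiln_ai.datamodel.task import TaskOutputRatingType

demo_eval = Eval(
    name="Demo Eval",
    eval_set_filter_id="tag::tag1",
    eval_configs_filter_id="tag::tag2",
    output_scores=[
        EvalOutputScore(
            name="Pass Fail Test",
            instruction="Check if it passes",
            type=TaskOutputRatingType.pass_fail,
        ),
        EvalOutputScore(
            name="Overall Rating",
            instruction="The overall rating for the task output",
            type=TaskOutputRatingType.five_star,
        ),
    ],
)

# Discrete scores: an enum of rating tokens per metric
discrete = json.loads(BaseEval.build_score_schema(demo_eval))
print(discrete["properties"]["pass_fail_test"]["enum"])  # ["pass", "fail"]
print(discrete["properties"]["overall_rating"]["enum"])  # [1, 2, 3, 4, 5]

# Float scores: bounded numbers per metric
floats = json.loads(BaseEval.build_score_schema(demo_eval, allow_float_scores=True))
print(floats["properties"]["pass_fail_test"]["minimum"])  # 0
print(floats["properties"]["pass_fail_test"]["maximum"])  # 1
print(floats["properties"]["overall_rating"]["minimum"])  # 1
print(floats["properties"]["overall_rating"]["maximum"])  # 5

In the patch itself, the discrete form is what GEvalTask asks the judge model to emit (so ratings map to single tokens), while the float form describes the logprob-weighted scores the evaluator ultimately returns.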
+def test_score_schema_multiple_scores(): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="Quality", + instruction="Rate quality", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore( + name="Pass Check", + instruction="Basic pass check", + type=TaskOutputRatingType.pass_fail, + ), + EvalOutputScore( + name="Security", + instruction="Check security", + type=TaskOutputRatingType.pass_fail_critical, + ), + EvalOutputScore( + name="Overall Rating", + instruction="The overall rating for the task output", + type=TaskOutputRatingType.five_star, + ), + ], + ) + + schema_str = BaseEval.build_score_schema(eval) + schema = json.loads(schema_str) + + # Verify order is maintained + assert list(schema["properties"].keys()) == [ + "quality", + "pass_check", + "security", + "overall_rating", + ] + + +def test_score_schema_no_scores(): + # This should raise an error since at least one score is required + with pytest.raises(ValueError, match="output_scores are required"): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[], + ) + BaseEval.build_score_schema(eval) + + +class EvalTester(BaseEval): + """Test implementation of BaseEval""" + + async def run_eval(self, task_run): + return {"overall_rating": 5, "quality": 4} + + +@pytest.mark.paid +@pytest.mark.asyncio +async def test_run_method(): + task = Task( + name="Test Task", + instruction="Test instruction", + requirements=[ + TaskRequirement( + name="Quality", + instruction="Rate quality", + type=TaskOutputRatingType.five_star, + ), + ], + ) + + eval_config = EvalConfig( + name="Test Eval Config", + model=DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt-4o", + "model_provider": "openai", + "adapter_name": "test", + }, + ), + parent=Eval( + name="Test Eval", + parent=task, + eval_set_filter_id="all", + eval_configs_filter_id="all", + output_scores=[ + EvalOutputScore( + name="Quality", + instruction="Rate quality", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore( + name="Overall Rating", + instruction="The overall rating for the task output", + type=TaskOutputRatingType.five_star, + ), + ], + ), + prompt=BasePrompt( + name="Test Prompt", + prompt="Test prompt", + ), + properties={"eval_steps": ["test_step"]}, + ) + + run_config = TaskRunConfig( + name="Test Run Config", + run_config_properties=RunConfigProperties( + model_name="llama_3_1_8b", + model_provider_name="groq", + prompt_id="simple_prompt_builder", + ), + parent=task, + ) + + evaluator = EvalTester(eval_config, run_config.run_config()) + + # Run the evaluation + task_run, eval_scores = await evaluator.run("test input") + + # Verify task run was created + assert task_run.input == "test input" + assert isinstance(task_run.output.output, str) + + # Verify eval scores match schema and contain expected values + assert eval_scores["overall_rating"] == 5 + assert eval_scores["quality"] == 4 + + # Verify schema validation worked (these keys should exist per schema) + assert set(eval_scores.keys()) == {"overall_rating", "quality"} diff --git a/libs/core/kiln_ai/adapters/eval/test_eval_runner.py b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py new file mode 100644 index 00000000..07bf61bb --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/test_eval_runner.py @@ -0,0 +1,640 @@ +from typing import Dict +from unittest.mock import AsyncMock, patch + +import pytest +from 
kiln_ai.adapters.eval.base_eval import BaseEval +from kiln_ai.adapters.eval.eval_runner import EvalJob, EvalRunner +from kiln_ai.datamodel import ( + DataSource, + DataSourceType, + Task, + TaskOutput, + TaskOutputRatingType, + TaskRun, +) +from kiln_ai.datamodel.eval import ( + Eval, + EvalConfig, + EvalOutputScore, + EvalRun, + EvalScores, +) +from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig + + +@pytest.fixture +def mock_task(tmp_path): + task = Task( + name="test", + description="test", + instruction="do the thing", + path=tmp_path / "task.kiln", + ) + task.save_to_file() + return task + + +@pytest.fixture +def mock_eval(mock_task): + eval = Eval( + id="test", + name="test", + description="test", + eval_set_filter_id="all", + eval_configs_filter_id="all", + output_scores=[ + EvalOutputScore( + name="Accuracy", + instruction="Check if the output is accurate", + type=TaskOutputRatingType.pass_fail, + ), + ], + parent=mock_task, + ) + eval.save_to_file() + return eval + + +@pytest.fixture +def data_source(): + return DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "gpt-4", + "model_provider": "openai", + "adapter_name": "test_adapter", + }, + ) + + +@pytest.fixture +def mock_eval_config(mock_eval): + eval_config = EvalConfig( + name="test", + model_name="gpt-4", + model_provider="openai", + parent=mock_eval, + properties={ + "eval_steps": ["step1", "step2", "step3"], + }, + ) + eval_config.save_to_file() + return eval_config + + +@pytest.fixture +def mock_run_config( + mock_task, +): + rc = TaskRunConfig( + name="test", + description="test", + run_config_properties=RunConfigProperties( + model_name="gpt-4", + model_provider_name="openai", + prompt_id="simple_prompt_builder", + ), + parent=mock_task, + ) + rc.save_to_file() + return rc + + +@pytest.fixture +def mock_eval_runner(mock_eval, mock_task, mock_eval_config, mock_run_config): + return EvalRunner( + eval_configs=[mock_eval_config], + run_configs=[mock_run_config], + eval_run_type="task_run_eval", + ) + + +# Test with and without concurrency +@pytest.mark.parametrize("concurrency", [1, 25]) +@pytest.mark.asyncio +async def test_async_eval_runner_status_updates(mock_eval_runner, concurrency): + # Real async testing! 
+ + job_count = 50 + # Job objects are not the right type, but since we're mocking run_job, it doesn't matter + jobs = [{} for _ in range(job_count)] + + # Mock collect_tasks to return our fake jobs + mock_eval_runner.collect_tasks = lambda: jobs + + # Mock run_job to return True immediately + mock_eval_runner.run_job = AsyncMock(return_value=True) + + # Expect the status updates in order, and 1 for each job + expected_compelted_count = 0 + async for progress in mock_eval_runner.run(concurrency=concurrency): + assert progress.complete == expected_compelted_count + expected_compelted_count += 1 + assert progress.errors == 0 + assert progress.total == job_count + + # Verify last status update was complete + assert expected_compelted_count == job_count + 1 + + # Verify run_job was called for each job + assert mock_eval_runner.run_job.call_count == job_count + + +def test_collect_tasks_filtering( + mock_eval, + mock_eval_runner, + mock_task, + mock_eval_config, + data_source, + mock_run_config, +): + """Test that tasks are properly filtered based on eval filters""" + tags = ["tag1", "tag2", "tag3"] + task_runs = [] + for tag in tags: + # Create some task runs with different tags + task_run = TaskRun( + parent=mock_task, + input="test1", + input_source=data_source, + output=TaskOutput( + output="test1", + ), + tags=[tag], + ) + task_run.save_to_file() + task_runs.append(task_run) + + mock_eval.eval_set_filter_id = "tag::tag1" + mock_eval.eval_configs_filter_id = "tag::tag2" + + # Create a new runner of type task run eval + runner = EvalRunner( + eval_configs=[mock_eval_config], + run_configs=[mock_run_config], + eval_run_type="task_run_eval", + ) + jobs = runner.collect_tasks() + + # Should only get task_run1 jobs, the one with tag1 + assert len(jobs) == 1 + job = jobs[0] + # job should be the tag1 item, and setup as a task run eval for mock_run_config + assert job.item.tags == ["tag1"] + assert job.task_run_config.id == mock_run_config.id + assert job.eval_config.id == mock_eval_config.id + + # Change to an eval config set filter + runner = EvalRunner( + eval_configs=[mock_eval_config], + run_configs=None, + eval_run_type="eval_config_eval", + ) + jobs = runner.collect_tasks() + + # Should only get eval_config1 jobs + assert len(jobs) == 1 + job = jobs[0] + # job should be the tag2 item, and setup as a eval config eval for mock_eval_config + assert job.item.tags == ["tag2"] + assert job.eval_config.id == mock_eval_config.id + assert job.task_run_config is None + + # Add a second task run config, and call a new runner with multiple run configs + rc = TaskRunConfig( + name="test2", + description="test2", + run_config_properties=RunConfigProperties( + model_name="gpt-4", + model_provider_name="openai", + prompt_id="simple_prompt_builder", + ), + parent=mock_task, + ) + rc.save_to_file() + runner = EvalRunner( + eval_configs=[mock_eval_config], + run_configs=[mock_run_config, rc], + eval_run_type="task_run_eval", + ) + jobs = runner.collect_tasks() + assert len(jobs) == 2 + for job in jobs: + assert job.item.tags == ["tag1"] + assert job.task_run_config.id in [mock_run_config.id, rc.id] + assert job.eval_config.id == mock_eval_config.id + assert jobs[0].task_run_config.id != jobs[1].task_run_config.id + + # add a second eval config, and call a new runner with multiple eval configs + eval_config = EvalConfig( + name="test2", + model_name="gpt-4", + model_provider="openai", + parent=mock_eval, + properties={ + "eval_steps": ["step1", "step2", "step3"], + }, + ) + eval_config.save_to_file() + runner 
= EvalRunner( + eval_configs=[mock_eval_config, eval_config], + run_configs=None, + eval_run_type="eval_config_eval", + ) + jobs = runner.collect_tasks() + # Check we get 2 jobs, one for each eval config + assert len(jobs) == 2 + for job in jobs: + assert job.item.tags == ["tag2"] + assert job.eval_config.id in [mock_eval_config.id, eval_config.id] + assert job.task_run_config is None + assert jobs[0].eval_config.id != jobs[1].eval_config.id + + +def test_validate_same_task( + mock_eval_runner, + mock_task, + data_source, + tmp_path, + mock_eval_config, + mock_run_config, +): + # second eval config has a different task + eval_config = EvalConfig( + name="test2", + model_name="gpt-4", + model_provider="openai", + properties={ + "eval_steps": ["step1", "step2", "step3"], + }, + parent=Eval( + name="test", + description="test", + eval_set_filter_id="all", + eval_configs_filter_id="all", + output_scores=[ + EvalOutputScore( + name="Accuracy", + instruction="Check if the output is accurate", + type=TaskOutputRatingType.pass_fail, + ), + ], + parent=Task( + name="test", + description="test", + instruction="do the thing", + ), + ), + ) + + with pytest.raises( + ValueError, match="All eval configs must have the same parent eval" + ): + EvalRunner( + eval_configs=[mock_eval_config, eval_config], + run_configs=[mock_run_config], + eval_run_type="eval_config_eval", + ) + + +def test_collect_tasks_excludes_already_run_task_run_eval( + mock_eval_runner, mock_task, data_source, mock_eval_config, mock_run_config +): + """Test that already run tasks are excluded""" + # Create a task run + task_run = TaskRun( + parent=mock_task, + input="test", + input_source=data_source, + tags=["tag1"], + output=TaskOutput( + output="test", + ), + ) + task_run.save_to_file() + + # Prior to any eval runs, we should get the task run + jobs = mock_eval_runner.collect_tasks() + assert len(jobs) == 1 + assert jobs[0].item.id == task_run.id + assert jobs[0].task_run_config.id == mock_run_config.id + assert jobs[0].eval_config.id == mock_eval_config.id + + # Create an eval run for this task + EvalRun( + parent=mock_eval_config, + dataset_id=task_run.id, + task_run_config_id=mock_run_config.id, + input="test", + output="test", + scores={"accuracy": 1.0}, + ).save_to_file() + + # Set filter to match the task + mock_eval_runner.eval.eval_set_filter_id = "tag::tag1" + mock_eval_runner.eval.eval_configs_filter_id = "tag::nonexistent" + + jobs = mock_eval_runner.collect_tasks() + + # Should get no jobs since the task was already run + assert len(jobs) == 0 + + +def test_collect_tasks_excludes_already_run_eval_config_eval( + mock_task, data_source, mock_eval_config, mock_eval, mock_run_config +): + """Test that already run tasks are excluded""" + # Create a task run + task_run = TaskRun( + parent=mock_task, + input="test", + input_source=data_source, + tags=["tag1"], + output=TaskOutput( + output="test", + ), + ) + task_run.save_to_file() + + mock_eval.eval_set_filter_id = "tag::nonexistent" + mock_eval.eval_configs_filter_id = "tag::tag1" + mock_eval.save_to_file() + + # Prior to any eval runs, we should get 1 job for the eval config + runner = EvalRunner( + eval_configs=[mock_eval_config], + run_configs=None, + eval_run_type="eval_config_eval", + ) + jobs = runner.collect_tasks() + assert len(jobs) == 1 + assert jobs[0].item.id == task_run.id + assert jobs[0].eval_config.id == mock_eval_config.id + assert jobs[0].task_run_config is None + + # Create an eval run for this eval config task run pair, so now we should get no jobs 
(already run) + EvalRun( + parent=mock_eval_config, + dataset_id=task_run.id, + task_run_config_id=None, + eval_config_eval=True, + input="test", + output="test", + scores={ + "accuracy": 1.0, + }, + ).save_to_file() + + jobs = runner.collect_tasks() + + # Should get no jobs since the task was already run + assert len(jobs) == 0 + + +def test_collect_tasks_multiple_run_configs( + mock_eval_runner, mock_task, data_source, mock_run_config +): + """Test handling multiple run configs""" + # Create a task run + task_run = TaskRun( + parent=mock_task, + input="test", + input_source=data_source, + tags=["tag1"], + output=TaskOutput( + output="test", + ), + ) + task_run.save_to_file() + + # Add another run config + second_config = TaskRunConfig( + name="test2", + description="test2", + run_config_properties=RunConfigProperties( + model_name="gpt-3.5", + model_provider_name="openai", + prompt_id="simple_prompt_builder", + ), + parent=mock_task, + ) + second_config.save_to_file() + mock_eval_runner.run_configs.append(second_config) + + # Set filter to match the task + mock_eval_runner.eval.eval_set_filter_id = "tag::tag1" + + jobs = mock_eval_runner.collect_tasks() + + # Should get 2 jobs, one for each config + assert len(jobs) == 2 + assert {job.task_run_config.id for job in jobs} == { + second_config.id, + mock_run_config.id, + } + + +def test_collect_tasks_empty_cases(mock_eval_runner, mock_task, data_source): + """Test empty cases - no matching tasks or no tasks at all""" + # Set filter that won't match anything + mock_eval_runner.eval.eval_set_filter_id = "tag::nonexistent" + mock_eval_runner.eval.eval_configs_filter_id = "tag::nonexistent" + + jobs = mock_eval_runner.collect_tasks() + assert len(jobs) == 0 + + # Create task run with non-matching tag + task_run = TaskRun( + parent=mock_task, + input="test", + input_source=data_source, + tags=["other_tag"], + output=TaskOutput( + output="test", + ), + ) + task_run.save_to_file() + + jobs = mock_eval_runner.collect_tasks() + assert len(jobs) == 0 + + +@pytest.mark.asyncio +async def test_run_job_success_task_run_eval( + mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config +): + # Create a task run to evaluate + task_run = TaskRun( + parent=mock_task, + input="test input", + input_source=data_source, + output=TaskOutput(output="test output"), + ) + task_run.save_to_file() + + # Create eval job + job = EvalJob( + item=task_run, + task_run_config=mock_run_config, + type="task_run_eval", + eval_config=mock_eval_config, + ) + + # Mock the evaluator + mock_result_run = TaskRun( + input="test input", + input_source=data_source, + output=TaskOutput(output="evaluated output"), + intermediate_outputs={"intermediate_output": "intermediate output"}, + ) + mock_scores = {"accuracy": 0.95} + + class MockEvaluator(BaseEval): + async def run_task_and_eval(self, input_text): + return ( + mock_result_run, + mock_scores, + {"intermediate_output": "intermediate output"}, + ) + + with patch( + "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type", + return_value=lambda *args: MockEvaluator(*args), + ): + success = await mock_eval_runner.run_job(job) + + assert success is True + + # Verify eval run was saved + eval_runs = mock_eval_config.runs() + assert len(eval_runs) == 1 + saved_run = eval_runs[0] + assert saved_run.dataset_id == task_run.id + assert saved_run.task_run_config_id == mock_run_config.id + assert saved_run.scores == mock_scores + assert saved_run.input == "test input" + assert saved_run.output == "evaluated output" + assert 
saved_run.intermediate_outputs == { + "intermediate_output": "intermediate output" + } + assert saved_run.parent_eval_config().id == mock_eval_config.id + assert saved_run.eval_config_eval is False + + +@pytest.mark.asyncio +async def test_run_job_success_eval_config_eval( + mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config +): + # Create a task run to evaluate + task_run = TaskRun( + parent=mock_task, + input="test input", + input_source=data_source, + output=TaskOutput(output="test output"), + ) + task_run.save_to_file() + + # Create eval job + job = EvalJob( + item=task_run, + type="eval_config_eval", + eval_config=mock_eval_config, + ) + + # Mock the evaluator + mock_result_run = TaskRun( + input="test input", + input_source=data_source, + output=TaskOutput(output="evaluated output"), + ) + mock_scores: EvalScores = {"accuracy": 0.95} + + class MockEvaluator(BaseEval): + async def run_task_and_eval(self, input_text): + raise ValueError("Attempted to run task and eval for a config eval") + + async def run_eval( + self, task_run: TaskRun + ) -> tuple[EvalScores, Dict[str, str] | None]: + return mock_scores, {"intermediate_output": "intermediate output"} + + with patch( + "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type", + return_value=lambda *args: MockEvaluator(*args), + ): + success = await mock_eval_runner.run_job(job) + + assert success is True + + # Verify eval run was saved + eval_runs = mock_eval_config.runs() + assert len(eval_runs) == 1 + saved_run = eval_runs[0] + assert saved_run.dataset_id == task_run.id + assert saved_run.task_run_config_id is None + assert saved_run.scores == mock_scores + assert saved_run.input == "test input" + assert saved_run.output == "test output" + assert saved_run.parent_eval_config().id == mock_eval_config.id + assert saved_run.eval_config_eval is True + + +@pytest.mark.asyncio +async def test_run_job_invalid_evaluator( + mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config +): + task_run = TaskRun( + parent=mock_task, + input="test input", + input_source=data_source, + output=TaskOutput(output="test output"), + ) + task_run.save_to_file() + job = EvalJob( + item=task_run, + task_run_config=mock_run_config, + type="task_run_eval", + eval_config=mock_eval_config, + ) + + # Return an invalid evaluator type + with patch( + "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type", + return_value=lambda *args: object(), + ): + success = await mock_eval_runner.run_job(job) + + assert success is False + assert len(mock_eval_config.runs()) == 0 + + +@pytest.mark.asyncio +async def test_run_job_evaluator_error( + mock_eval_runner, mock_task, data_source, mock_run_config, mock_eval_config +): + task_run = TaskRun( + parent=mock_task, + input="test input", + input_source=data_source, + output=TaskOutput(output="test output"), + ) + task_run.save_to_file() + job = EvalJob( + item=task_run, + task_run_config=mock_run_config, + type="task_run_eval", + eval_config=mock_eval_config, + ) + + class ErrorEvaluator(BaseEval): + async def run_task_and_eval(self, input_text): + raise ValueError("Evaluation failed") + + with patch( + "kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type", + return_value=lambda *args: ErrorEvaluator(*args), + ): + success = await mock_eval_runner.run_job(job) + + assert success is False + assert len(mock_eval_config.runs()) == 0 diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval.py b/libs/core/kiln_ai/adapters/eval/test_g_eval.py new file mode 100644 index 
00000000..e6c7fdf7 --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval.py @@ -0,0 +1,497 @@ +import math +import pickle + +import pytest +from kiln_ai.adapters.eval.g_eval import TOKEN_TO_SCORE_MAP, GEval, GEvalTask +from kiln_ai.adapters.eval.test_g_eval_data import serialized_run_output +from kiln_ai.adapters.ml_model_list import built_in_models +from kiln_ai.adapters.model_adapters.base_adapter import RunOutput +from kiln_ai.adapters.test_prompt_adaptors import get_all_models_and_providers +from kiln_ai.datamodel import ( + DataSource, + DataSourceType, + Project, + Task, + TaskOutput, + TaskOutputRatingType, + TaskRequirement, + TaskRun, +) +from kiln_ai.datamodel.eval import Eval, EvalConfig, EvalConfigType, EvalOutputScore +from kiln_ai.datamodel.task import RunConfig + + +@pytest.fixture +def test_task(tmp_path): + project = Project(name="Test Project", path=tmp_path / "project.kiln") + project.save_to_file() + + task = Task( + name="Joke Generator", + instruction="Generate a joke, given a topic", + parent=project, + requirements=[ + TaskRequirement( + name="Topic alignment", + instruction="Rate how aligned the joke is to the provided topic", + type=TaskOutputRatingType.five_star, + ), + TaskRequirement( + name="Appropriateness", + instruction="Check if the content is appropriate for all audiences", + type=TaskOutputRatingType.pass_fail, + ), + ], + ) + task.save_to_file() + return task + + +@pytest.fixture +def test_eval_config(test_task): + eval = Eval( + name="Joke Quality Eval", + parent=test_task, + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="appropriateness", + type=TaskOutputRatingType.pass_fail, + ), + EvalOutputScore( + name="topic_alignment", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore( + name="overall_rating", + type=TaskOutputRatingType.five_star, + ), + ], + ) + eval.save_to_file() + + config = EvalConfig( + name="Llama 8b Joke Generator Eval", + parent=eval, + config_type=EvalConfigType.g_eval, + model_name="gpt_4o_mini", + model_provider="openai", + properties={ + "eval_steps": [ + "Is the joke funny?", + "Is the content appropriate for all audiences?", + "Is the joke culturally sensitive?", + "Is the joke politically correct?", + "Is the joke aligned with the provided topic?", + ] + }, + ) + config.save_to_file() + return config + + +@pytest.fixture +def test_run_config(test_task): + return RunConfig( + model_name="llama_3_1_8b", + model_provider_name="groq", + task=test_task, + prompt_id="simple_prompt_builder", + ) + + +@pytest.fixture +def test_task_run(test_task): + task_run = TaskRun( + parent=test_task, + input="Tell me a chicken joke", + input_source=DataSource( + type=DataSourceType.human, properties={"created_by": "test_user"} + ), + output=TaskOutput( + output="Why did the chicken cross the road? 
To get to the other side!", + source=DataSource( + type=DataSourceType.synthetic, + properties={ + "model_name": "llama_3_1_8b", + "model_provider": "groq", + "adapter_name": "langchain", + }, + ), + ), + ) + task_run.save_to_file() + return task_run + + +async def run_g_eval_test( + test_task, + test_eval_config, + test_task_run, + config_type, + test_run_config, + model_name: str | None = None, + provider_name: str | None = None, +): + # Create G-Eval instance + test_eval_config.config_type = config_type + if model_name is not None and provider_name is not None: + test_eval_config.model_name = model_name + test_eval_config.model_provider = provider_name + g_eval = GEval(test_eval_config, test_run_config) + + # Run the evaluation + eval_result, intermediate_outputs = await g_eval.run_eval(test_task_run) + + # Should have 1 intermediate output (thinking or chain of thought) + assert len(intermediate_outputs) == 1 + + assert "topic_alignment" in eval_result + topic_alignment = eval_result["topic_alignment"] + assert isinstance(topic_alignment, float) + assert 1 <= topic_alignment <= 5 + + assert "appropriateness" in eval_result + appropriateness = eval_result["appropriateness"] + assert isinstance(appropriateness, float) + assert appropriateness >= 0.0 and appropriateness <= 1.0 + + assert "overall_rating" in eval_result + overall = eval_result["overall_rating"] + assert isinstance(overall, float) + assert 1.0 <= overall <= 5.0 + + +@pytest.mark.parametrize( + "config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge] +) +@pytest.mark.paid +async def test_run_g_eval_paid( + test_task, test_eval_config, test_task_run, config_type, test_run_config +): + await run_g_eval_test( + test_task, test_eval_config, test_task_run, config_type, test_run_config + ) + + +@pytest.mark.parametrize( + "config_type", [EvalConfigType.g_eval, EvalConfigType.llm_as_judge] +) +@pytest.mark.paid +async def test_run_g_eval_e2e( + test_task, test_eval_config, test_task_run, config_type, test_run_config +): + # Create G-Eval instance + test_eval_config.config_type = config_type + g_eval = GEval(test_eval_config, test_run_config) + + # Run the evaluation + task_run, scores, intermediate_outputs = await g_eval.run_task_and_eval("chickens") + + # Verify the evaluation results + assert isinstance(scores, dict) + + # Should have 1 intermediate output (thinking or chain of thought) + assert len(intermediate_outputs) == 1 + + assert "topic_alignment" in scores + topic_alignment = scores["topic_alignment"] + assert isinstance(topic_alignment, float) + assert 1 <= topic_alignment <= 5 + + assert "appropriateness" in scores + appropriateness = scores["appropriateness"] + assert isinstance(appropriateness, float) + assert appropriateness >= 0.0 and appropriateness <= 1.0 + + assert "overall_rating" in scores + overall = scores["overall_rating"] + assert isinstance(overall, float) + assert 1.0 <= overall <= 5.0 + + +async def test_g_eval_logprobs( + test_task, test_eval_config, test_task_run, test_run_config +): + # Create G-Eval instance + run_output = pickle.loads(serialized_run_output) + assert isinstance(run_output, RunOutput) + assert run_output.output_logprobs is not None + g_eval = GEval(test_eval_config, test_run_config) + result = g_eval.build_g_eval_score(run_output) + + assert "overall_rating" in result + overall = result["overall_rating"] + assert isinstance(overall, float) + assert overall >= 1.0 and overall <= 5.0 + # Confirm weighted value, and confirm the approx isn't why it's passing + assert 
pytest.approx(overall) == 3.99752802363598 + assert pytest.approx(overall) != 4.0 + + # Check topic_alignment + assert "topic_alignment" in result + topic_alignment = result["topic_alignment"] + assert isinstance(topic_alignment, float) + assert topic_alignment >= 1.0 and topic_alignment <= 5.0 + # Confirm weighted value, and confirm the approx isn't why it's passing + assert pytest.approx(topic_alignment) == 4.999983298485167 + assert pytest.approx(topic_alignment) != 5.0 + + # Check appropriateness + assert "appropriateness" in result + appropriateness = result["appropriateness"] + assert isinstance(appropriateness, float) + assert appropriateness >= 0.0 and appropriateness <= 1.0 + # Fail chance so low, we need to specify the precision + assert pytest.approx(appropriateness, 1e-12) == 0.9999999999572222 + assert pytest.approx(appropriateness, 1e-12) != 1.0 + + +async def test_llm_as_judge( + test_task, test_eval_config, test_task_run, test_run_config +): + # Create G-Eval instance, set to LLM as Judge + run_output = pickle.loads(serialized_run_output) + test_eval_config.config_type = EvalConfigType.llm_as_judge + g_eval = GEval(test_eval_config, test_run_config) + + assert isinstance(run_output, RunOutput) + assert run_output.output_logprobs is not None + result = g_eval.build_llm_as_judge_score(run_output) + + # unlike g_eval, llm_as_judge returns the main token converted to our float scores + assert result["overall_rating"] == 4.0 + assert result["topic_alignment"] == 5.0 + assert result["appropriateness"] == 1.0 + + +def test_token_case(): + # we assume the token is lower case in the logprobs token fuzzy matching code. This will catch if we ever add a token that's not. + for token in TOKEN_TO_SCORE_MAP.keys(): + assert token.lower() == token + + +def test_metric_offsets_and_search_ranges( + test_eval_config, test_run_config, test_task_run +): + g_eval = GEval(test_eval_config, test_run_config) + raw_output = ( + '{"topic_alignment": 4, "appropriateness": "pass", "overall_rating": 5}' + ) + metrics = ["topic_alignment", "appropriateness", "overall_rating"] + + offsets = g_eval.metric_offsets(raw_output, metrics) + + assert len(offsets) == 3 + assert offsets["topic_alignment"] == 1 # Position after opening { + assert offsets["appropriateness"] == 23 # Position after "appropriateness": + assert offsets["overall_rating"] == 50 # Position after "overall_rating": + + # Test search ranges + + # Test first metric + start, end = g_eval.token_search_range(raw_output, "topic_alignment", offsets) + assert start == 16 # Position after "topic_alignment" + assert end == 23 # Position after "appropriateness" + + # Test middle metric + start, end = g_eval.token_search_range(raw_output, "appropriateness", offsets) + assert start == 38 # Position after "appropriateness" + assert end == 50 # Position after "overall_rating" + + # Test last metric + start, end = g_eval.token_search_range(raw_output, "overall_rating", offsets) + assert start == 64 # Position after "overall_rating" + assert end == len(raw_output) # end of string + + +def test_metric_offsets_invalid(test_eval_config, test_run_config): + g_eval = GEval(test_eval_config, test_run_config) + raw_output = '{"topic_alignment": 4, "topic_alignment": 5}' + metrics = ["topic_alignment"] + + with pytest.raises(ValueError, match="should appear exactly once"): + g_eval.metric_offsets(raw_output, metrics) + + raw_output = '{"something_else": 4}' + with pytest.raises(ValueError, match="should appear exactly once"): + g_eval.metric_offsets(raw_output, 
metrics) + + +@pytest.mark.parametrize( + "token_string,expected_score", + [ + # Direct matches + ("1", 1.0), + ("5", 5.0), + ("pass", 1.0), + ("fail", 0.0), + ("critical", -1.0), + # Variations with quotes and spacing + ('"1"', 1.0), + (" pass ", 1.0), + ("PASS", 1.0), + ('"FAIL"', 0.0), + ('"pAss"', 1.0), + ("1.0", 1.0), + ("2.0", 2.0), + ("3.0", 3.0), + ("4.0", 4.0), + ("5.0", 5.0), + ("5.0000", 5.0), + # Invalid tokens + ("invalid", None), + ("6", None), + ("0", None), + ("", None), + ("4.9999999", None), + ], +) +def test_score_from_token_string( + test_eval_config, token_string, expected_score, test_run_config +): + g_eval = GEval(test_eval_config, test_run_config) + assert g_eval.score_from_token_string(token_string) == expected_score + + +def test_raw_output_from_logprobs(test_eval_config, test_run_config): + g_eval = GEval(test_eval_config, test_run_config) + + # Create a minimal RunOutput with some logprobs + class MockLogprob: + def __init__(self, token): + self.token = token + + class MockLogprobs: + def __init__(self): + self.content = [ + MockLogprob('{"'), + MockLogprob("score"), + MockLogprob('": '), + MockLogprob("5"), + MockLogprob("}"), + ] + + run_output = RunOutput( + output={"score": 5}, + output_logprobs=MockLogprobs(), + intermediate_outputs={}, + ) + + raw = g_eval.raw_output_from_logprobs(run_output) + assert raw == '{"score": 5}' + + +def test_rating_token_to_score(test_eval_config, test_run_config): + g_eval = GEval(test_eval_config, test_run_config) + + class MockTopLogprob: + def __init__(self, token, logprob): + self.token = token + self.logprob = logprob + + class MockTokenLogprob: + def __init__(self, token, top_logprobs): + self.token = token + self.top_logprobs = [MockTopLogprob(t, lp) for t, lp in top_logprobs] + + # Test single token case + token_logprob = MockTokenLogprob("5", [("5", 0.0)]) # log(1) = 0 + score = g_eval.rating_token_to_score(token_logprob) + assert score == 5.0 + + # Test weighted average case + token_logprob = MockTokenLogprob( + "4", + [ + ("4", math.log(0.6)), # 60% probability + ("5", math.log(0.4)), # 40% probability + ], + ) + score = g_eval.rating_token_to_score(token_logprob) + assert pytest.approx(score) == 4.4 # (4 * 0.6 + 5 * 0.4) + + # Test invalid token + token_logprob = MockTokenLogprob(":", [(":", 0.0)]) + assert g_eval.rating_token_to_score(token_logprob) is None + + # Test no valid scoring tokens + token_logprob = MockTokenLogprob("5", []) + with pytest.raises(RuntimeError, match="No valid scoring tokens found"): + g_eval.rating_token_to_score(token_logprob) + + +def test_g_eval_system_instruction(): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore(name="overall_rating", type=TaskOutputRatingType.five_star), + ], + ) + eval_config = EvalConfig( + parent=eval, + name="Test Eval", + model_name="gpt_4o_mini", + model_provider="openai", + config_type=EvalConfigType.g_eval, + properties={ + "task_description": "Test task description", + "eval_steps": ["Step 1", "Step 2"], + }, + ) + g_eval_task = GEvalTask(eval_config) + assert g_eval_task.instruction == ( + "Your job to evaluate a model's performance on a task. 
Blocks will be marked with tags.\n\n" + "The task the model was given is as follows:\n\n" + "Test task description\n" + "\n" + ) + + # Test without task description + eval_config.properties = {"eval_steps": ["Step 1", "Step 2"]} + g_eval_task = GEvalTask(eval_config) + assert ( + g_eval_task.instruction + == "Your job to evaluate a model's performance on a task. Blocks will be marked with tags.\n" + ) + + +def check_supports_logprobs(model_name: str, provider_name: str): + for model in built_in_models: + if model.name != model_name: + continue + for provider in model.providers: + if provider.name != provider_name: + continue + if not provider.supports_logprobs: + pytest.skip( + f"Skipping {model.name} {provider.name} because it does not support logprobs" + ) + return + raise RuntimeError(f"No model {model_name} {provider_name} found") + + +@pytest.mark.paid +@pytest.mark.ollama +@pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers()) +async def test_all_built_in_models_logprobs_geval( + model_name, + provider_name, + test_task, + test_eval_config, + test_task_run, + test_run_config, +): + check_supports_logprobs(model_name, provider_name) + await run_g_eval_test( + test_task, + test_eval_config, + test_task_run, + EvalConfigType.g_eval, + test_run_config, + model_name, + provider_name.value, + ) diff --git a/libs/core/kiln_ai/adapters/eval/test_g_eval_data.py b/libs/core/kiln_ai/adapters/eval/test_g_eval_data.py new file mode 100644 index 00000000..a36bdc49 --- /dev/null +++ b/libs/core/kiln_ai/adapters/eval/test_g_eval_data.py @@ -0,0 +1,4 @@ +# Saved a real RunOutput, with real logprobs via: +# po = pickle.dumps(result) +# print(f"\n\nPickled result: \n{po}\n\n") +serialized_run_output = b"\x80\x04\x95\xe8:\x00\x00\x00\x00\x00\x00\x8c\x1bkiln_ai.adapters.run_output\x94\x8c\tRunOutput\x94\x93\x94)\x81\x94}\x94(\x8c\x06output\x94}\x94(\x8c\x0ftopic_alignment\x94K\x05\x8c\x0fappropriateness\x94\x8c\x04pass\x94\x8c\x0eoverall_rating\x94K\x04u\x8c\x14intermediate_outputs\x94}\x94\x8c\x10chain_of_thought\x94X\x08\x06\x00\x001) **Is the joke funny?**\n The joke \"Why did the chicken cross the road? To get to the other side!\" is a classic joke that many consider to be humorous due to its simplicity and unexpected nature. However, as it's a very well-known punchline, some may find it less amusing for being overly familiar. Overall, it can elicit a chuckle, but it may not be considered original or particularly funny by everyone.\n\n2) **Is the content appropriate for all audiences?**\n Yes, the joke is appropriate for all audiences. It does not contain any offensive language or themes, making it suitable for children and adults alike.\n\n3) **Is the joke culturally sensitive?**\n Yes, the joke is culturally sensitive. It does not touch on any potentially sensitive topics or stereotypes. It\xe2\x80\x99s a universal humor that transcends cultural boundaries.\n\n4) **Is the joke politically correct?**\n Yes, the joke is politically correct. It does not make any political statements or discriminatory remarks. It simply presents a light-hearted situation involving a chicken, which is neutral and inoffensive.\n\n5) **Is the joke aligned with the provided topic?**\n Yes, the joke is aligned with the provided topic of a \"chicken joke.\" It directly references a chicken and is structured as a joke, fulfilling the prompt's requirements.\n\nIn summary, while the joke may lack originality, it is appropriate, sensitive, politically correct, and aligns well with the topic. 
The humor level can vary depending on personal taste, but overall, it meets the evaluation criteria.\x94s\x8c\x0foutput_logprobs\x94\x8c!openai.types.chat.chat_completion\x94\x8c\x0eChoiceLogprobs\x94\x93\x94)\x81\x94}\x94(\x8c\x08__dict__\x94}\x94(\x8c\x07content\x94]\x94(\x8c/openai.types.chat.chat_completion_token_logprob\x94\x8c\x1aChatCompletionTokenLogprob\x94\x93\x94)\x81\x94}\x94(h\x15}\x94(\x8c\x05token\x94\x8c\x02{\"\x94\x8c\x05bytes\x94]\x94(K{K\"e\x8c\x07logprob\x94G\xbf5\xfe.\xba\x97\xb1\xde\x8c\x0ctop_logprobs\x94]\x94(h\x19\x8c\nTopLogprob\x94\x93\x94)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02{\"\x94h!]\x94(K{K\"eh#G\xbf5\xfe.\xba\x97\xb1\xdeu\x8c\x12__pydantic_extra__\x94}\x94\x8c\x17__pydantic_fields_set__\x94\x8f\x94(h\x1fh#h!\x90\x8c\x14__pydantic_private__\x94Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02{\n\x94h!]\x94(K{K\neh#G\xc0 \x00,\nJ\x05\xdeuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01{\x94h!]\x94K{ah#G\xc0/\x80,\nJ\x05\xdeuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03{\r\n\x94h!]\x94(K{K\rK\neh#G\xc01@\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03{\n\n\x94h!]\x94(K{K\nK\neh#G\xc03\xc0\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 {\"\x94h!]\x94(K K{K\"eh#G\xc05\x00\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 {\n\x94h!]\x94(K K{K\neh#G\xc06\xe0\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01\n\x94h!]\x94K\nah#G\xc07\xe0\x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02{}\x94h!]\x94(K{K}eh#G\xc08 \x16\x05%\x02\xefuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05topic\x94h!]\x94(KtKoKpKiKceh#G\xbfS\x8a+<\x99\xb9Oh$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05topic\x94h!]\x94(KtKoKpKiKceh#G\xbfS\x8a+<\x99\xb9Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07overall\x94h!]\x94(KoKvKeKrKaKlKleh#G\xc0\x1b\x818\xa2\x07\xfd%uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04type\x94h!]\x94(KtKyKpKeeh#G\xc0!\x80\x9c^o\xf7\xe0uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03top\x94h!]\x94(KtKoKpeh#G\xc0-\x00\x9c^o\xf7\xe0uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05theme\x94h!]\x94(KtKhKeKmKeeh#G\xc0.\x00\x9c^o\xf7\xe0uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05total\x94h!]\x94(KtKoKtKaKleh#G\xc00\x00N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06 topic\x94h!]\x94(K 
KtKoKpKiKceh#G\xc00@N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05Topic\x94h!]\x94(KTKoKpKiKceh#G\xc00\xa0N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x0bappropriate\x94h!]\x94(KaKpKpKrKoKpKrKiKaKtKeeh#G\xc00\xa0N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05title\x94h!]\x94(KtKiKtKlKeeh#G\xc00\xc0N\x1eq\x04Ouh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n_alignment\x94h!]\x94(K_KaKlKiKgKnKmKeKnKteh#G\xbe\xc1\x9f\x96D1\x8b\xf2h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n_alignment\x94h!]\x94(K_KaKlKiKgKnKmKeKnKteh#G\xbe\xc1\x9f\x96D1\x8b\xf2uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n alignment\x94h!]\x94(K KaKlKiKgKnKmKeKnKteh#G\xc0+\x00\x00C\x1b\xde\x83uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06_align\x94h!]\x94(K_KaKlKiKgKneh#G\xc0.@\x00C\x1b\xde\x83uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n_ALIGNMENT\x94h!]\x94(K_KAKLKIKGKNKMKEKNKTeh#G\xc0.\x80\x00C\x1b\xde\x83uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\tAlignment\x94h!]\x94(KAKlKiKgKnKmKeKnKteh#G\xc00\xc0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x0b_assignment\x94h!]\x94(K_KaKsKsKiKgKnKmKeKnKteh#G\xc01@\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\n Alignment\x94h!]\x94(K KAKlKiKgKnKmKeKnKteh#G\xc01@\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03_al\x94h!]\x94(K_KaKleh#G\xc01\xa0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x0b_similarity\x94h!]\x94(K_KsKiKmKiKlKaKrKiKtKyeh#G\xc01\xe0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07_rating\x94h!]\x94(K_KrKaKtKiKnKgeh#G\xc02 \x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\xe2\x80\x9d:\x94h!]\x94(K\xe2K\x80K\x9dK:eh#G\xc02@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\\\":\x94h!]\x94(K\\K\"K:eh#G\xc03\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02':\x94h!]\x94(K'K:eh#G\xc04 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":\"\x94h!]\x94(K\"K:K\"eh#G\xc04\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02`:\x94h!]\x94(K`K:eh#G\xc05\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06\xe2\x80\x9d\xef\xbc\x9a\x94h!]\x94(K\xe2K\x80K\x9dK\xefK\xbcK\x9aeh#G\xc06`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\xc2\xbb:\x94h!]\x94(K\xc2K\xbbK:eh#G\xc07 
\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03+\":\x94h!]\x94(K+K\"K:eh#G\xc07@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":[\x94h!]\x94(K\"K:K[eh#G\xc07\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x015\x94h!]\x94K5ah#G\xbe\xf1\x93\xc3:x\xd77h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1fjY\x01\x00\x00h!]\x94K5ah#G\xbe\xf1\x93\xc3:x\xd77uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x014\x94h!]\x94K4ah#G\xc0&\x00\x02:l\xe3Xuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01 \x94h!]\x94K ah#G\xc01\xc0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x013\x94h!]\x94K3ah#G\xc07\xc0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02 \x94h!]\x94(K K eh#G\xc08\xa0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01-\x94h!]\x94K-ah#G\xc0; \x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01f\x94h!]\x94Kfah#G\xc0;0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01\t\x94h!]\x94K\tah#G\xc0;0\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 \x94h!]\x94(K K K eh#G\xc0;@\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01\"\x94h!]\x94K\"ah#G\xc0;p\x01\x1d6q\xacuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\"\x94h!]\x94(K,K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\"\x94h!]\x94(K,K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01,\x94h!]\x94K,ah#G\xc05\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 ,\"\x94h!]\x94(K K,K\"eh#G\xc06`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03,\"\\\x94h!]\x94(K,K\"K\\eh#G\xc07`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03,\"%\x94h!]\x94(K,K\"K%eh#G\xc07\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03,\",\x94h!]\x94(K,K\"K,eh#G\xc0:\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\n\x94h!]\x94(K,K\neh#G\xc0:\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03,\r\n\x94h!]\x94(K,K\rK\neh#G\xc0< 
\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fj\x8f\x01\x00\x00h!]\x94K\tah#G\xc0=p\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01.\x94h!]\x94K.ah#G\xc0>@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07appropr\x94h!]\x94(KaKpKpKrKoKpKreh#G\xbf\x1d\x1c\xa4[(\x97\x91h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07appropr\x94h!]\x94(KaKpKpKrKoKpKreh#G\xbf\x1d\x1c\xa4[(\x97\x91uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05appro\x94h!]\x94(KaKpKpKrKoeh#G\xc0\"\x80\x0e\x8c\x8a\xbd^uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x0bappropriate\x94h!]\x94(KaKpKpKrKoKpKrKiKaKtKeeh#G\xc0&\x80\x0e\x8c\x8a\xbd^uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\t appropri\x94h!]\x94(K KaKpKpKrKoKpKrKieh#G\xc0*\x80\x0e\x8c\x8a\xbd^uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02in\x94h!]\x94(KiKneh#G\xc00\xe0\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05Appro\x94h!]\x94(KAKpKpKrKoeh#G\xc02\x80\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06 Appro\x94h!]\x94(K KAKpKpKrKoeh#G\xc02\xa0\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07overall\x94h!]\x94(KoKvKeKrKaKlKleh#G\xc02\xe0\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04apro\x94h!]\x94(KaKpKrKoeh#G\xc03\xe0\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\rapproximately\x94h!]\x94(KaKpKpKrKoKxKiKmKaKtKeKlKyeh#G\xc04@\x075~g\x0euh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01i\x94h!]\x94Kiah#G\xbe\xaa~\xe0\xee\xab\x86\xb2h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1fjA\x02\x00\x00h!]\x94Kiah#G\xbe\xaa~\xe0\xee\xab\x86\xb2uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06iation\x94h!]\x94(KiKaKtKiKoKneh#G\xc0.\xc0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03iat\x94h!]\x94(KiKaKteh#G\xc0.\xc0\x00!\x8d\xefAuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07ateness\x94h!]\x94(KaKtKeKnKeKsKseh#G\xc00 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04iten\x94h!]\x94(KiKtKeKneh#G\xc00`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04iann\x94h!]\x94(KiKaKnKneh#G\xc01\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\t appropri\x94h!]\x94(K 
KaKpKpKrKoKpKrKieh#G\xc01\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02ri\x94h!]\x94(KrKieh#G\xc01\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06iately\x94h!]\x94(KiKaKtKeKlKyeh#G\xc01\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05laten\x94h!]\x94(KlKaKtKeKneh#G\xc01\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07ateness\x94h!]\x94(KaKtKeKnKeKsKseh#G\xbe\x89\xfcz\xe12u\x9dh$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07ateness\x94h!]\x94(KaKtKeKnKeKsKseh#G\xbe\x89\xfcz\xe12u\x9duh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04aten\x94h!]\x94(KaKtKeKneh#G\xc0/@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05ensen\x94h!]\x94(KeKnKsKeKneh#G\xc05@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04ated\x94h!]\x94(KaKtKeKdeh#G\xc06 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06teness\x94h!]\x94(KtKeKnKeKsKseh#G\xc06@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04ates\x94h!]\x94(KaKtKeKseh#G\xc06`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05eness\x94h!]\x94(KeKnKeKsKseh#G\xc06\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04onen\x94h!]\x94(KoKnKeKneh#G\xc06\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04uten\x94h!]\x94(KuKtKeKneh#G\xc07\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06enness\x94h!]\x94(KeKnKnKeKsKseh#G\xc07\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":\"\x94h!]\x94(K\"K:K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":\"\x94h!]\x94(K\"K:K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\":\"'\x94h!]\x94(K\"K:K\"K'eh#G\xc02\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04 \":\"\x94h!]\x94(K K\"K:K\"eh#G\xc04 
\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06\":\"\",\"\x94h!]\x94(K\"K:K\"K\"K,K\"eh#G\xc04\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\":[\"\x94h!]\x94(K\"K:K[K\"eh#G\xc05\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07<|end|>\x94h!Nh#G\xc05\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\":\"+\x94h!]\x94(K\"K:K\"K+eh#G\xc05\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\":{\"\x94h!]\x94(K\"K:K{K\"eh#G\xc06@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03':'\x94h!]\x94(K'K:K'eh#G\xc06\xf0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\xc07\xf0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04pass\x94h!]\x94(KpKaKsKseh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04pass\x94h!]\x94(KpKaKsKseh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05 pass\x94h!]\x94(K KpKaKsKseh#G\xc03 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04fail\x94h!]\x94(KfKaKiKleh#G\xc07\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03pas\x94h!]\x94(KpKaKseh#G\xc08\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05.pass\x94h!]\x94(K.KpKaKsKseh#G\xc08\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04Pass\x94h!]\x94(KPKaKsKseh#G\xc09\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04PASS\x94h!]\x94(KPKAKSKSeh#G\xc09 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06passed\x94h!]\x94(KpKaKsKsKeKdeh#G\xc09\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05-pass\x94h!]\x94(K-KpKaKsKseh#G\xc09\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06passes\x94h!]\x94(KpKaKsKsKeKseh#G\xc0: \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\",\"\x94h!]\x94(K\"K,K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\",\"\x94h!]\x94(K\"K,K\"eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04 \",\"\x94h!]\x94(K K\"K,K\"eh#G\xc02\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\"\x94h!]\x94(K,K\"eh#G\xc04\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04.\",\"\x94h!]\x94(K.K\"K,K\"eh#G\xc04@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07<|end|>\x94h!Nh#G\xc05\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03','\x94h!]\x94(K'K,K'eh#G\xc06 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\",\"#\x94h!]\x94(K\"K,K\"K#eh#G\xc07 
\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\",\"+\x94h!]\x94(K\"K,K\"K+eh#G\xc07\xf0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05\\\",\\\"\x94h!]\x94(K\\K\"K,K\\K\"eh#G\xc08@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\",\"\\\x94h!]\x94(K\"K,K\"K\\eh#G\xc08\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07overall\x94h!]\x94(KoKvKeKrKaKlKleh#G\xbe\x89\xfcz\xe12u\x9dh$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07overall\x94h!]\x94(KoKvKeKrKaKlKleh#G\xbe\x89\xfcz\xe12u\x9duh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07Overall\x94h!]\x94(KOKvKeKrKaKlKleh#G\xc00\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x08 overall\x94h!]\x94(K KoKvKeKrKaKlKleh#G\xc02@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01c\x94h!]\x94Kcah#G\xc06\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x08overview\x94h!]\x94(KoKvKeKrKvKiKeKweh#G\xc08\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05total\x94h!]\x94(KtKoKtKaKleh#G\xc08@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04over\x94h!]\x94(KoKvKeKreh#G\xc08\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x08 Overall\x94h!]\x94(K KOKvKeKrKaKlKleh#G\xc09 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06\xe6\x95\xb4\xe4\xbd\x93\x94h!]\x94(K\xe6K\x95K\xb4K\xe4K\xbdK\x93eh#G\xc09`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05polit\x94h!]\x94(KpKoKlKiKteh#G\xc0:\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07_rating\x94h!]\x94(K_KrKaKtKiKnKgeh#G\xbe\x94\xfe$\xc4\xceLIh$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07_rating\x94h!]\x94(K_KrKaKtKiKnKgeh#G\xbe\x94\xfe$\xc4\xceLIuh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07 rating\x94h!]\x94(K KrKaKtKiKnKgeh#G\xc0/@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06rating\x94h!]\x94(KrKaKtKiKnKgeh#G\xc01\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07 Rating\x94h!]\x94(K KRKaKtKiKnKgeh#G\xc01\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06Rating\x94h!]\x94(KRKaKtKiKnKgeh#G\xc01\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07-rating\x94h!]\x94(K-KrKaKtKiKnKgeh#G\xc01\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07.rating\x94h!]\x94(K.KrKaKtKiKnKgeh#G\xc02\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05_rate\x94h!]\x94(K_KrKaKtKeeh#G\xc03\x80\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\t_rotation\x94h!]\x94(K_KrKoKtKaKtKiKoKneh#G\xc04 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02_r\x94h!]\x94(K_Kreh#G\xc04 
\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\":\x94h!]\x94(K\"K:eh#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04\xe2\x80\x9d:\x94h!]\x94(K\xe2K\x80K\x9dK:eh#G\xc04\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\\\":\x94h!]\x94(K\\K\"K:eh#G\xc04\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02':\x94h!]\x94(K'K:eh#G\xc05@\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":\"\x94h!]\x94(K\"K:K\"eh#G\xc06\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07<|end|>\x94h!Nh#G\xc06\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x06\xe2\x80\x9d\xef\xbc\x9a\x94h!]\x94(K\xe2K\x80K\x9dK\xefK\xbcK\x9aeh#G\xc07\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02`:\x94h!]\x94(K`K:eh#G\xc07\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03\":[\x94h!]\x94(K\"K:K[eh#G\xc08\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03 \":\x94h!]\x94(K K\"K:eh#G\xc08 \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1fje\x01\x00\x00h!]\x94K4ah#G\xbfdI\x15\x1e\x7f\x84\xe1h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1fje\x01\x00\x00h!]\x94K4ah#G\xbfdI\x15\x1e\x7f\x84\xe1uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fjs\x01\x00\x00h!]\x94K3ah#G\xc0\x18\x02\x89\x11\x8c\x19~uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fjY\x01\x00\x00h!]\x94K5ah#G\xc0,\x81D\xaaS\xfc\x01uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fjl\x01\x00\x00h!]\x94K ah#G\xc05\x10\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x012\x94h!]\x94K2ah#G\xc070\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fj\x81\x01\x00\x00h!]\x94K-ah#G\xc08\xd0\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02\n\n\x94h!]\x94(K\nK\neh#G\xc09\x80\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fh_h!]\x94K\nah#G\xc09\xc0\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02 \x94h!]\x94(K K eh#G\xc09\xf0\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fj\x88\x01\x00\x00h!]\x94Kfah#G\xc0:0\xa2Dc\x06`uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nubh\x1b)\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x01}\x94h!]\x94K}ah#G\x00\x00\x00\x00\x00\x00\x00\x00h$]\x94(h')\x81\x94}\x94(h\x15}\x94(h\x1fj\xf3\x04\x00\x00h!]\x94K}ah#G\x00\x00\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02 }\x94h!]\x94(K 
K}eh#G\xc01\xe0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02,\"\x94h!]\x94(K,K\"eh#G\xc05`\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x02}\n\x94h!]\x94(K}K\neh#G\xc07\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03}\n\n\x94h!]\x94(K}K\nK\neh#G\xc08\xc0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1fj\xea\x01\x00\x00h!]\x94K.ah#G\xc0:\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x03}\r\n\x94h!]\x94(K}K\rK\neh#G\xc0; \x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x05}\r\n\r\n\x94h!]\x94(K}K\rK\nK\rK\neh#G\xc0=\x90\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x04}\n\n\n\x94h!]\x94(K}K\nK\nK\neh#G\xc0=\xa0\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubh')\x81\x94}\x94(h\x15}\x94(h\x1f\x8c\x07}\n\n\n\n\n\n\x94h!]\x94(K}K\nK\nK\nK\nK\nK\neh#G\xc0>\x00\x00\x00\x00\x00\x00uh-}\x94h/\x8f\x94(h\x1fh#h!\x90h1Nubeuh-}\x94h/\x8f\x94(h\x1fh#h!h$\x90h1Nube\x8c\x07refusal\x94Nuh-}\x94h/\x8f\x94(h\x17j<\x05\x00\x00\x90h1Nubub." diff --git a/libs/core/kiln_ai/adapters/ml_model_list.py b/libs/core/kiln_ai/adapters/ml_model_list.py index 08638982..9c8b71fd 100644 --- a/libs/core/kiln_ai/adapters/ml_model_list.py +++ b/libs/core/kiln_ai/adapters/ml_model_list.py @@ -43,6 +43,7 @@ class ModelFamily(str, Enum): mixtral = "mixtral" qwen = "qwen" deepseek = "deepseek" + dolphin = "dolphin" # Where models have instruct and raw versions, instruct is default and raw is specified @@ -90,6 +91,7 @@ class ModelName(str, Enum): deepseek_r1_distill_qwen_1p5b = "deepseek_r1_distill_qwen_1p5b" deepseek_r1_distill_qwen_7b = "deepseek_r1_distill_qwen_7b" deepseek_r1_distill_llama_8b = "deepseek_r1_distill_llama_8b" + dolphin_2_9_8x22b = "dolphin_2_9_8x22b" class ModelParserID(str, Enum): @@ -125,6 +127,7 @@ class KilnModelProvider(BaseModel): structured_output_mode: StructuredOutputMode = StructuredOutputMode.default parser: ModelParserID | None = None reasoning_capable: bool = False + supports_logprobs: bool = False class KilnModel(BaseModel): @@ -157,11 +160,13 @@ class KilnModel(BaseModel): provider_options={"model": "gpt-4o-mini"}, provider_finetune_id="gpt-4o-mini-2024-07-18", structured_output_mode=StructuredOutputMode.json_schema, + supports_logprobs=True, ), KilnModelProvider( name=ModelProviderName.openrouter, provider_options={"model": "openai/gpt-4o-mini"}, structured_output_mode=StructuredOutputMode.json_schema, + supports_logprobs=True, ), ], ), @@ -176,11 +181,13 @@ class KilnModel(BaseModel): provider_options={"model": "gpt-4o"}, provider_finetune_id="gpt-4o-2024-08-06", structured_output_mode=StructuredOutputMode.json_schema, + supports_logprobs=True, ), KilnModelProvider( name=ModelProviderName.openrouter, provider_options={"model": "openai/gpt-4o"}, structured_output_mode=StructuredOutputMode.json_schema, + supports_logprobs=True, ), ], ), @@ -192,7 +199,7 @@ class KilnModel(BaseModel): providers=[ KilnModelProvider( name=ModelProviderName.openrouter, - structured_output_mode=StructuredOutputMode.function_calling, + structured_output_mode=StructuredOutputMode.json_instruction_and_object, provider_options={"model": "anthropic/claude-3-5-haiku"}, ), ], @@ -205,7 +212,7 @@ class KilnModel(BaseModel): providers=[ KilnModelProvider( 
name=ModelProviderName.openrouter, - structured_output_mode=StructuredOutputMode.function_calling, + structured_output_mode=StructuredOutputMode.json_instruction_and_object, provider_options={"model": "anthropic/claude-3.5-sonnet"}, ), ], @@ -416,8 +423,10 @@ class KilnModel(BaseModel): KilnModelProvider( name=ModelProviderName.openrouter, supports_data_gen=False, - structured_output_mode=StructuredOutputMode.function_calling, + # OpenRouter issue: passing "strict=True" on the function call breaks logprobs, so use the weak function-calling mode here. + structured_output_mode=StructuredOutputMode.function_calling_weak, provider_options={"model": "meta-llama/llama-3.1-70b-instruct"}, + supports_logprobs=True, ), KilnModelProvider( name=ModelProviderName.ollama, @@ -985,4 +994,26 @@ class KilnModel(BaseModel): ), ], ), + # Dolphin 2.9 Mixtral 8x22B + KilnModel( + family=ModelFamily.dolphin, + name=ModelName.dolphin_2_9_8x22b, + friendly_name="Dolphin 2.9 8x22B", + providers=[ + KilnModelProvider( + name=ModelProviderName.ollama, + structured_output_mode=StructuredOutputMode.json_schema, + supports_data_gen=True, + provider_options={"model": "dolphin-mixtral:8x22b"}, + ), + KilnModelProvider( + name=ModelProviderName.openrouter, + provider_options={ + "model": "cognitivecomputations/dolphin-mixtral-8x22b" + }, + supports_data_gen=True, + structured_output_mode=StructuredOutputMode.json_instruction_and_object, + ), + ], + ), ] diff --git a/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py index 9ae8f9a2..40b60649 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/base_adapter.py @@ -5,7 +5,7 @@ from kiln_ai.adapters.ml_model_list import KilnModelProvider, StructuredOutputMode from kiln_ai.adapters.parsers.parser_registry import model_parser_from_id -from kiln_ai.adapters.prompt_builders import BasePromptBuilder, SimplePromptBuilder +from kiln_ai.adapters.prompt_builders import prompt_builder_from_id from kiln_ai.adapters.provider_tools import kiln_model_provider_from from kiln_ai.adapters.run_output import RunOutput from kiln_ai.datamodel import ( @@ -16,16 +16,21 @@ TaskRun, ) from kiln_ai.datamodel.json_schema import validate_schema +from kiln_ai.datamodel.task import RunConfig from kiln_ai.utils.config import Config @dataclass -class AdapterInfo: - adapter_name: str - model_name: str - model_provider: str - prompt_builder_name: str - prompt_id: str | None = None +class AdapterConfig: + """ + An adapter config holds options that do NOT impact the output of the model. + + For example: if it's saved, or if we request additional data like logprobs. + """ + + allow_saving: bool = True + top_logprobs: int | None = None + default_tags: list[str] | None = None COT_FINAL_ANSWER_PROMPT = "Considering the above, return a final result."
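To make the adapter refactor above concrete, here is a minimal usage sketch assembled only from the signatures introduced in this diff (AdapterConfig, the RunConfig-based BaseAdapter, invoke_returning_run_output, and RunOutput.output_logprobs). It is an illustration, not part of the change: the API key, base URL, model id and input are placeholder assumptions, OpenAICompatibleConfig's optional fields are assumed to have usable defaults, and the task is assumed to have no input schema.

# Illustrative sketch only -- values below are placeholders, not part of this diff.
from kiln_ai import datamodel
from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig
from kiln_ai.adapters.model_adapters.openai_compatible_config import OpenAICompatibleConfig
from kiln_ai.adapters.model_adapters.openai_model_adapter import OpenAICompatibleAdapter


async def run_with_logprobs(task: datamodel.Task):
    adapter = OpenAICompatibleAdapter(
        config=OpenAICompatibleConfig(
            api_key="sk-placeholder",  # placeholder credential
            base_url="https://openrouter.ai/api/v1",  # placeholder endpoint
            model_name="gpt_4o_mini",  # a model whose OpenRouter provider sets supports_logprobs=True
            provider_name="openrouter",
        ),
        kiln_task=task,
        prompt_id="simple_prompt_builder",
        # AdapterConfig carries options that do not change the model output:
        # skip autosaving this run and request the top 10 logprobs per token.
        base_adapter_config=AdapterConfig(allow_saving=False, top_logprobs=10),
    )
    # Returns both the TaskRun and the raw RunOutput; RunOutput.output_logprobs
    # is populated when top_logprobs was requested.
    run, run_output = await adapter.invoke_returning_run_output("Tell me a joke.")
    return run, run_output.output_logprobs

Note that the Langchain adapter raises if top_logprobs is set, so logprob-dependent callers are expected to go through the OpenAI-compatible adapter.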
@@ -47,35 +52,36 @@ class BaseAdapter(metaclass=ABCMeta): def __init__( self, - kiln_task: Task, - model_name: str, - model_provider_name: str, - prompt_builder: BasePromptBuilder | None = None, - tags: list[str] | None = None, + run_config: RunConfig, + config: AdapterConfig | None = None, ): - self.prompt_builder = prompt_builder or SimplePromptBuilder(kiln_task) - self.kiln_task = kiln_task - self.output_schema = self.kiln_task.output_json_schema - self.input_schema = self.kiln_task.input_json_schema - self.default_tags = tags - self.model_name = model_name - self.model_provider_name = model_provider_name + self.run_config = run_config + self.prompt_builder = prompt_builder_from_id( + run_config.prompt_id, run_config.task + ) self._model_provider: KilnModelProvider | None = None + self.output_schema = self.task().output_json_schema + self.input_schema = self.task().input_json_schema + self.base_adapter_config = config or AdapterConfig() + + def task(self) -> Task: + return self.run_config.task + def model_provider(self) -> KilnModelProvider: """ Lazy load the model provider for this adapter. """ if self._model_provider is not None: return self._model_provider - if not self.model_name or not self.model_provider_name: + if not self.run_config.model_name or not self.run_config.model_provider_name: raise ValueError("model_name and model_provider_name must be provided") self._model_provider = kiln_model_provider_from( - self.model_name, self.model_provider_name + self.run_config.model_name, self.run_config.model_provider_name ) if not self._model_provider: raise ValueError( - f"model_provider_name {self.model_provider_name} not found for model {self.model_name}" + f"model_provider_name {self.run_config.model_provider_name} not found for model {self.run_config.model_name}" ) return self._model_provider @@ -85,7 +91,7 @@ async def invoke_returning_raw( input_source: DataSource | None = None, ) -> Dict | str: result = await self.invoke(input, input_source) - if self.kiln_task.output_json_schema is None: + if self.task().output_json_schema is None: return result.output.output else: return json.loads(result.output.output) @@ -95,6 +101,14 @@ async def invoke( input: Dict | str, input_source: DataSource | None = None, ) -> TaskRun: + run_output, _ = await self.invoke_returning_run_output(input, input_source) + return run_output + + async def invoke_returning_run_output( + self, + input: Dict | str, + input_source: DataSource | None = None, + ) -> Tuple[TaskRun, RunOutput]: # validate input if self.input_schema is not None: if not isinstance(input, dict): @@ -128,19 +142,23 @@ async def invoke( run = self.generate_run(input, input_source, parsed_output) # Save the run if configured to do so, and we have a path to save to - if Config.shared().autosave_runs and self.kiln_task.path is not None: + if ( + self.base_adapter_config.allow_saving + and Config.shared().autosave_runs + and self.task().path is not None + ): run.save_to_file() else: # Clear the ID to indicate it's not persisted run.id = None - return run + return run, run_output def has_structured_output(self) -> bool: return self.output_schema is not None @abstractmethod - def adapter_info(self) -> AdapterInfo: + def adapter_name(self) -> str: pass @abstractmethod @@ -203,7 +221,7 @@ def generate_run( ) new_task_run = TaskRun( - parent=self.kiln_task, + parent=self.task(), input=input_str, input_source=input_source, output=TaskOutput( @@ -215,7 +233,7 @@ def generate_run( ), ), intermediate_outputs=run_output.intermediate_outputs, - 
tags=self.default_tags or [], + tags=self.base_adapter_config.default_tags or [], ) return new_task_run @@ -224,12 +242,9 @@ def _properties_for_task_output(self) -> Dict[str, str | int | float]: props = {} # adapter info - adapter_info = self.adapter_info() - props["adapter_name"] = adapter_info.adapter_name - props["model_name"] = adapter_info.model_name - props["model_provider"] = adapter_info.model_provider - props["prompt_builder_name"] = adapter_info.prompt_builder_name - if adapter_info.prompt_id is not None: - props["prompt_id"] = adapter_info.prompt_id + props["adapter_name"] = self.adapter_name() + props["model_name"] = self.run_config.model_name + props["model_provider"] = self.run_config.model_provider_name + props["prompt_id"] = self.run_config.prompt_id return props diff --git a/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py b/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py index 3aaa4513..9d19a32b 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py +++ b/libs/core/kiln_ai/adapters/model_adapters/langchain_adapters.py @@ -20,9 +20,8 @@ ) from kiln_ai.adapters.model_adapters.base_adapter import ( COT_FINAL_ANSWER_PROMPT, - AdapterInfo, + AdapterConfig, BaseAdapter, - BasePromptBuilder, RunOutput, ) from kiln_ai.adapters.ollama_tools import ( @@ -30,6 +29,8 @@ ollama_base_url, ollama_model_installed, ) +from kiln_ai.datamodel import PromptId +from kiln_ai.datamodel.task import RunConfig from kiln_ai.utils.config import Config from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error @@ -45,8 +46,8 @@ def __init__( custom_model: BaseChatModel | None = None, model_name: str | None = None, provider: str | None = None, - prompt_builder: BasePromptBuilder | None = None, - tags: list[str] | None = None, + prompt_id: PromptId | None = None, + base_adapter_config: AdapterConfig | None = None, ): if custom_model is not None: self._model = custom_model @@ -78,12 +79,16 @@ def __init__( if model_name is None: raise ValueError("model_name must be provided") - super().__init__( - kiln_task, + run_config = RunConfig( + task=kiln_task, model_name=model_name, model_provider_name=provider, - prompt_builder=prompt_builder, - tags=tags, + prompt_id=prompt_id or datamodel.PromptGenerators.SIMPLE, + ) + + super().__init__( + run_config=run_config, + config=base_adapter_config, ) async def model(self) -> LangChainModelType: @@ -111,15 +116,15 @@ async def model(self) -> LangChainModelType: f"model {self._model} does not support structured output, cannot use output_json_schema" ) # Langchain expects title/description to be at top level, on top of json schema - output_schema = self.kiln_task.output_schema() + output_schema = self.task().output_schema() if output_schema is None: raise ValueError( - f"output_json_schema is not valid json: {self.kiln_task.output_json_schema}" + f"output_json_schema is not valid json: {self.task().output_json_schema}" ) output_schema["title"] = "task_response" output_schema["description"] = "A response from the task" with_structured_output_options = self.get_structured_output_options( - self.model_name, self.model_provider_name + self.run_config.model_name, self.run_config.model_provider_name ) self._model = self._model.with_structured_output( output_schema, @@ -129,6 +134,11 @@ async def model(self) -> LangChainModelType: return self._model async def _run(self, input: Dict | str) -> RunOutput: + if self.base_adapter_config.top_logprobs is not None: + raise ValueError( + "Kiln's Langchain adapter does 
not support logprobs/top_logprobs. Select a model from an OpenAI compatible provider (openai, openrouter, etc) instead." + ) + provider = self.model_provider() model = await self.model() chain = model @@ -191,14 +201,8 @@ async def _run(self, input: Dict | str) -> RunOutput: intermediate_outputs=intermediate_outputs, ) - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - model_name=self.model_name, - model_provider=self.model_provider_name, - adapter_name="kiln_langchain_adapter", - prompt_builder_name=self.prompt_builder.__class__.prompt_builder_name(), - prompt_id=self.prompt_builder.prompt_id(), - ) + def adapter_name(self) -> str: + return "kiln_langchain_adapter" def _munge_response(self, response: Dict) -> Dict: # Mistral Large tool calling format is a bit different. Convert to standard format. @@ -220,6 +224,9 @@ def get_structured_output_options( options = {} # We may need to add some provider specific logic here if providers use different names for the same mode, but everyone is copying openai for now match provider.structured_output_mode: + case StructuredOutputMode.function_calling_weak: + # Langchaing doesn't handle weak/strict separately + options["method"] = "function_calling" case StructuredOutputMode.function_calling: options["method"] = "function_calling" case StructuredOutputMode.json_mode: @@ -246,7 +253,7 @@ def get_structured_output_options( async def langchain_model_from(self) -> BaseChatModel: provider = self.model_provider() - return await langchain_model_from_provider(provider, self.model_name) + return await langchain_model_from_provider(provider, self.run_config.model_name) async def langchain_model_from_provider( diff --git a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py index f66526aa..4069c320 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/openai_model_adapter.py @@ -9,18 +9,19 @@ ) import kiln_ai.datamodel as datamodel -from kiln_ai.adapters.ml_model_list import StructuredOutputMode +from kiln_ai.adapters.ml_model_list import ModelProviderName, StructuredOutputMode from kiln_ai.adapters.model_adapters.base_adapter import ( COT_FINAL_ANSWER_PROMPT, - AdapterInfo, + AdapterConfig, BaseAdapter, - BasePromptBuilder, RunOutput, ) from kiln_ai.adapters.model_adapters.openai_compatible_config import ( OpenAICompatibleConfig, ) from kiln_ai.adapters.parsers.json_parser import parse_json_string +from kiln_ai.datamodel import PromptGenerators, PromptId +from kiln_ai.datamodel.task import RunConfig from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error @@ -29,8 +30,8 @@ def __init__( self, config: OpenAICompatibleConfig, kiln_task: datamodel.Task, - prompt_builder: BasePromptBuilder | None = None, - tags: list[str] | None = None, + prompt_id: PromptId | None = None, + base_adapter_config: AdapterConfig | None = None, ): self.config = config self.client = AsyncOpenAI( @@ -39,12 +40,16 @@ def __init__( default_headers=config.default_headers, ) - super().__init__( - kiln_task, + run_config = RunConfig( + task=kiln_task, model_name=config.model_name, model_provider_name=config.provider_name, - prompt_builder=prompt_builder, - tags=tags, + prompt_id=prompt_id or PromptGenerators.SIMPLE, + ) + + super().__init__( + run_config=run_config, + config=base_adapter_config, ) async def _run(self, input: Dict | str) -> RunOutput: @@ -93,7 +98,8 @@ async def _run(self, input: Dict | str) -> 
RunOutput: ] ) - # OpenRouter specific options for reasoning models + # OpenRouter specific options for reasoning models and logprobs. + # TODO: this isn't a good place for this and I should refactor. But big usability improvement so keeping it here for now. extra_body = {} require_or_reasoning = ( self.config.openrouter_style_reasoning and provider.reasoning_capable @@ -108,6 +114,15 @@ async def _run(self, input: Dict | str) -> RunOutput: # fp8 quants are awful "ignore": ["DeepInfra"], } + elif ( + self.run_config.model_provider_name == ModelProviderName.openrouter + and self.base_adapter_config.top_logprobs is not None + ): + # OpenRouter specific options related to logprobs. Bit of a hack but really does improve usability. + extra_body["provider"] = { + "require_parameters": True, + "ignore": ["DeepInfra"], + } # Main completion call response_format_options = await self.response_format_options() @@ -115,6 +130,8 @@ async def _run(self, input: Dict | str) -> RunOutput: model=provider.provider_options["model"], messages=messages, extra_body=extra_body, + logprobs=self.base_adapter_config.top_logprobs is not None, + top_logprobs=self.base_adapter_config.top_logprobs, **response_format_options, ) @@ -133,6 +150,11 @@ async def _run(self, input: Dict | str) -> RunOutput: ) message = response.choices[0].message + logprobs = response.choices[0].logprobs + + # Check logprobs worked, if requested + if self.base_adapter_config.top_logprobs is not None and logprobs is None: + raise RuntimeError("Logprobs were required, but no logprobs were returned.") # Save reasoning if it exists (OpenRouter specific format) if require_or_reasoning: @@ -164,26 +186,19 @@ async def _run(self, input: Dict | str) -> RunOutput: if not isinstance(response_content, str): raise RuntimeError(f"response is not a string: {response_content}") + # Parse to dict if we have structured output + output: Dict | str = response_content if self.has_structured_output(): - structured_response = parse_json_string(response_content) - return RunOutput( - output=structured_response, - intermediate_outputs=intermediate_outputs, - ) + output = parse_json_string(response_content) return RunOutput( - output=response_content, + output=output, intermediate_outputs=intermediate_outputs, + output_logprobs=logprobs, ) - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - model_name=self.model_name, - model_provider=self.model_provider_name, - adapter_name="kiln_openai_compatible_adapter", - prompt_builder_name=self.prompt_builder.__class__.prompt_builder_name(), - prompt_id=self.prompt_builder.prompt_id(), - ) + def adapter_name(self) -> str: + return "kiln_openai_compatible_adapter" async def response_format_options(self) -> dict[str, Any]: # Unstructured if task isn't structured @@ -195,7 +210,7 @@ async def response_format_options(self) -> dict[str, Any]: case StructuredOutputMode.json_mode: return {"response_format": {"type": "json_object"}} case StructuredOutputMode.json_schema: - output_schema = self.kiln_task.output_schema() + output_schema = self.task().output_schema() return { "response_format": { "type": "json_schema", @@ -205,8 +220,10 @@ async def response_format_options(self) -> dict[str, Any]: }, } } + case StructuredOutputMode.function_calling_weak: + return self.tool_call_params(strict=False) case StructuredOutputMode.function_calling: - return self.tool_call_params() + return self.tool_call_params(strict=True) case StructuredOutputMode.json_instructions: # JSON done via instructions in prompt, not the API response 
format. Do not ask for json_object (see option below). return {} @@ -215,28 +232,32 @@ async def response_format_options(self) -> dict[str, Any]: return {"response_format": {"type": "json_object"}} case StructuredOutputMode.default: # Default to function calling -- it's older than the other modes. Higher compatibility. - return self.tool_call_params() + return self.tool_call_params(strict=True) case _: raise_exhaustive_enum_error(provider.structured_output_mode) - def tool_call_params(self) -> dict[str, Any]: + def tool_call_params(self, strict: bool) -> dict[str, Any]: # Add additional_properties: false to the schema (OpenAI requires this for some models) - output_schema = self.kiln_task.output_schema() + output_schema = self.task().output_schema() if not isinstance(output_schema, dict): raise ValueError( "Invalid output schema for this task. Can not use tool calls." ) output_schema["additionalProperties"] = False + function_params = { + "name": "task_response", + "parameters": output_schema, + } + # This should be on, but we allow setting function_calling_weak for APIs that don't support it. + if strict: + function_params["strict"] = True + return { "tools": [ { "type": "function", - "function": { - "name": "task_response", - "parameters": output_schema, - "strict": True, - }, + "function": function_params, } ], "tool_choice": { diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py index c80c409a..8160294b 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_base_adapter.py @@ -3,8 +3,9 @@ import pytest from kiln_ai.adapters.ml_model_list import KilnModelProvider, StructuredOutputMode -from kiln_ai.adapters.model_adapters.base_adapter import AdapterInfo, BaseAdapter +from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter from kiln_ai.datamodel import Task +from kiln_ai.datamodel.task import RunConfig class MockAdapter(BaseAdapter): @@ -13,13 +14,8 @@ class MockAdapter(BaseAdapter): async def _run(self, input): return None - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - adapter_name="test", - model_name=self.model_name, - model_provider=self.model_provider_name, - prompt_builder_name="test", - ) + def adapter_name(self) -> str: + return "test" @pytest.fixture @@ -37,9 +33,12 @@ def base_task(): @pytest.fixture def adapter(base_task): return MockAdapter( - kiln_task=base_task, - model_name="test_model", - model_provider_name="test_provider", + run_config=RunConfig( + task=base_task, + model_name="test_model", + model_provider_name="test_provider", + prompt_id="simple_prompt_builder", + ), ) @@ -85,7 +84,12 @@ async def test_model_provider_missing_names(base_task): """Test error when model or provider name is missing""" # Test with missing model name adapter = MockAdapter( - kiln_task=base_task, model_name="", model_provider_name="test_provider" + run_config=RunConfig( + task=base_task, + model_name="", + model_provider_name="", + prompt_id="simple_prompt_builder", + ), ) with pytest.raises( ValueError, match="model_name and model_provider_name must be provided" @@ -94,7 +98,12 @@ async def test_model_provider_missing_names(base_task): # Test with missing provider name adapter = MockAdapter( - kiln_task=base_task, model_name="test_model", model_provider_name="" + run_config=RunConfig( + task=base_task, + model_name="test_model", + model_provider_name="", + prompt_id="simple_prompt_builder", + ), ) 
with pytest.raises( ValueError, match="model_name and model_provider_name must be provided" diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_langchain_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/test_langchain_adapter.py index 05006f3e..5a7dd705 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_langchain_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_langchain_adapter.py @@ -18,8 +18,8 @@ LangchainAdapter, langchain_model_from_provider, ) -from kiln_ai.adapters.prompt_builders import SimpleChainOfThoughtPromptBuilder from kiln_ai.adapters.test_prompt_adaptors import build_test_task +from kiln_ai.datamodel.task import RunConfig @pytest.fixture @@ -56,9 +56,8 @@ def test_langchain_adapter_infer_model_name(tmp_path): lca = LangchainAdapter(kiln_task=task, custom_model=custom) - model_info = lca.adapter_info() - assert model_info.model_name == "custom.langchain:llama-3.1-8b-instant" - assert model_info.model_provider == "custom.langchain:ChatGroq" + assert lca.run_config.model_name == "custom.langchain:llama-3.1-8b-instant" + assert lca.run_config.model_provider_name == "custom.langchain:ChatGroq" def test_langchain_adapter_info(tmp_path): @@ -66,10 +65,9 @@ def test_langchain_adapter_info(tmp_path): lca = LangchainAdapter(kiln_task=task, model_name="llama_3_1_8b", provider="ollama") - model_info = lca.adapter_info() - assert model_info.adapter_name == "kiln_langchain_adapter" - assert model_info.model_name == "llama_3_1_8b" - assert model_info.model_provider == "ollama" + assert lca.adapter_name() == "kiln_langchain_adapter" + assert lca.run_config.model_name == "llama_3_1_8b" + assert lca.run_config.model_provider_name == "ollama" async def test_langchain_adapter_with_cot(tmp_path): @@ -81,7 +79,7 @@ async def test_langchain_adapter_with_cot(tmp_path): kiln_task=task, model_name="llama_3_1_8b", provider="ollama", - prompt_builder=SimpleChainOfThoughtPromptBuilder(task), + prompt_id="simple_chain_of_thought_prompt_builder", ) # Mock the base model and its invoke method diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py b/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py index de45caf2..4f75c46f 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_openai_model_adapter.py @@ -5,7 +5,7 @@ from openai import AsyncOpenAI from kiln_ai.adapters.ml_model_list import StructuredOutputMode -from kiln_ai.adapters.model_adapters.base_adapter import AdapterInfo, BasePromptBuilder +from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig from kiln_ai.adapters.model_adapters.openai_compatible_config import ( OpenAICompatibleConfig, ) @@ -37,63 +37,47 @@ def mock_task(tmp_path): return task -@pytest.fixture -def mock_prompt_builder(): - builder = Mock(spec=BasePromptBuilder) - type(builder).prompt_builder_name = Mock(return_value="test_prompt_builder") - builder.prompt_id = Mock(return_value="test_prompt_id") - return builder - - @pytest.fixture def config(): return OpenAICompatibleConfig( api_key="test_key", base_url="https://api.test.com", model_name="test-model", - provider_name="test-provider", + provider_name="openrouter", default_headers={"X-Test": "test"}, ) -def test_initialization(config, mock_task, mock_prompt_builder): +def test_initialization(config, mock_task): adapter = OpenAICompatibleAdapter( config=config, kiln_task=mock_task, - prompt_builder=mock_prompt_builder, - tags=["test-tag"], + 
prompt_id="simple_prompt_builder", + base_adapter_config=AdapterConfig(default_tags=["test-tag"]), ) assert isinstance(adapter.client, AsyncOpenAI) assert adapter.config == config - assert adapter.kiln_task == mock_task - assert adapter.prompt_builder == mock_prompt_builder - assert adapter.default_tags == ["test-tag"] - assert adapter.model_name == config.model_name - assert adapter.model_provider_name == config.provider_name + assert adapter.run_config.task == mock_task + assert adapter.run_config.prompt_id == "simple_prompt_builder" + assert adapter.base_adapter_config.default_tags == ["test-tag"] + assert adapter.run_config.model_name == config.model_name + assert adapter.run_config.model_provider_name == config.provider_name -def test_adapter_info(config, mock_task, mock_prompt_builder): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +def test_adapter_info(config, mock_task): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) - info = adapter.adapter_info() - assert isinstance(info, AdapterInfo) - assert info.model_name == config.model_name - assert info.model_provider == config.provider_name - assert info.adapter_name == "kiln_openai_compatible_adapter" - assert info.prompt_builder_name == "base_prompt_builder" - assert info.prompt_id == "test_prompt_id" + assert adapter.adapter_name() == "kiln_openai_compatible_adapter" + + assert adapter.run_config.model_name == config.model_name + assert adapter.run_config.model_provider_name == config.provider_name + assert adapter.run_config.prompt_id == "simple_prompt_builder" @pytest.mark.asyncio -async def test_response_format_options_unstructured( - config, mock_task, mock_prompt_builder -): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +async def test_response_format_options_unstructured(config, mock_task): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) # Mock has_structured_output to return False with patch.object(adapter, "has_structured_output", return_value=False): @@ -109,12 +93,8 @@ async def test_response_format_options_unstructured( ], ) @pytest.mark.asyncio -async def test_response_format_options_json_mode( - config, mock_task, mock_prompt_builder, mode -): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +async def test_response_format_options_json_mode(config, mock_task, mode): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) with ( patch.object(adapter, "has_structured_output", return_value=True), @@ -134,12 +114,8 @@ async def test_response_format_options_json_mode( ], ) @pytest.mark.asyncio -async def test_response_format_options_function_calling( - config, mock_task, mock_prompt_builder, mode -): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +async def test_response_format_options_function_calling(config, mock_task, mode): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) with ( patch.object(adapter, "has_structured_output", return_value=True), @@ -153,12 +129,8 @@ async def test_response_format_options_function_calling( @pytest.mark.asyncio -async def test_response_format_options_json_instructions( - config, mock_task, mock_prompt_builder -): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +async def 
test_response_format_options_json_instructions(config, mock_task): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) with ( patch.object(adapter, "has_structured_output", return_value=True), @@ -172,12 +144,8 @@ async def test_response_format_options_json_instructions( @pytest.mark.asyncio -async def test_response_format_options_json_schema( - config, mock_task, mock_prompt_builder -): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +async def test_response_format_options_json_schema(config, mock_task): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) with ( patch.object(adapter, "has_structured_output", return_value=True), @@ -198,12 +166,35 @@ async def test_response_format_options_json_schema( } -def test_tool_call_params(config, mock_task, mock_prompt_builder): - adapter = OpenAICompatibleAdapter( - config=config, kiln_task=mock_task, prompt_builder=mock_prompt_builder - ) +def test_tool_call_params_weak(config, mock_task): + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) + + params = adapter.tool_call_params(strict=False) + expected_schema = mock_task.output_schema() + expected_schema["additionalProperties"] = False + + assert params == { + "tools": [ + { + "type": "function", + "function": { + "name": "task_response", + "parameters": expected_schema, + }, + } + ], + "tool_choice": { + "type": "function", + "function": {"name": "task_response"}, + }, + } + + +def test_tool_call_params_strict(config, mock_task): + config.provider_name = "openai" + adapter = OpenAICompatibleAdapter(config=config, kiln_task=mock_task) - params = adapter.tool_call_params() + params = adapter.tool_call_params(strict=True) expected_schema = mock_task.output_schema() expected_schema["additionalProperties"] = False diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py b/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py index 64a9b6fd..0c904507 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_saving_adapter_results.py @@ -3,7 +3,6 @@ import pytest from kiln_ai.adapters.model_adapters.base_adapter import ( - AdapterInfo, BaseAdapter, RunOutput, ) @@ -13,6 +12,7 @@ Project, Task, ) +from kiln_ai.datamodel.task import RunConfig from kiln_ai.utils.config import Config @@ -20,14 +20,8 @@ class MockAdapter(BaseAdapter): async def _run(self, input: dict | str) -> dict | str: return RunOutput(output="Test output", intermediate_outputs=None) - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - adapter_name="mock_adapter", - model_name="mock_model", - model_provider="mock_provider", - prompt_builder_name="mock_prompt_builder", - prompt_id="mock_prompt_id", - ) + def adapter_name(self) -> str: + return "mock_adapter" @pytest.fixture @@ -45,7 +39,14 @@ def test_task(tmp_path): @pytest.fixture def adapter(test_task): - return MockAdapter(test_task, model_name="phi_3_5", model_provider_name="ollama") + return MockAdapter( + run_config=RunConfig( + task=test_task, + model_name="phi_3_5", + model_provider_name="ollama", + prompt_id="simple_chain_of_thought_prompt_builder", + ), + ) def test_save_run_isolation(test_task, adapter): @@ -94,13 +95,12 @@ def test_save_run_isolation(test_task, adapter): assert reloaded_output.source.type == DataSourceType.synthetic assert reloaded_output.rating is None assert 
reloaded_output.source.properties["adapter_name"] == "mock_adapter" - assert reloaded_output.source.properties["model_name"] == "mock_model" - assert reloaded_output.source.properties["model_provider"] == "mock_provider" + assert reloaded_output.source.properties["model_name"] == "phi_3_5" + assert reloaded_output.source.properties["model_provider"] == "ollama" assert ( - reloaded_output.source.properties["prompt_builder_name"] - == "mock_prompt_builder" + reloaded_output.source.properties["prompt_id"] + == "simple_chain_of_thought_prompt_builder" ) - assert reloaded_output.source.properties["prompt_id"] == "mock_prompt_id" # Run again, with same input and different output. Should create a new TaskRun. different_run_output = RunOutput( output="Different output", intermediate_outputs=None @@ -118,7 +118,7 @@ def test_save_run_isolation(test_task, adapter): properties={ "model_name": "mock_model", "model_provider": "mock_provider", - "prompt_builder_name": "mock_prompt_builder", + "prompt_id": "mock_prompt_builder", "adapter_name": "mock_adapter", }, ), @@ -178,6 +178,25 @@ async def test_autosave_false(test_task, adapter): assert run.id is None +@pytest.mark.asyncio +async def test_autosave_true_with_disabled(test_task, adapter): + with patch("kiln_ai.utils.config.Config.shared") as mock_shared: + mock_config = mock_shared.return_value + mock_config.autosave_runs = True + mock_config.user_id = "test_user" + + input_data = "Test input" + + adapter.base_adapter_config.allow_saving = False + run = await adapter.invoke(input_data) + + # Check that no runs were saved + assert len(test_task.runs()) == 0 + + # Check that the run ID is not set + assert run.id is None + + @pytest.mark.asyncio async def test_autosave_true(test_task, adapter): with patch("kiln_ai.utils.config.Config.shared") as mock_shared: @@ -202,6 +221,9 @@ async def test_autosave_true(test_task, adapter): assert output.output == "Test output" assert output.source.type == DataSourceType.synthetic assert output.source.properties["adapter_name"] == "mock_adapter" - assert output.source.properties["model_name"] == "mock_model" - assert output.source.properties["model_provider"] == "mock_provider" - assert output.source.properties["prompt_builder_name"] == "mock_prompt_builder" + assert output.source.properties["model_name"] == "phi_3_5" + assert output.source.properties["model_provider"] == "ollama" + assert ( + output.source.properties["prompt_id"] + == "simple_chain_of_thought_prompt_builder" + ) diff --git a/libs/core/kiln_ai/adapters/model_adapters/test_structured_output.py b/libs/core/kiln_ai/adapters/model_adapters/test_structured_output.py index db6bf7c6..2cc2bcbb 100644 --- a/libs/core/kiln_ai/adapters/model_adapters/test_structured_output.py +++ b/libs/core/kiln_ai/adapters/model_adapters/test_structured_output.py @@ -2,8 +2,6 @@ from pathlib import Path from typing import Dict -import jsonschema -import jsonschema.exceptions import pytest import kiln_ai.datamodel as datamodel @@ -12,16 +10,13 @@ built_in_models, ) from kiln_ai.adapters.model_adapters.base_adapter import ( - AdapterInfo, BaseAdapter, RunOutput, ) from kiln_ai.adapters.ollama_tools import ollama_online -from kiln_ai.adapters.prompt_builders import ( - BasePromptBuilder, - SimpleChainOfThoughtPromptBuilder, -) from kiln_ai.adapters.test_prompt_adaptors import get_all_models_and_providers +from kiln_ai.datamodel import PromptId +from kiln_ai.datamodel.task import RunConfig from kiln_ai.datamodel.test_json_schema import json_joke_schema, json_triangle_schema @@ 
-39,9 +34,9 @@ async def test_structured_output_gpt_4o_mini(tmp_path): await run_structured_output_test(tmp_path, "gpt_4o_mini", "openai") -@pytest.mark.parametrize("model_name", ["llama_3_1_8b"]) +@pytest.mark.parametrize("model_name", ["llama_3_1_8b", "gemma_2_2b"]) @pytest.mark.ollama -async def test_structured_output_ollama_llama(tmp_path, model_name): +async def test_structured_output_ollama(tmp_path, model_name): if not await ollama_online(): pytest.skip("Ollama API not running. Expect it running on localhost:11434") await run_structured_output_test(tmp_path, model_name, "ollama") @@ -49,19 +44,21 @@ async def test_structured_output_ollama_llama(tmp_path, model_name): class MockAdapter(BaseAdapter): def __init__(self, kiln_task: datamodel.Task, response: Dict | str | None): - super().__init__(kiln_task, model_name="phi_3_5", model_provider_name="ollama") + super().__init__( + run_config=RunConfig( + task=kiln_task, + model_name="phi_3_5", + model_provider_name="ollama", + prompt_id="simple_chain_of_thought_prompt_builder", + ), + ) self.response = response async def _run(self, input: str) -> RunOutput: return RunOutput(output=self.response, intermediate_outputs=None) - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - adapter_name="mock_adapter", - model_name="mock_model", - model_provider="mock_provider", - prompt_builder_name="mock_prompt_builder", - ) + def adapter_name(self) -> str: + return "mock_adapter" async def test_mock_unstructred_response(tmp_path): @@ -204,15 +201,21 @@ async def run_structured_input_task( task: datamodel.Task, model_name: str, provider: str, - pb: BasePromptBuilder | None = None, + prompt_id: PromptId | None = None, ): a = adapter_for_task( - task, model_name=model_name, provider=provider, prompt_builder=pb + task, + model_name=model_name, + provider=provider, + prompt_id=prompt_id, ) with pytest.raises(ValueError): # not structured input in dictionary await a.invoke("a=1, b=2, c=3") - with pytest.raises(jsonschema.exceptions.ValidationError): + with pytest.raises( + ValueError, + match="This task requires a specific output schema. 
While the model produced JSON, that JSON didn't meet the schema.", + ): # invalid structured input await a.invoke({"a": 1, "b": 2, "d": 3}) @@ -229,13 +232,14 @@ async def run_structured_input_task( assert "[[equilateral]]" in response else: assert response["is_equilateral"] is True - adapter_info = a.adapter_info() + expected_pb_name = "simple_prompt_builder" - if pb is not None: - expected_pb_name = pb.__class__.prompt_builder_name() - assert adapter_info.prompt_builder_name == expected_pb_name - assert adapter_info.model_name == model_name - assert adapter_info.model_provider == provider + if prompt_id is not None: + expected_pb_name = prompt_id + assert a.run_config.prompt_id == expected_pb_name + + assert a.run_config.model_name == model_name + assert a.run_config.model_provider_name == provider @pytest.mark.paid @@ -257,8 +261,9 @@ async def test_all_built_in_models_structured_input( @pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers()) async def test_structured_input_cot_prompt_builder(tmp_path, model_name, provider_name): task = build_structured_input_test_task(tmp_path) - pb = SimpleChainOfThoughtPromptBuilder(task) - await run_structured_input_task(task, model_name, provider_name, pb) + await run_structured_input_task( + task, model_name, provider_name, "simple_chain_of_thought_prompt_builder" + ) @pytest.mark.paid @@ -302,5 +307,6 @@ async def test_structured_output_cot_prompt_builder( """ task.output_json_schema = json.dumps(triangle_schema) task.save_to_file() - pb = SimpleChainOfThoughtPromptBuilder(task) - await run_structured_input_task(task, model_name, provider_name, pb) + await run_structured_input_task( + task, model_name, provider_name, "simple_chain_of_thought_prompt_builder" + ) diff --git a/libs/core/kiln_ai/adapters/prompt_builders.py b/libs/core/kiln_ai/adapters/prompt_builders.py index 94fbdb59..b54d4832 100644 --- a/libs/core/kiln_ai/adapters/prompt_builders.py +++ b/libs/core/kiln_ai/adapters/prompt_builders.py @@ -2,8 +2,8 @@ from abc import ABCMeta, abstractmethod from typing import Dict -from kiln_ai.datamodel import Task, TaskRun -from kiln_ai.utils.formatting import snake_case +from kiln_ai.datamodel import PromptGenerators, PromptId, Task, TaskRun +from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error class BasePromptBuilder(metaclass=ABCMeta): @@ -53,17 +53,6 @@ def build_base_prompt(self) -> str: """ pass - @classmethod - def prompt_builder_name(cls) -> str: - """Returns the name of the prompt builder, to be used for persisting into the datastore. - - Default implementation gets the name of the prompt builder in snake case. If you change the class name, you should override this so prior saved data is compatible. - - Returns: - str: The prompt builder name in snake_case format. - """ - return snake_case(cls.__name__) - def build_user_message(self, input: Dict | str) -> str: """Build a user message from the input. @@ -300,6 +289,57 @@ def chain_of_thought_prompt(self) -> str | None: return self.prompt_model.chain_of_thought_instructions +class TaskRunConfigPromptBuilder(BasePromptBuilder): + """A prompt builder that looks up a static prompt in a task run config.""" + + def __init__(self, task: Task, run_config_prompt_id: str): + parts = run_config_prompt_id.split("::") + if len(parts) != 4: + raise ValueError( + f"Invalid task run config prompt ID: {run_config_prompt_id}. Expected format: 'task_run_config::[project_id]::[task_id]::[run_config_id]'." 
+ ) + + task_id = parts[2] + if task_id != task.id: + raise ValueError( + f"Task run config prompt ID: {run_config_prompt_id}. Task ID mismatch. Expected: {task.id}, got: {task_id}." + ) + + run_config_id = parts[3] + run_config = next( + ( + run_config + for run_config in task.run_configs(readonly=True) + if run_config.id == run_config_id + ), + None, + ) + if not run_config: + raise ValueError( + f"Task run config ID not found: {run_config_id} for prompt id {run_config_prompt_id}" + ) + if run_config.prompt is None: + raise ValueError( + f"Task run config ID {run_config_id} does not have a stored prompt. Used as prompt id {run_config_prompt_id}" + ) + + # Load the prompt from the model + self.prompt = run_config.prompt.prompt + self.cot_prompt = run_config.prompt.chain_of_thought_instructions + self.id = run_config_prompt_id + + super().__init__(task) + + def prompt_id(self) -> str | None: + return self.id + + def build_base_prompt(self) -> str: + return self.prompt + + def chain_of_thought_prompt(self) -> str | None: + return self.cot_prompt + + class FineTunePromptBuilder(BasePromptBuilder): """A prompt builder that looks up a fine-tune prompt.""" @@ -337,25 +377,12 @@ def chain_of_thought_prompt(self) -> str | None: return self.fine_tune_model.thinking_instructions -# TODO P2: we end up with 2 IDs for these: the keys here (ui_name) and the prompt_builder_name from the class -# We end up maintaining this in _prompt_generators as well. -prompt_builder_registry = { - "simple_prompt_builder": SimplePromptBuilder, - "multi_shot_prompt_builder": MultiShotPromptBuilder, - "few_shot_prompt_builder": FewShotPromptBuilder, - "repairs_prompt_builder": RepairsPromptBuilder, - "simple_chain_of_thought_prompt_builder": SimpleChainOfThoughtPromptBuilder, - "few_shot_chain_of_thought_prompt_builder": FewShotChainOfThoughtPromptBuilder, - "multi_shot_chain_of_thought_prompt_builder": MultiShotChainOfThoughtPromptBuilder, -} - - # Our UI has some names that are not the same as the class names, which also hint parameters. -def prompt_builder_from_ui_name(ui_name: str, task: Task) -> BasePromptBuilder: +def prompt_builder_from_id(prompt_id: PromptId, task: Task) -> BasePromptBuilder: """Convert a name used in the UI to the corresponding prompt builder class. Args: - ui_name (str): The UI name for the prompt builder type. + prompt_id (PromptId): The prompt ID. Returns: type[BasePromptBuilder]: The corresponding prompt builder class. 
@@ -365,29 +392,40 @@ def prompt_builder_from_ui_name(ui_name: str, task: Task) -> BasePromptBuilder: """ # Saved prompts are prefixed with "id::" - if ui_name.startswith("id::"): - prompt_id = ui_name[4:] + if prompt_id.startswith("id::"): + prompt_id = prompt_id[4:] return SavedPromptBuilder(task, prompt_id) + # Task run config prompts are prefixed with "task_run_config::" + # task_run_config::[project_id]::[task_id]::[run_config_id] + if prompt_id.startswith("task_run_config::"): + return TaskRunConfigPromptBuilder(task, prompt_id) + # Fine-tune prompts are prefixed with "fine_tune_prompt::" - if ui_name.startswith("fine_tune_prompt::"): - fine_tune_id = ui_name[18:] - return FineTunePromptBuilder(task, fine_tune_id) + if prompt_id.startswith("fine_tune_prompt::"): + prompt_id = prompt_id[18:] + return FineTunePromptBuilder(task, prompt_id) + + # Check if the prompt_id matches any enum value + if prompt_id not in [member.value for member in PromptGenerators]: + raise ValueError(f"Unknown prompt generator: {prompt_id}") + typed_prompt_generator = PromptGenerators(prompt_id) - match ui_name: - case "basic": + match typed_prompt_generator: + case PromptGenerators.SIMPLE: return SimplePromptBuilder(task) - case "few_shot": + case PromptGenerators.FEW_SHOT: return FewShotPromptBuilder(task) - case "many_shot": + case PromptGenerators.MULTI_SHOT: return MultiShotPromptBuilder(task) - case "repairs": + case PromptGenerators.REPAIRS: return RepairsPromptBuilder(task) - case "simple_chain_of_thought": + case PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT: return SimpleChainOfThoughtPromptBuilder(task) - case "few_shot_chain_of_thought": + case PromptGenerators.FEW_SHOT_CHAIN_OF_THOUGHT: return FewShotChainOfThoughtPromptBuilder(task) - case "multi_shot_chain_of_thought": + case PromptGenerators.MULTI_SHOT_CHAIN_OF_THOUGHT: return MultiShotChainOfThoughtPromptBuilder(task) case _: - raise ValueError(f"Unknown prompt builder: {ui_name}") + # Type checking will find missing cases + raise_exhaustive_enum_error(typed_prompt_generator) diff --git a/libs/core/kiln_ai/adapters/repair/repair_task.py b/libs/core/kiln_ai/adapters/repair/repair_task.py index 43690935..6163a62b 100644 --- a/libs/core/kiln_ai/adapters/repair/repair_task.py +++ b/libs/core/kiln_ai/adapters/repair/repair_task.py @@ -6,7 +6,7 @@ from kiln_ai.adapters.prompt_builders import ( BasePromptBuilder, SavedPromptBuilder, - prompt_builder_registry, + prompt_builder_from_id, ) from kiln_ai.datamodel import Priority, Project, Task, TaskRequirement, TaskRun @@ -49,28 +49,16 @@ def _original_prompt(cls, run: TaskRun, task: Task) -> str: if run.output.source is None or run.output.source.properties is None: raise ValueError("No source properties found") - # Try ID first, then builder name - prompt_id = run.output.source.properties.get("prompt_id", None) + # Get the prompt builder id. Need the second check because we used to store this in a prompt_builder_name field, so loading legacy runs will need this. 
+ prompt_id = run.output.source.properties.get( + "prompt_id" + ) or run.output.source.properties.get("prompt_builder_name", None) if prompt_id is not None and isinstance(prompt_id, str): - static_prompt_builder = SavedPromptBuilder(task, prompt_id) - return static_prompt_builder.build_prompt(include_json_instructions=False) + prompt_builder = prompt_builder_from_id(prompt_id, task) + if isinstance(prompt_builder, BasePromptBuilder): + return prompt_builder.build_prompt(include_json_instructions=False) - prompt_builder_class: Type[BasePromptBuilder] | None = None - prompt_builder_name = run.output.source.properties.get( - "prompt_builder_name", None - ) - if prompt_builder_name is not None and isinstance(prompt_builder_name, str): - prompt_builder_class = prompt_builder_registry.get( - prompt_builder_name, None - ) - if prompt_builder_class is None: - raise ValueError(f"No prompt builder found for name: {prompt_builder_name}") - prompt_builder = prompt_builder_class(task=task) - if not isinstance(prompt_builder, BasePromptBuilder): - raise ValueError( - f"Prompt builder {prompt_builder_name} is not a valid prompt builder" - ) - return prompt_builder.build_prompt(include_json_instructions=False) + raise ValueError(f"Prompt builder '{prompt_id}' is not a valid prompt builder") @classmethod def build_repair_task_input( diff --git a/libs/core/kiln_ai/adapters/repair/test_repair_task.py b/libs/core/kiln_ai/adapters/repair/test_repair_task.py index 9c63d974..2d7d261f 100644 --- a/libs/core/kiln_ai/adapters/repair/test_repair_task.py +++ b/libs/core/kiln_ai/adapters/repair/test_repair_task.py @@ -95,7 +95,7 @@ def sample_task_run(sample_task): "model_name": "gpt_4o", "model_provider": "openai", "adapter_name": "langchain_adapter", - "prompt_builder_name": "simple_prompt_builder", + "prompt_id": "simple_prompt_builder", }, ), ), @@ -201,7 +201,7 @@ async def test_live_run(sample_task, sample_task_run, sample_repair_data): "adapter_name": "kiln_langchain_adapter", "model_name": "llama_3_1_8b", "model_provider": "groq", - "prompt_builder_name": "simple_prompt_builder", + "prompt_id": "simple_prompt_builder", } @@ -238,7 +238,7 @@ async def test_mocked_repair_task_run(sample_task, sample_task_run, sample_repai "adapter_name": "kiln_langchain_adapter", "model_name": "llama_3_1_8b", "model_provider": "ollama", - "prompt_builder_name": "simple_prompt_builder", + "prompt_id": "simple_prompt_builder", } assert run.input_source.type == DataSourceType.human assert "created_by" in run.input_source.properties diff --git a/libs/core/kiln_ai/adapters/run_output.py b/libs/core/kiln_ai/adapters/run_output.py index 7c34cae6..e407ac15 100644 --- a/libs/core/kiln_ai/adapters/run_output.py +++ b/libs/core/kiln_ai/adapters/run_output.py @@ -1,8 +1,11 @@ from dataclasses import dataclass from typing import Dict +from openai.types.chat.chat_completion import ChoiceLogprobs + @dataclass class RunOutput: output: Dict | str intermediate_outputs: Dict[str, str] | None + output_logprobs: ChoiceLogprobs | None = None diff --git a/libs/core/kiln_ai/adapters/test_adapter_registry.py b/libs/core/kiln_ai/adapters/test_adapter_registry.py index 6a70d11b..2fa55227 100644 --- a/libs/core/kiln_ai/adapters/test_adapter_registry.py +++ b/libs/core/kiln_ai/adapters/test_adapter_registry.py @@ -5,6 +5,7 @@ from kiln_ai import datamodel from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.ml_model_list import ModelProviderName +from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig from 
kiln_ai.adapters.model_adapters.langchain_adapters import LangchainAdapter from kiln_ai.adapters.model_adapters.openai_model_adapter import OpenAICompatibleAdapter from kiln_ai.adapters.prompt_builders import BasePromptBuilder @@ -84,24 +85,19 @@ def test_langchain_adapter_creation(mock_config, basic_task, provider): ) assert isinstance(adapter, LangchainAdapter) - assert adapter.model_name == "test-model" + assert adapter.run_config.model_name == "test-model" # TODO should run for all cases def test_custom_prompt_builder(mock_config, basic_task): - class TestPromptBuilder(BasePromptBuilder): - def build_base_prompt(self, kiln_task) -> str: - return "test-prompt" - - prompt_builder = TestPromptBuilder(basic_task) adapter = adapter_for_task( kiln_task=basic_task, model_name="gpt-4", provider=ModelProviderName.openai, - prompt_builder=prompt_builder, + prompt_id="simple_chain_of_thought_prompt_builder", ) - assert adapter.prompt_builder == prompt_builder + assert adapter.run_config.prompt_id == "simple_chain_of_thought_prompt_builder" # TODO should run for all cases @@ -111,10 +107,12 @@ def test_tags_passed_through(mock_config, basic_task): kiln_task=basic_task, model_name="gpt-4", provider=ModelProviderName.openai, - tags=tags, + base_adapter_config=AdapterConfig( + default_tags=tags, + ), ) - assert adapter.default_tags == tags + assert adapter.base_adapter_config.default_tags == tags def test_invalid_provider(mock_config, basic_task): @@ -129,6 +127,7 @@ def test_openai_compatible_adapter(mock_compatible_config, mock_config, basic_ta mock_compatible_config.return_value.model_name = "test-model" mock_compatible_config.return_value.api_key = "test-key" mock_compatible_config.return_value.base_url = "https://test.com/v1" + mock_compatible_config.return_value.provider_name = "CustomProvider99" adapter = adapter_for_task( kiln_task=basic_task, @@ -141,6 +140,7 @@ def test_openai_compatible_adapter(mock_compatible_config, mock_config, basic_ta assert adapter.config.model_name == "test-model" assert adapter.config.api_key == "test-key" assert adapter.config.base_url == "https://test.com/v1" + assert adapter.config.provider_name == "CustomProvider99" def test_custom_openai_compatible_provider(mock_config, basic_task): diff --git a/libs/core/kiln_ai/adapters/test_prompt_adaptors.py b/libs/core/kiln_ai/adapters/test_prompt_adaptors.py index e7b97f90..c5f53324 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_adaptors.py +++ b/libs/core/kiln_ai/adapters/test_prompt_adaptors.py @@ -13,6 +13,7 @@ BasePromptBuilder, SimpleChainOfThoughtPromptBuilder, ) +from kiln_ai.datamodel import PromptId def get_all_models_and_providers(): @@ -132,7 +133,7 @@ async def test_mock_returning_run(tmp_path): "adapter_name": "kiln_langchain_adapter", "model_name": "custom.langchain:unknown_model", "model_provider": "ollama", - "prompt_builder_name": "simple_prompt_builder", + "prompt_id": "simple_prompt_builder", } @@ -149,8 +150,9 @@ async def test_all_models_providers_plaintext(tmp_path, model_name, provider_nam @pytest.mark.parametrize("model_name,provider_name", get_all_models_and_providers()) async def test_cot_prompt_builder(tmp_path, model_name, provider_name): task = build_test_task(tmp_path) - pb = SimpleChainOfThoughtPromptBuilder(task) - await run_simple_task(task, model_name, provider_name, pb) + await run_simple_task( + task, model_name, provider_name, "simple_chain_of_thought_prompt_builder" + ) def build_test_task(tmp_path: Path): @@ -186,20 +188,20 @@ async def run_simple_test( tmp_path: Path, 
model_name: str, provider: str | None = None, - prompt_builder: BasePromptBuilder | None = None, + prompt_id: PromptId | None = None, ): task = build_test_task(tmp_path) - return await run_simple_task(task, model_name, provider, prompt_builder) + return await run_simple_task(task, model_name, provider, prompt_id) async def run_simple_task( task: datamodel.Task, model_name: str, provider: str, - prompt_builder: BasePromptBuilder | None = None, + prompt_id: PromptId | None = None, ) -> datamodel.TaskRun: adapter = adapter_for_task( - task, model_name=model_name, provider=provider, prompt_builder=prompt_builder + task, model_name=model_name, provider=provider, prompt_id=prompt_id ) run = await adapter.invoke( @@ -212,13 +214,14 @@ async def run_simple_task( ) assert "64" in run.output.output source_props = run.output.source.properties - assert source_props["adapter_name"] == "kiln_langchain_adapter" + assert source_props["adapter_name"] in [ + "kiln_langchain_adapter", + "kiln_openai_compatible_adapter", + ] assert source_props["model_name"] == model_name assert source_props["model_provider"] == provider - expected_prompt_builder_name = ( - prompt_builder.__class__.prompt_builder_name() - if prompt_builder - else "simple_prompt_builder" - ) - assert source_props["prompt_builder_name"] == expected_prompt_builder_name + if prompt_id is None: + assert source_props["prompt_id"] == "simple_prompt_builder" + else: + assert source_props["prompt_id"] == prompt_id return run diff --git a/libs/core/kiln_ai/adapters/test_prompt_builders.py b/libs/core/kiln_ai/adapters/test_prompt_builders.py index 5c720f84..fca03058 100644 --- a/libs/core/kiln_ai/adapters/test_prompt_builders.py +++ b/libs/core/kiln_ai/adapters/test_prompt_builders.py @@ -3,7 +3,7 @@ import pytest -from kiln_ai.adapters.model_adapters.base_adapter import AdapterInfo, BaseAdapter +from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter from kiln_ai.adapters.model_adapters.test_structured_output import ( build_structured_output_test_task, ) @@ -17,8 +17,9 @@ SavedPromptBuilder, SimpleChainOfThoughtPromptBuilder, SimplePromptBuilder, + TaskRunConfigPromptBuilder, chain_of_thought_prompt, - prompt_builder_from_ui_name, + prompt_builder_from_id, ) from kiln_ai.adapters.test_prompt_adaptors import build_test_task from kiln_ai.datamodel import ( @@ -33,6 +34,7 @@ TaskOutputRating, TaskRun, ) +from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig logger = logging.getLogger(__name__) @@ -60,12 +62,8 @@ class MockAdapter(BaseAdapter): def _run(self, input: str) -> str: return "mock response" - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - adapter_name="mock_adapter", - model_name="mock_model", - model_provider="mock_provider", - ) + def adapter_name(self) -> str: + return "mock_adapter" def test_simple_prompt_builder_structured_output(tmp_path): @@ -316,54 +314,53 @@ def check_example_outputs(task: Task, count: int): assert f"## Example {count}" in prompt -def test_prompt_builder_name(): - assert SimplePromptBuilder.prompt_builder_name() == "simple_prompt_builder" - assert MultiShotPromptBuilder.prompt_builder_name() == "multi_shot_prompt_builder" - assert RepairsPromptBuilder.prompt_builder_name() == "repairs_prompt_builder" - - -def test_prompt_builder_from_ui_name(task_with_examples): +def test_prompt_builder_from_id(task_with_examples): task = task_with_examples - assert isinstance(prompt_builder_from_ui_name("basic", task), SimplePromptBuilder) assert isinstance( - 
prompt_builder_from_ui_name("few_shot", task), FewShotPromptBuilder + prompt_builder_from_id("simple_prompt_builder", task), SimplePromptBuilder ) assert isinstance( - prompt_builder_from_ui_name("many_shot", task), MultiShotPromptBuilder + prompt_builder_from_id("few_shot_prompt_builder", task), + FewShotPromptBuilder, ) assert isinstance( - prompt_builder_from_ui_name("repairs", task), RepairsPromptBuilder + prompt_builder_from_id("multi_shot_prompt_builder", task), + MultiShotPromptBuilder, ) assert isinstance( - prompt_builder_from_ui_name("simple_chain_of_thought", task), + prompt_builder_from_id("repairs_prompt_builder", task), + RepairsPromptBuilder, + ) + assert isinstance( + prompt_builder_from_id("simple_chain_of_thought_prompt_builder", task), SimpleChainOfThoughtPromptBuilder, ) assert isinstance( - prompt_builder_from_ui_name("few_shot_chain_of_thought", task), + prompt_builder_from_id("few_shot_chain_of_thought_prompt_builder", task), FewShotChainOfThoughtPromptBuilder, ) assert isinstance( - prompt_builder_from_ui_name("multi_shot_chain_of_thought", task), + prompt_builder_from_id("multi_shot_chain_of_thought_prompt_builder", task), MultiShotChainOfThoughtPromptBuilder, ) - with pytest.raises(ValueError, match="Unknown prompt builder: invalid_name"): - prompt_builder_from_ui_name("invalid_name", task) + with pytest.raises(ValueError, match="Unknown prompt generator: invalid_name"): + prompt_builder_from_id("invalid_name", task) with pytest.raises(ValueError, match="Prompt ID not found: 123"): - prompt_builder_from_ui_name("id::123", task) + prompt_builder_from_id("id::123", task) with pytest.raises( ValueError, match="Invalid fine-tune ID format. Expected 'project_id::task_id::fine_tune_id'", ): - prompt_builder_from_ui_name("fine_tune_prompt::123", task) + prompt_builder_from_id("fine_tune_prompt::123", task) with pytest.raises( ValueError, match="Fine-tune ID not found", ): - prompt_builder_from_ui_name("fine_tune_prompt::123::456::789", task) + prompt_builder_from_id("fine_tune_prompt::123::456::789", task) prompt = Prompt( name="test_prompt_name", @@ -372,7 +369,7 @@ def test_prompt_builder_from_ui_name(task_with_examples): parent=task, ) prompt.save_to_file() - pb = prompt_builder_from_ui_name("id::" + prompt.id, task) + pb = prompt_builder_from_id("id::" + prompt.id, task) assert isinstance(pb, SavedPromptBuilder) assert pb.prompt_id() == prompt.id assert pb.build_prompt(include_json_instructions=False) == "test_prompt" @@ -392,7 +389,7 @@ def test_prompt_builder_from_ui_name(task_with_examples): nested_fine_tune_id = ( task_with_examples.parent.id + "::" + task_with_examples.id + "::" + finetune.id ) - pb = prompt_builder_from_ui_name( + pb = prompt_builder_from_id( "fine_tune_prompt::" + nested_fine_tune_id, task_with_examples, ) @@ -589,3 +586,64 @@ def test_build_prompt_with_json_instructions(tmp_path): assert task.instruction in prompt_with_json for requirement in task.requirements: assert requirement.instruction in prompt_with_json + + +def test_task_run_config_prompt_builder(tmp_path): + task = build_test_task(tmp_path) + + run_config = TaskRunConfig( + name="test_run_config", + parent=task, + run_config_properties=RunConfigProperties( + model_name="gpt-4", + model_provider_name="openai", + prompt_id="simple_prompt_builder", + ), + prompt=Prompt( + name="test prompt name", + prompt="test prompt content", + chain_of_thought_instructions="test step by step", + ), + ) + run_config.save_to_file() + + # Construct the eval prompt ID + run_config_prompt_id = ( + 
f"task_run_config::{task.parent.id}::{task.id}::{run_config.id}" + ) + + # Test successful creation 2 ways: constructor and ID creation + builders = [ + TaskRunConfigPromptBuilder( + task=task, run_config_prompt_id=run_config_prompt_id + ), + prompt_builder_from_id(run_config_prompt_id, task), + ] + + for builder in builders: + assert ( + builder.build_prompt(include_json_instructions=False) + == "test prompt content" + ) + assert builder.chain_of_thought_prompt() == "test step by step" + assert builder.prompt_id() == run_config_prompt_id + + +def test_task_run_config_prompt_builder_validation_errors(tmp_path): + task = build_test_task(tmp_path) + + # Test invalid format + with pytest.raises(ValueError, match="Invalid task run config prompt ID"): + TaskRunConfigPromptBuilder( + task=task, run_config_prompt_id="task_run_config::wrong::format" + ) + + # Test task ID mismatch + wrong_task_id = f"task_run_config::{task.parent.id}::wrong_task_id::config_id" + with pytest.raises(ValueError, match="Task ID mismatch"): + TaskRunConfigPromptBuilder(task=task, run_config_prompt_id=wrong_task_id) + + # Test eval not found + nonexistent_eval = f"task_run_config::{task.parent.id}::{task.id}::nonexistent_id" + with pytest.raises(ValueError, match="Task run config ID not found"): + TaskRunConfigPromptBuilder(task=task, run_config_prompt_id=nonexistent_eval) diff --git a/libs/core/kiln_ai/datamodel/__init__.py b/libs/core/kiln_ai/datamodel/__init__.py index 0d622418..f53f76ea 100644 --- a/libs/core/kiln_ai/datamodel/__init__.py +++ b/libs/core/kiln_ai/datamodel/__init__.py @@ -11,6 +11,7 @@ from __future__ import annotations +from kiln_ai.datamodel import dataset_split, eval, strict_mode from kiln_ai.datamodel.datamodel_enums import ( FinetuneDataStrategy, FineTuneStatusType, @@ -26,7 +27,12 @@ Finetune, ) from kiln_ai.datamodel.project import Project -from kiln_ai.datamodel.prompt import Prompt +from kiln_ai.datamodel.prompt import BasePrompt, Prompt +from kiln_ai.datamodel.prompt_id import ( + PromptGenerators, + PromptId, + prompt_generator_values, +) from kiln_ai.datamodel.task import Task, TaskRequirement from kiln_ai.datamodel.task_output import ( DataSource, @@ -43,6 +49,7 @@ __all__ = [ "strict_mode", "dataset_split", + "eval", "Task", "Project", "TaskRun", @@ -59,8 +66,12 @@ "DatasetSplit", "RequirementRating", "TaskRequirement", + "BasePrompt", "Prompt", "TaskOutputRating", "StructuredOutputMode", "FinetuneDataStrategy", + "PromptId", + "PromptGenerators", + "prompt_generator_values", ] diff --git a/libs/core/kiln_ai/datamodel/datamodel_enums.py b/libs/core/kiln_ai/datamodel/datamodel_enums.py index a588765e..2c93f1aa 100644 --- a/libs/core/kiln_ai/datamodel/datamodel_enums.py +++ b/libs/core/kiln_ai/datamodel/datamodel_enums.py @@ -34,6 +34,7 @@ class StructuredOutputMode(str, Enum): default = "default" json_schema = "json_schema" + function_calling_weak = "function_calling_weak" function_calling = "function_calling" json_mode = "json_mode" json_instructions = "json_instructions" diff --git a/libs/core/kiln_ai/datamodel/dataset_filters.py b/libs/core/kiln_ai/datamodel/dataset_filters.py new file mode 100644 index 00000000..bbc69e9f --- /dev/null +++ b/libs/core/kiln_ai/datamodel/dataset_filters.py @@ -0,0 +1,114 @@ +from enum import Enum +from typing import Annotated, Protocol + +from pydantic import AfterValidator + +from kiln_ai.datamodel.task_run import TaskRun + + +class DatasetFilter(Protocol): + """A protocol defining the interface for dataset filters. 
+ + This allows both stateless function-based filters and stateful class-based filters + to be used interchangeably, as long as they implement the __call__ method. + """ + + def __call__(self, task_run: TaskRun) -> bool: + """Return True if the task run should be included in the dataset.""" + ... + + +def AllDatasetFilter(_: TaskRun) -> bool: + return True + + +def HighRatingDatasetFilter(task_run: TaskRun) -> bool: + if task_run.output is None: + return False + if task_run.repaired_output is not None: + # Repairs always considered high quality + return True + if task_run.output.rating is None: + return False + return task_run.output.rating.is_high_quality() + + +def ThinkingModelDatasetFilter(task_run: TaskRun) -> bool: + """ + A filter that returns True if the task has intermediate outputs we can train a 'thinking' model on (reasoning or chain of thought) + """ + return task_run.has_thinking_training_data() + + +def ThinkingModelHighRatedFilter(task_run: TaskRun) -> bool: + """ + A filter that returns True if the task has thinking data and the output is high quality + """ + return ThinkingModelDatasetFilter(task_run) and HighRatingDatasetFilter(task_run) + + +class TagFilter: + """ + A filter that returns True if the task has a tag matching the given tag. + """ + + def __init__(self, tag: str): + self.tag = tag + + def __call__(self, task_run: TaskRun) -> bool: + return self.tag in task_run.tags + + +class StaticDatasetFilters(str, Enum): + """Dataset filter names.""" + + ALL = "all" + HIGH_RATING = "high_rating" + THINKING_MODEL = "thinking_model" + THINKING_MODEL_HIGH_RATED = "thinking_model_high_rated" + + +static_dataset_filters = { + StaticDatasetFilters.ALL: AllDatasetFilter, + StaticDatasetFilters.HIGH_RATING: HighRatingDatasetFilter, + StaticDatasetFilters.THINKING_MODEL: ThinkingModelDatasetFilter, + StaticDatasetFilters.THINKING_MODEL_HIGH_RATED: ThinkingModelHighRatedFilter, +} + +DatasetFilterId = Annotated[ + str, + AfterValidator(lambda v: _check_dataset_filter_id(v)), +] +""" +A pydantic type that validates strings containing a valid dataset filter ID. + +Dataset filter IDs can be one of: +- A built-in dataset filter name +- A tag::[tag_name] filter, where [tag_name] is a string +""" + + +def _check_dataset_filter_id(id: str) -> str: + """ + Check that the dataset filter ID is valid. + """ + if id in static_dataset_filters: + return id + + if id.startswith("tag::") and len(id) > 5: + return id + + raise ValueError(f"Invalid dataset filter ID: {id}") + + +def dataset_filter_from_id(id: DatasetFilterId) -> DatasetFilter: + """ + Get a dataset filter from an ID. 
+ """ + if id.startswith("tag::") and len(id) > 5: + return TagFilter(id[5:]) + + if id in static_dataset_filters: + return static_dataset_filters[id] + + raise ValueError(f"Invalid dataset filter ID: {id}") diff --git a/libs/core/kiln_ai/datamodel/dataset_split.py b/libs/core/kiln_ai/datamodel/dataset_split.py index bb1c3833..00c88341 100644 --- a/libs/core/kiln_ai/datamodel/dataset_split.py +++ b/libs/core/kiln_ai/datamodel/dataset_split.py @@ -4,69 +4,21 @@ import math import random -from enum import Enum -from typing import TYPE_CHECKING, Callable +from typing import TYPE_CHECKING from pydantic import BaseModel, Field, model_validator from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel -from kiln_ai.datamodel.task_run import TaskRun +from kiln_ai.datamodel.dataset_filters import ( + DatasetFilter, + DatasetFilterId, + dataset_filter_from_id, +) if TYPE_CHECKING: from kiln_ai.datamodel.task import Task -# A type alias that takes a TaskRun and returns a boolean indicating whether the task run should be included in the split. -# Several filters are defined below like AllDatasetFilter, HighRatingDatasetFilter, etc. -DatasetFilter = Callable[[TaskRun], bool] - - -def AllDatasetFilter(_: TaskRun) -> bool: - return True - - -def HighRatingDatasetFilter(task_run: TaskRun) -> bool: - if task_run.output is None: - return False - if task_run.repaired_output is not None: - # Repairs always considered high quality - return True - if task_run.output.rating is None: - return False - return task_run.output.rating.is_high_quality() - - -def ThinkingModelDatasetFilter(task_run: TaskRun) -> bool: - """ - A filter that returns True if the task has intermediate outputs we can training a 'thinking' model on (reasoning or chain of thought) - """ - return task_run.has_thinking_training_data() - - -def ThinkingModelHighRatedFilter(task_run: TaskRun) -> bool: - """ - A filter that returns True if the task has thinking data and the output is high quality - """ - return ThinkingModelDatasetFilter(task_run) and HighRatingDatasetFilter(task_run) - - -class DatasetFilterType(str, Enum): - """Dataset filter names.""" - - ALL = "all" - HIGH_RATING = "high_rating" - THINKING_MODEL = "thinking_model" - THINKING_MODEL_HIGH_RATED = "thinking_model_high_rated" - - -dataset_filters = { - DatasetFilterType.ALL: AllDatasetFilter, - DatasetFilterType.HIGH_RATING: HighRatingDatasetFilter, - DatasetFilterType.THINKING_MODEL: ThinkingModelDatasetFilter, - DatasetFilterType.THINKING_MODEL_HIGH_RATED: ThinkingModelHighRatedFilter, -} - - class DatasetSplitDefinition(BaseModel): """ A definition of a split in a dataset. @@ -126,7 +78,7 @@ class DatasetSplit(KilnParentedModel): split_contents: dict[str, list[str]] = Field( description="The contents of each split in the dataset. The key is the split name, and the value is a list of task run IDs.", ) - filter: DatasetFilterType | None = Field( + filter: DatasetFilterId | None = Field( default=None, description="The filter used to build the dataset.", ) @@ -144,13 +96,13 @@ def from_task( name: str, task: "Task", splits: list[DatasetSplitDefinition], - filter_type: DatasetFilterType = DatasetFilterType.ALL, + filter_id: DatasetFilterId = "all", description: str | None = None, ): """ Build a dataset split from a task. 
""" - filter = dataset_filters[filter_type] + filter = dataset_filter_from_id(filter_id) split_contents = cls.build_split_contents(task, splits, filter) return cls( parent=task, @@ -158,7 +110,7 @@ def from_task( description=description, splits=splits, split_contents=split_contents, - filter=filter_type, + filter=filter_id, ) @classmethod diff --git a/libs/core/kiln_ai/datamodel/eval.py b/libs/core/kiln_ai/datamodel/eval.py new file mode 100644 index 00000000..db3938f9 --- /dev/null +++ b/libs/core/kiln_ai/datamodel/eval.py @@ -0,0 +1,298 @@ +import json +from enum import Enum +from typing import TYPE_CHECKING, Any, Dict, List, Union + +from pydantic import BaseModel, Field, model_validator +from typing_extensions import Self + +from kiln_ai.datamodel.basemodel import ( + ID_TYPE, + NAME_FIELD, + KilnParentedModel, + KilnParentModel, +) +from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType +from kiln_ai.datamodel.dataset_filters import DatasetFilterId +from kiln_ai.datamodel.json_schema import string_to_json_key +from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error + +if TYPE_CHECKING: + from kiln_ai.datamodel.task import Task + +EvalScores = Dict[str, float] + + +class EvalTemplateId(str, Enum): + """ + An eval template is a pre-defined eval that can be used as a starting point for a new eval. + """ + + kiln_requirements = "kiln_requirements" + toxicity = "toxicity" + bias = "bias" + maliciousness = "maliciousness" + factual_correctness = "factual_correctness" + jailbreak = "jailbreak" + + +class EvalConfigType(str, Enum): + g_eval = "g_eval" + llm_as_judge = "llm_as_judge" + + +class EvalOutputScore(BaseModel): + """ + A definition of a score that an evaluator will produce. + + Very similar to TaskRequirement, but conceptually different keeping in a separate models. + """ + + name: str = Field( + description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance." + ) + instruction: str | None = Field( + default=None, + description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.", + ) + type: TaskOutputRatingType = Field( + description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical')." + ) + + def json_key(self) -> str: + """ + The JSON key for the score, used when running the evaluator with a LLM and we need JSON output. + + For example, "Overall Rating" -> "overall_rating" + """ + return string_to_json_key(self.name) + + @model_validator(mode="after") + def validate_type(self) -> Self: + if self.type == TaskOutputRatingType.custom: + raise ValueError( + f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score." + ) + return self + + +class EvalRun(KilnParentedModel): + """ + The results of running an eval on a single dataset item. + + This is a child of an EvalConfig, which specifies how the scores were generated. + + Eval runs can be one of 2 types: + 1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We get the task input from the dataset_id.input, run the task with the task_run_config, then ran the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run. 
+ 2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We used the existing dataset item input/output, and ran the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item. + """ + + dataset_id: ID_TYPE = Field( + description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun." + ) + task_run_config_id: ID_TYPE | None = Field( + description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config." + ) + eval_config_eval: bool = Field( + description="Whether this eval run to evaluate the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.", + default=False, + ) + # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally. + input: str = Field( + description="The input to the task. JSON formatted for structured input, plaintext for unstructured input." + ) + output: str = Field( + description="The output of the task. JSON formatted for structured output, plaintext for unstructured output." + ) + intermediate_outputs: Dict[str, str] | None = Field( + default=None, + description="The intermediate outputs of the task (example, eval thinking).", + ) + scores: EvalScores = Field( + description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)." + ) + + def parent_eval_config(self) -> Union["EvalConfig", None]: + if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig": + raise ValueError("parent must be an EvalConfig") + return self.parent # type: ignore + + @model_validator(mode="after") + def validate_eval_run_types(self) -> Self: + if self.eval_config_eval and self.task_run_config_id is not None: + raise ValueError( + "task_run_config_id must be None if eval_config_eval is true" + ) + if not self.eval_config_eval and self.task_run_config_id is None: + raise ValueError( + "task_run_config_id must be set if eval_config_eval is false" + ) + return self + + @model_validator(mode="after") + def validate_scores(self) -> Self: + # We're checking the scores have the expected keys from the grand-parent eval + if self.scores is None or len(self.scores) == 0: + raise ValueError("scores are required, and must have at least one score.") + + parent_eval_config = self.parent_eval_config() + eval = parent_eval_config.parent_eval() if parent_eval_config else None + if not eval: + # Can't validate without the grand-parent eval, allow it to be validated later + return self + + output_score_keys = [score.json_key() for score in eval.output_scores] + if set(output_score_keys) != set(self.scores.keys()): + raise ValueError( + f"The scores produced by the evaluator must match the scores expected by the eval. 
Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]" + ) + + # Check that each score is expected in this eval and the correct type + for output_score in eval.output_scores: + match output_score.type: + case TaskOutputRatingType.five_star: + five_star_score = self.scores[output_score.json_key()] + if ( + not isinstance(five_star_score, float) + or five_star_score < 1.0 + or five_star_score > 5.0 + ): + raise ValueError( + f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}" + ) + case TaskOutputRatingType.pass_fail: + pass_fail_score = self.scores[output_score.json_key()] + if ( + not isinstance(pass_fail_score, float) + or pass_fail_score < 0.0 + or pass_fail_score > 1.0 + ): + raise ValueError( + f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}" + ) + case TaskOutputRatingType.pass_fail_critical: + pass_fail_critical_score = self.scores[output_score.json_key()] + if ( + not isinstance(pass_fail_critical_score, float) + or pass_fail_critical_score < -1.0 + or pass_fail_critical_score > 1.0 + ): + raise ValueError( + f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}" + ) + case TaskOutputRatingType.custom: + raise ValueError( + f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score." + ) + case _: + # Catch missing cases + raise_exhaustive_enum_error(output_score.type) + return self + + +class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}): + """ + A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc. + + An eval might have many configs, for example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config. + """ + + name: str = NAME_FIELD + model_name: str = Field( + description="The name of the model to use for this eval config.", + ) + model_provider: str = Field( + description="The provider of the model to use for this eval config.", + ) + config_type: EvalConfigType = Field( + default=EvalConfigType.g_eval, + description="This is used to determine the type of eval to run.", + ) + properties: dict[str, Any] = Field( + default={}, + description="Properties to be used to execute the eval config. 
This is config_type specific and should serialize to a json dict.", + ) + + def parent_eval(self) -> Union["Eval", None]: + if self.parent is not None and self.parent.__class__.__name__ != "Eval": + raise ValueError("parent must be an Eval") + return self.parent # type: ignore + + def runs(self, readonly: bool = False) -> list[EvalRun]: + return super().runs(readonly=readonly) # type: ignore + + @model_validator(mode="after") + def validate_properties(self) -> Self: + if ( + self.config_type == EvalConfigType.g_eval + or self.config_type == EvalConfigType.llm_as_judge + ): + if "eval_steps" not in self.properties or not isinstance( + self.properties["eval_steps"], list + ): + raise ValueError("eval_steps is required and must be a list for g_eval") + if "task_description" in self.properties and not isinstance( + self.properties["task_description"], str + ): + raise ValueError( + "task_description is optional, but if provided must be a string" + ) + return self + else: + raise ValueError(f"Invalid eval config type: {self.config_type}") + + @model_validator(mode="after") + def validate_json_serializable(self) -> "EvalConfig": + try: + # This will raise a TypeError if the dict contains non-JSON-serializable objects + json.dumps(self.properties) + except TypeError as e: + raise ValueError(f"Properties must be JSON serializable: {str(e)}") + return self + + +class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}): + name: str = NAME_FIELD + description: str | None = Field( + default=None, description="The description of the eval" + ) + template: EvalTemplateId | None = Field( + default=None, + description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.", + ) + current_config_id: ID_TYPE = Field( + default=None, + description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.", + ) + eval_set_filter_id: DatasetFilterId = Field( + description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id." + ) + eval_configs_filter_id: DatasetFilterId = Field( + description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id." + ) + output_scores: List[EvalOutputScore] = Field( + description="The scores this evaluator should produce." + ) + + # Workaround to return typed parent without importing Task + def parent_task(self) -> Union["Task", None]: + if self.parent is not None and self.parent.__class__.__name__ != "Task": + raise ValueError("parent must be a Task") + return self.parent # type: ignore + + def configs(self, readonly: bool = False) -> list[EvalConfig]: + return super().configs(readonly=readonly) # type: ignore + + @model_validator(mode="after") + def validate_scores(self) -> Self: + if self.output_scores is None or len(self.output_scores) == 0: + raise ValueError( + "output_scores are required, and must have at least one score." + ) + + # check for duplicate names (once transformed to JSON keys) + output_score_keys = [score.json_key() for score in self.output_scores] + if len(output_score_keys) != len(set(output_score_keys)): + raise ValueError( + f"output_scores must have unique names (once transformed to JSON keys). 
Got: [{', '.join(output_score_keys)}]" + ) + return self diff --git a/libs/core/kiln_ai/datamodel/eval_datamodel.py b/libs/core/kiln_ai/datamodel/eval_datamodel.py deleted file mode 100644 index 6cf4a23b..00000000 --- a/libs/core/kiln_ai/datamodel/eval_datamodel.py +++ /dev/null @@ -1,10 +0,0 @@ -from pydantic import Field - -from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnBaseModel - - -class Eval(KilnBaseModel): - name: str = NAME_FIELD - description: str | None = Field( - default=None, description="The description of the eval" - ) diff --git a/libs/core/kiln_ai/datamodel/json_schema.py b/libs/core/kiln_ai/datamodel/json_schema.py index ffa1267e..146e4ca3 100644 --- a/libs/core/kiln_ai/datamodel/json_schema.py +++ b/libs/core/kiln_ai/datamodel/json_schema.py @@ -1,4 +1,5 @@ import json +import re from typing import Annotated, Dict import jsonschema @@ -83,3 +84,8 @@ def schema_from_json_str(v: str) -> Dict: raise ValueError(f"Invalid JSON: {v}\n {e}") except Exception as e: raise ValueError(f"Unexpected error parsing JSON schema: {v}\n {e}") + + +def string_to_json_key(s: str) -> str: + """Convert a string to a valid JSON key.""" + return re.sub(r"[^a-z0-9_]", "", s.strip().lower().replace(" ", "_")) diff --git a/libs/core/kiln_ai/datamodel/prompt.py b/libs/core/kiln_ai/datamodel/prompt.py index c4ec7d5e..5ffd2875 100644 --- a/libs/core/kiln_ai/datamodel/prompt.py +++ b/libs/core/kiln_ai/datamodel/prompt.py @@ -1,14 +1,24 @@ -from pydantic import Field +from pydantic import BaseModel, Field from kiln_ai.datamodel.basemodel import NAME_FIELD, KilnParentedModel -class Prompt(KilnParentedModel): +class BasePrompt(BaseModel): """ - A prompt for a task. + A prompt for a task. This is the basic data storage format which can be used throughout a project. + + The "Prompt" model name is reserved for the custom prompts parented by a task. """ name: str = NAME_FIELD + description: str | None = Field( + default=None, + description="A more detailed description of the prompt.", + ) + generator_id: str | None = Field( + default=None, + description="The id of the generator that created this prompt.", + ) prompt: str = Field( description="The prompt for the task.", min_length=1, @@ -17,3 +27,11 @@ class Prompt(KilnParentedModel): default=None, description="Instructions for the model 'thinking' about the requirement prior to answering. Used for chain of thought style prompting. COT will not be used unless this is provided.", ) + + +class Prompt(KilnParentedModel, BasePrompt): + """ + A prompt for a task. This is the custom prompt parented by a task. 
+ """ + + pass diff --git a/libs/core/kiln_ai/datamodel/prompt_id.py b/libs/core/kiln_ai/datamodel/prompt_id.py new file mode 100644 index 00000000..19ca455a --- /dev/null +++ b/libs/core/kiln_ai/datamodel/prompt_id.py @@ -0,0 +1,83 @@ +from enum import Enum +from typing import Annotated + +from pydantic import AfterValidator + + +# Generators that can take any task and build a prompt +class PromptGenerators(str, Enum): + SIMPLE = "simple_prompt_builder" + MULTI_SHOT = "multi_shot_prompt_builder" + FEW_SHOT = "few_shot_prompt_builder" + REPAIRS = "repairs_prompt_builder" + SIMPLE_CHAIN_OF_THOUGHT = "simple_chain_of_thought_prompt_builder" + FEW_SHOT_CHAIN_OF_THOUGHT = "few_shot_chain_of_thought_prompt_builder" + MULTI_SHOT_CHAIN_OF_THOUGHT = "multi_shot_chain_of_thought_prompt_builder" + + +prompt_generator_values = [pg.value for pg in PromptGenerators] + + +PromptId = Annotated[ + str, + AfterValidator(lambda v: _check_prompt_id(v)), +] +""" +A pydantic type that validates strings containing a valid prompt ID. + +Prompt IDs can be one of: +- A saved prompt ID +- A fine-tune prompt ID +- A task run config ID +- A prompt generator name +""" + + +def _check_prompt_id(id: str) -> str: + """ + Check that the prompt ID is valid. + """ + if id in prompt_generator_values: + return id + + if id.startswith("id::"): + # check it has 4 parts divided by :: -- 'id::project_id::task_id::prompt_id' + parts = id.split("::") + if len(parts) != 2 or len(parts[1]) == 0: + raise ValueError( + f"Invalid saved prompt ID: {id}. Expected format: 'id::[prompt_id]'." + ) + return id + + if id.startswith("task_run_config::"): + # check it had a eval_id after the :: -- 'project_id::task_id::task_run_config_id' + parts = id.split("::") + if len(parts) != 4: + raise ValueError( + f"Invalid task run config prompt ID: {id}. Expected format: 'task_run_config::[project_id]::[task_id]::[task_run_config_id]'." + ) + return id + + if id.startswith("fine_tune_prompt::"): + # check it had a fine_tune_id after the :: -- 'fine_tune_prompt::fine_tune_id' + fine_tune_id = id[18:] + if len(fine_tune_id) == 0: + raise ValueError( + f"Invalid fine-tune prompt ID: {id}. Expected format: 'fine_tune_prompt::[fine_tune_id]'." + ) + return id + + raise ValueError(f"Invalid prompt ID: {id}") + + +def is_frozen_prompt(id: PromptId) -> bool: + """ + Check if the prompt ID is a frozen prompt. 
+ """ + if id.startswith("id::"): + return True + if id.startswith("task_run_config::"): + return True + if id.startswith("fine_tune_prompt::"): + return True + return False diff --git a/libs/core/kiln_ai/datamodel/task.py b/libs/core/kiln_ai/datamodel/task.py index 38ac7885..03c8c756 100644 --- a/libs/core/kiln_ai/datamodel/task.py +++ b/libs/core/kiln_ai/datamodel/task.py @@ -1,4 +1,4 @@ -from typing import Dict, List +from typing import TYPE_CHECKING, Dict, List, Union from pydantic import BaseModel, Field @@ -13,10 +13,15 @@ ) from kiln_ai.datamodel.datamodel_enums import Priority, TaskOutputRatingType from kiln_ai.datamodel.dataset_split import DatasetSplit +from kiln_ai.datamodel.eval import Eval from kiln_ai.datamodel.json_schema import JsonObjectSchema, schema_from_json_str -from kiln_ai.datamodel.prompt import Prompt +from kiln_ai.datamodel.prompt import BasePrompt, Prompt +from kiln_ai.datamodel.prompt_id import PromptId from kiln_ai.datamodel.task_run import TaskRun +if TYPE_CHECKING: + from kiln_ai.datamodel.project import Project + class TaskRequirement(BaseModel): """ @@ -34,6 +39,76 @@ class TaskRequirement(BaseModel): type: TaskOutputRatingType = Field(default=TaskOutputRatingType.five_star) +class RunConfigProperties(BaseModel): + """ + A configuration for running a task. + + This includes everything needed to run a task, except the input and task ID. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). + """ + + model_name: str = Field(description="The model to use for this run config.") + model_provider_name: str = Field( + description="The provider to use for this run config." + ) + prompt_id: PromptId = Field( + description="The prompt to use for this run config. Defaults to building a simple prompt from the task if not provided.", + ) + + +class RunConfig(RunConfigProperties): + """ + A configuration for running a task. + + This includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). + + For example: task, model, provider, prompt, etc. + """ + + task: "Task" = Field(description="The task to run.") + + +class TaskRunConfig(KilnParentedModel): + """ + A Kiln model for persisting a run config in a Kiln Project, nested under a task. + + Typically used to save a method of running a task for evaluation. + + A run config includes everything needed to run a task, except the input. Running the same RunConfig with the same input should make identical calls to the model (output may vary as models are non-deterministic). + """ + + name: str = NAME_FIELD + description: str | None = Field( + default=None, description="The description of the task run config." + ) + run_config_properties: RunConfigProperties = Field( + description="The run config properties to use for this task run." + ) + # The prompt_id in the run_config_properties is the prompt ID to use for this task run. + # However, we want the prompt to be perfectly consistent, and some prompt_ids are dynamic. + # If we need to "freeze" a prompt, we can do so here (then point the prompt_id to this frozen prompt). 
+ prompt: BasePrompt | None = Field( + default=None, + description="A prompt to use for run config.", + ) + + # Workaround to return typed parent without importing Task + def parent_task(self) -> Union["Task", None]: + if self.parent is None or self.parent.__class__.__name__ != "Task": + return None + return self.parent # type: ignore + + def run_config(self) -> RunConfig: + parent_task = self.parent_task() + if parent_task is None: + raise ValueError("Run config must be parented to a task") + return RunConfig( + task=parent_task, + model_name=self.run_config_properties.model_name, + model_provider_name=self.run_config_properties.model_provider_name, + prompt_id=self.run_config_properties.prompt_id, + ) + + class Task( KilnParentedModel, KilnParentModel, @@ -42,6 +117,8 @@ class Task( "dataset_splits": DatasetSplit, "finetunes": Finetune, "prompts": Prompt, + "evals": Eval, + "run_configs": TaskRunConfig, }, ): """ @@ -90,3 +167,15 @@ def finetunes(self, readonly: bool = False) -> list[Finetune]: def prompts(self, readonly: bool = False) -> list[Prompt]: return super().prompts(readonly=readonly) # type: ignore + + def evals(self, readonly: bool = False) -> list[Eval]: + return super().evals(readonly=readonly) # type: ignore + + def run_configs(self, readonly: bool = False) -> list[TaskRunConfig]: + return super().run_configs(readonly=readonly) # type: ignore + + # Workaround to return typed parent without importing Task + def parent_project(self) -> Union["Project", None]: + if self.parent is None or self.parent.__class__.__name__ != "Project": + return None + return self.parent # type: ignore diff --git a/libs/core/kiln_ai/datamodel/task_output.py b/libs/core/kiln_ai/datamodel/task_output.py index ae0de84d..475bb547 100644 --- a/libs/core/kiln_ai/datamodel/task_output.py +++ b/libs/core/kiln_ai/datamodel/task_output.py @@ -11,6 +11,7 @@ from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType from kiln_ai.datamodel.json_schema import validate_schema from kiln_ai.datamodel.strict_mode import strict_mode +from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error if TYPE_CHECKING: from kiln_ai.datamodel.task import Task @@ -25,6 +26,27 @@ class RequirementRating(BaseModel): type: TaskOutputRatingType = Field(description="The type of rating") +def normalize_rating(rating: float, rating_type: TaskOutputRatingType) -> float: + """Normalize a rating to a 0-1 scale. Simple normalization, not z-score.""" + match rating_type: + case TaskOutputRatingType.five_star: + if rating < 1 or rating > 5: + raise ValueError("Five star rating must be between 1 and 5") + return (rating - 1) / 4 + case TaskOutputRatingType.pass_fail: + if rating < 0 or rating > 1: + raise ValueError("Pass fail rating must 0 to 1") + return rating + case TaskOutputRatingType.pass_fail_critical: + if rating < -1 or rating > 1: + raise ValueError("Pass fail critical rating must -1 to 1") + return (rating + 1) / 2 # -1 to 1 + case TaskOutputRatingType.custom: + raise ValueError("Custom rating type can not be normalized") + case _: + raise_exhaustive_enum_error(rating_type) + + class TaskOutputRating(KilnBaseModel): """ A rating for a task output, including an overall rating and ratings for each requirement. @@ -205,13 +227,13 @@ class DataSource(BaseModel): not_allowed_for=[DataSourceType.human], ), DataSourceProperty( + # Legacy field -- allow loading from old runs, but we shouldn't be setting it. 
name="prompt_builder_name", type=str, not_allowed_for=[DataSourceType.human], ), DataSourceProperty( - # Optional: an ID within the scope of the prompt_builder_name. - # Used for prompt builders with IDs (like saved prompts, fine-tune prompts) + # The PromptId of the prompt. Can be a saved prompt, fine-tune, generator name, etc. See PromptId type for more details. name="prompt_id", type=str, not_allowed_for=[DataSourceType.human], diff --git a/libs/core/kiln_ai/datamodel/test_basemodel.py b/libs/core/kiln_ai/datamodel/test_basemodel.py index 460b9dea..de33f2df 100644 --- a/libs/core/kiln_ai/datamodel/test_basemodel.py +++ b/libs/core/kiln_ai/datamodel/test_basemodel.py @@ -6,7 +6,7 @@ import pytest -from kiln_ai.adapters.model_adapters.base_adapter import AdapterInfo, BaseAdapter +from kiln_ai.adapters.model_adapters.base_adapter import BaseAdapter from kiln_ai.adapters.run_output import RunOutput from kiln_ai.datamodel import Task, TaskRun from kiln_ai.datamodel.basemodel import ( @@ -15,6 +15,7 @@ string_to_valid_name, ) from kiln_ai.datamodel.model_cache import ModelCache +from kiln_ai.datamodel.task import RunConfig @pytest.fixture @@ -484,13 +485,8 @@ class MockAdapter(BaseAdapter): async def _run(self, input): return RunOutput(output="test output", intermediate_outputs=None) - def adapter_info(self) -> AdapterInfo: - return AdapterInfo( - adapter_name="test", - model_name=self.model_name, - model_provider=self.model_provider_name, - prompt_builder_name="test", - ) + def adapter_name(self) -> str: + return "test" @pytest.fixture @@ -501,9 +497,12 @@ def base_task(): @pytest.fixture def adapter(base_task): return MockAdapter( - kiln_task=base_task, - model_name="test_model", - model_provider_name="test_provider", + run_config=RunConfig( + task=base_task, + model_name="test_model", + model_provider_name="test_provider", + prompt_id="simple_prompt_builder", + ), ) diff --git a/libs/core/kiln_ai/datamodel/test_dataset_filters.py b/libs/core/kiln_ai/datamodel/test_dataset_filters.py new file mode 100644 index 00000000..43130f92 --- /dev/null +++ b/libs/core/kiln_ai/datamodel/test_dataset_filters.py @@ -0,0 +1,71 @@ +import pytest +from pydantic import BaseModel + +from kiln_ai.datamodel.dataset_filters import ( + AllDatasetFilter, + DatasetFilterId, + HighRatingDatasetFilter, + StaticDatasetFilters, + TagFilter, + ThinkingModelDatasetFilter, + ThinkingModelHighRatedFilter, + dataset_filter_from_id, +) + +# Note: Many more filter tests in test_dataset_split.py + + +def test_all_dataset_filter_from_id(): + assert dataset_filter_from_id("all") == AllDatasetFilter + + +def test_high_rating_dataset_filter_from_id(): + assert dataset_filter_from_id("high_rating") == HighRatingDatasetFilter + + +def test_thinking_model_dataset_filter_from_id(): + assert dataset_filter_from_id("thinking_model") == ThinkingModelDatasetFilter + + +def test_thinking_model_high_rated_dataset_filter_from_id(): + assert ( + dataset_filter_from_id("thinking_model_high_rated") + == ThinkingModelHighRatedFilter + ) + + +def test_all_static_dataset_filters(): + for filter_id in StaticDatasetFilters: + assert dataset_filter_from_id(filter_id) is not None + + +class ModelTester(BaseModel): + dsid: DatasetFilterId + + +@pytest.mark.parametrize( + "tag,expected_error,expected_tag", + [ + ("tag::test", False, "test"), + ("tag::other", False, "other"), + ("tag::", True, None), + ("tag", True, None), + ("", True, None), + ], +) +def test_tag_filter(tag, expected_error, expected_tag): + # Check our model validators + if 
expected_error: + with pytest.raises(ValueError): + ModelTester(dsid=tag) + else: + ModelTester(dsid=tag) + + # Check the constructor + if expected_tag is None: + with pytest.raises(ValueError, match="Invalid dataset filter ID:"): + dataset_filter_from_id(tag) + else: + filter = dataset_filter_from_id(tag) + assert isinstance(filter, TagFilter) + assert filter.tag == expected_tag diff --git a/libs/core/kiln_ai/datamodel/test_dataset_split.py b/libs/core/kiln_ai/datamodel/test_dataset_split.py index b00d5a8e..c3b92caa 100644 --- a/libs/core/kiln_ai/datamodel/test_dataset_split.py +++ b/libs/core/kiln_ai/datamodel/test_dataset_split.py @@ -14,14 +14,16 @@ TaskRun, ) from kiln_ai.datamodel.dataset_split import ( - AllDatasetFilter, AllSplitDefinition, - DatasetFilterType, + Train60Test20Val20SplitDefinition, + Train80Test20SplitDefinition, +) +from kiln_ai.datamodel.test_dataset_filters import ( + AllDatasetFilter, HighRatingDatasetFilter, + TagFilter, ThinkingModelDatasetFilter, ThinkingModelHighRatedFilter, - Train60Test20Val20SplitDefinition, - Train80Test20SplitDefinition, ) @@ -44,6 +46,7 @@ def sample_task_runs(sample_task): task_runs = [] for i in range(10): rating = 5 if i < 6 else 1 # 6 high, 4 low ratings + tags = ["tag1"] if i < 6 else [] task_run = TaskRun( parent=sample_task, input=f"input_{i}", @@ -61,6 +64,7 @@ def sample_task_runs(sample_task): value=rating, type=TaskOutputRatingType.five_star ), ), + tags=tags, ) task_run.save_to_file() task_runs.append(task_run) @@ -201,10 +205,10 @@ def test_dataset_split_with_high_rating_filter(sample_task, sample_task_runs): "Split Name", sample_task, Train80Test20SplitDefinition, - filter_type=DatasetFilterType.HIGH_RATING, + filter_id="high_rating", ) - assert dataset.filter == DatasetFilterType.HIGH_RATING + assert dataset.filter == "high_rating" # Check that only high-rated task runs are included all_ids = [] @@ -331,3 +335,21 @@ def test_thinking_model_dataset_filter_high_rated( ) assert ThinkingModelHighRatedFilter(task_run) is expected_result + + +def test_tag_dataset_filter(sample_task_runs): + num_tagged = 0 + num_untagged = 0 + filter = TagFilter("tag1") + for task_run in sample_task_runs: + if "tag1" in task_run.tags: + num_tagged += 1 + assert "tag1" in task_run.tags + assert filter(task_run) is True + else: + num_untagged += 1 + assert "tag1" not in task_run.tags + assert filter(task_run) is False + + assert num_tagged == 6 + assert num_untagged == 4 diff --git a/libs/core/kiln_ai/datamodel/test_datasource.py b/libs/core/kiln_ai/datamodel/test_datasource.py index f10ef140..934a96a4 100644 --- a/libs/core/kiln_ai/datamodel/test_datasource.py +++ b/libs/core/kiln_ai/datamodel/test_datasource.py @@ -18,14 +18,14 @@ def test_valid_synthetic_data_source(): properties={ "model_name": "GPT-4", "model_provider": "OpenAI", - "prompt_builder_name": "completion", + "prompt_id": "simple_prompt_builder", "adapter_name": "langchain", }, ) assert data_source.type == DataSourceType.synthetic assert data_source.properties["model_name"] == "GPT-4" assert data_source.properties["model_provider"] == "OpenAI" - assert data_source.properties["prompt_builder_name"] == "completion" + assert data_source.properties["prompt_id"] == "simple_prompt_builder" assert data_source.properties["adapter_name"] == "langchain" @@ -85,6 +85,7 @@ def test_prompt_type_optional_for_synthetic(): }, ) assert "prompt_builder_name" not in data_source.properties + assert "prompt_id" not in data_source.properties def test_private_data_source_properties_not_serialized(): 
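A short usage sketch of the dataset filter IDs exercised above (illustrative; task_runs is assumed to be a list of TaskRun objects):

from kiln_ai.datamodel.dataset_filters import dataset_filter_from_id

# Built-in IDs ("all", "high_rating", ...) resolve to the static filter functions
high_rating = dataset_filter_from_id("high_rating")
rated_runs = [run for run in task_runs if high_rating(run)]

# "tag::" IDs resolve to a stateful TagFilter instance
tag_filter = dataset_filter_from_id("tag::tag1")
tagged_runs = [run for run in task_runs if tag_filter(run)]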
diff --git a/libs/core/kiln_ai/datamodel/test_eval_model.py b/libs/core/kiln_ai/datamodel/test_eval_model.py new file mode 100644 index 00000000..3c9cb72e --- /dev/null +++ b/libs/core/kiln_ai/datamodel/test_eval_model.py @@ -0,0 +1,635 @@ +import pytest +from pydantic import ValidationError + +from kiln_ai.datamodel import BasePrompt +from kiln_ai.datamodel.basemodel import KilnParentModel +from kiln_ai.datamodel.eval import ( + Eval, + EvalConfig, + EvalConfigType, + EvalOutputScore, + EvalRun, +) +from kiln_ai.datamodel.task import Task +from kiln_ai.datamodel.task_output import ( + TaskOutputRatingType, +) + + +@pytest.fixture +def mock_task(): + return Task(name="Test Task", instruction="Test instruction") + + +@pytest.fixture +def valid_eval_config_data(): + return { + "name": "Test Eval Config", + "config_type": EvalConfigType.g_eval, + "properties": {"eval_steps": ["step1", "step2"]}, + "model_name": "gpt-4", + "model_provider": "openai", + } + + +@pytest.fixture +def valid_eval_config(valid_eval_config_data): + return EvalConfig(**valid_eval_config_data) + + +def test_eval_config_valid(valid_eval_config): + assert valid_eval_config.name == "Test Eval Config" + assert valid_eval_config.config_type == EvalConfigType.g_eval + assert valid_eval_config.properties["eval_steps"] == ["step1", "step2"] + assert valid_eval_config.model_name == "gpt-4" + assert valid_eval_config.model_provider == "openai" + + +def test_eval_config_missing_eval_steps(valid_eval_config): + with pytest.raises( + ValueError, match="eval_steps is required and must be a list for g_eval" + ): + valid_eval_config.properties = {} + + +def test_eval_config_missing_task_description(valid_eval_config): + with pytest.raises( + ValueError, + match="task_description is optional, but if provided must be a string", + ): + valid_eval_config.properties = {"task_description": 123, "eval_steps": []} + + +def test_eval_config_invalid_json(valid_eval_config): + class InvalidClass: + pass + + with pytest.raises(ValueError, match="Properties must be JSON serializable"): + valid_eval_config.properties = { + "eval_steps": [], + "invalid_key": InvalidClass(), + } + + +def test_eval_config_invalid_eval_steps_type(valid_eval_config): + with pytest.raises( + ValueError, match="eval_steps is required and must be a list for g_eval" + ): + valid_eval_config.properties = {"eval_steps": "not a list"} + + +def test_eval_config_invalid_config_type(valid_eval_config): + # Create an invalid config type using string + with pytest.raises(ValueError): + valid_eval_config.config_type = "invalid_type" + + +def test_eval_basic_properties(): + eval = Eval( + name="Test Eval", + description="Test Description", + current_config_id="config123", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="accuracy", + type=TaskOutputRatingType.five_star, + ) + ], + ) + + assert eval.name == "Test Eval" + assert eval.description == "Test Description" + assert eval.current_config_id == "config123" + assert eval.output_scores[0].name == "accuracy" + assert eval.output_scores[0].type == TaskOutputRatingType.five_star + + +def test_eval_default_values(): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="quality", + type=TaskOutputRatingType.pass_fail, + ) + ], + ) + + assert eval.description is None + assert eval.current_config_id is None + + +def test_eval_parent_task_relationship(mock_task, 
valid_eval_config_data): + eval = Eval( + name="Test Eval", + parent=mock_task, + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="score", + type=TaskOutputRatingType.pass_fail, + ) + ], + ) + config = EvalConfig(parent=eval, **valid_eval_config_data) + + assert eval.parent_task() == mock_task + assert eval.parent == mock_task + assert config.parent == eval + assert config.parent_eval() == eval + + +def test_eval_parent_task_none(): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="score", + type=TaskOutputRatingType.pass_fail, + ) + ], + ) + assert eval.parent_task() is None + + +def test_eval_parent_task_wrong_type(): + # Create a non-Task parent + class DummyParent(KilnParentModel, parent_of={}): + pass + + with pytest.raises(ValueError): + Eval(name="Test Eval", parent=DummyParent()) + + +def test_eval_with_persisted_children(mock_task, valid_eval_config_data, tmp_path): + task_path = tmp_path / "task.kiln" + mock_task.path = task_path + mock_task.save_to_file() + + eval = Eval( + name="Test Eval", + parent=mock_task, + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="accuracy", + type=TaskOutputRatingType.pass_fail, + ) + ], + ) + eval.save_to_file() + + # Add config using the parent relationship + config = EvalConfig(parent=eval, **valid_eval_config_data) + config.save_to_file() + + run = EvalRun( + parent=config, + dataset_id="dataset123", + task_run_config_id="config456", + input='{"key": "value"}', + output='{"result": "success"}', + scores={"accuracy": 0.95}, + ) + run.save_to_file() + + # Test configs can be retrieved from disk + evals = mock_task.evals() + assert len(evals) == 1 + assert evals[0].name == "Test Eval" + configs = evals[0].configs() + assert len(configs) == 1 + assert configs[0].model_provider == "openai" + assert configs[0].model_name == "gpt-4" + + # and back up + assert configs[0].parent_eval().parent_task().path == task_path + + # Test runs can be retrieved from disk + runs = configs[0].runs() + assert len(runs) == 1 + assert runs[0].dataset_id == "dataset123" + assert runs[0].task_run_config_id == "config456" + assert runs[0].input == '{"key": "value"}' + assert runs[0].output == '{"result": "success"}' + assert runs[0].scores == {"accuracy": 0.95} + + # and back up + assert runs[0].parent_eval_config().parent_eval().parent_task().path == task_path + + +def test_eval_run_valid_creation(): + """Test creating an EvalRun with valid data""" + eval_run = EvalRun( + dataset_id="dataset123", + task_run_config_id="config456", + input='{"key": "value"}', # JSON formatted input + output='{"result": "success"}', # JSON formatted output + scores={"accuracy": 0.95}, + ) + + assert eval_run.dataset_id == "dataset123" + assert eval_run.task_run_config_id == "config456" + assert eval_run.input == '{"key": "value"}' + assert eval_run.output == '{"result": "success"}' + assert eval_run.scores == {"accuracy": 0.95} + + +def test_eval_run_plaintext(): + """Test creating an EvalRun with plaintext input/output""" + eval_run = EvalRun( + dataset_id="dataset123", + task_run_config_id="config456", + input="What is the capital of France?", + output="The capital of France is Paris.", + scores={"accuracy": 1.0}, + ) + + assert eval_run.input == "What is the capital of France?" + assert eval_run.output == "The capital of France is Paris." 
+ + +def test_eval_run_missing_required_fields(): + """Test that omitting required fields raises ValidationError""" + with pytest.raises(ValidationError) as exc_info: + EvalRun( + dataset_id="dataset123", + # missing task_run_config_id + input="test", + output="test", + scores={"score": 1.0}, + ) + + assert "task_run_config_id" in str(exc_info.value) + + +def test_eval_run_invalid_scores(): + """Test that scores must be a dict of floats""" + with pytest.raises(ValidationError): + EvalRun( + dataset_id="dataset123", + task_run_config_id="config456", + input="test", + output="test", + scores={"score": "not a float"}, # invalid score type + ) + + +def test_eval_missing_output_scores(): + """Test that eval creation fails when output_scores is missing""" + with pytest.raises(ValidationError) as exc_info: + Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + ) + assert "output_scores" in str(exc_info.value) + + +def test_eval_empty_output_scores(): + """Test that eval creation fails when output_scores is empty""" + with pytest.raises( + ValueError, match="output_scores are required, and must have at least one score" + ): + Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[], + ) + + +def test_eval_duplicate_output_scores(): + """Test that eval creation fails when output_scores has duplicate names""" + with pytest.raises( + ValueError, + match="must have unique names", + ): + Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="score", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore(name="SCORE", type=TaskOutputRatingType.pass_fail), + ], + ) + + +def test_eval_invalid_score_type(): + """Test that eval creation fails with invalid rating type in output_scores""" + with pytest.raises( + ValueError, + match="Input should be 'five_star', 'pass_fail', 'pass_fail_critical'", + ): + Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="score", + type="invalid_type", + ) + ], + ) + + +def test_eval_valid_output_scores(): + """Test that eval creation succeeds with valid output_scores""" + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="accuracy", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore( + name="critical_check", + type=TaskOutputRatingType.pass_fail_critical, + ), + EvalOutputScore(name="basic_check", type=TaskOutputRatingType.pass_fail), + ], + ) + assert len(eval.output_scores) == 3 + assert eval.output_scores[0].type == TaskOutputRatingType.five_star + assert eval.output_scores[0].name == "accuracy" + assert eval.output_scores[1].type == TaskOutputRatingType.pass_fail_critical + assert eval.output_scores[1].name == "critical_check" + assert eval.output_scores[2].type == TaskOutputRatingType.pass_fail + assert eval.output_scores[2].name == "basic_check" + + +@pytest.fixture +def valid_eval_run_data(): + return { + "dataset_id": "dataset123", + "task_run_config_id": "config456", + "input": "test input", + "output": "test output", + "scores": {"accuracy": 4.5}, + } + + +def test_eval_run_five_star_score_validation(valid_eval_config, valid_eval_run_data): + # Setup eval with five_star rating + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + 
eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="accuracy", + type=TaskOutputRatingType.five_star, + ) + ], + ) + valid_eval_config.parent = eval + + # Valid score + run = EvalRun(parent=valid_eval_config, **valid_eval_run_data) + assert run.scores["accuracy"] == 4.5 + + # Invalid scores + with pytest.raises(ValueError, match="must be a float between 1.0 and 5.0"): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"accuracy": 0.5}}, + ) + + with pytest.raises(ValueError, match="must be a float between 1.0 and 5.0"): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"accuracy": 5.5}}, + ) + + +def test_eval_run_pass_fail_score_validation(valid_eval_config, valid_eval_run_data): + # Setup eval with pass_fail rating + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="check", + type=TaskOutputRatingType.pass_fail, + ) + ], + ) + valid_eval_config.parent = eval + + # Valid scores + run = EvalRun( + parent=valid_eval_config, **{**valid_eval_run_data, "scores": {"check": 1.0}} + ) + assert run.scores["check"] == 1.0 + + run = EvalRun( + parent=valid_eval_config, **{**valid_eval_run_data, "scores": {"check": 0.0}} + ) + assert run.scores["check"] == 0.0 + + # Invalid scores + with pytest.raises(ValueError, match="must be a float between 0.0 and 1.0"): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"check": -0.1}}, + ) + + with pytest.raises(ValueError, match="must be a float between 0.0 and 1.0"): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"check": 1.1}}, + ) + + +def test_eval_run_pass_fail_critical_score_validation( + valid_eval_config, valid_eval_run_data +): + # Setup eval with pass_fail_critical rating + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="critical", + type=TaskOutputRatingType.pass_fail_critical, + ) + ], + ) + valid_eval_config.parent = eval + + # Valid scores + run = EvalRun( + parent=valid_eval_config, **{**valid_eval_run_data, "scores": {"critical": 1.0}} + ) + assert run.scores["critical"] == 1.0 + + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"critical": -1.0}}, + ) + assert run.scores["critical"] == -1.0 + + # Invalid scores + with pytest.raises(ValueError, match="must be a float between -1.0 and 1.0"): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"critical": -1.1}}, + ) + + with pytest.raises(ValueError, match="must be a float between -1.0 and 1.0"): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"critical": 1.1}}, + ) + + +def test_eval_run_score_keys_must_match(valid_eval_config, valid_eval_run_data): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="accuracy", + type=TaskOutputRatingType.five_star, + ), + EvalOutputScore( + name="critical", + type=TaskOutputRatingType.pass_fail_critical, + ), + ], + ) + valid_eval_config.parent = eval + + # Correct + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "critical": 1.0}}, + ) + + # Correct but wrong order still okay + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, 
"scores": {"critical": 1.0, "accuracy": 4.5}}, + ) + + # Missing score + with pytest.raises( + ValueError, + match="The scores produced by the evaluator must match the scores expected by the eval", + ): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"accuracy": 4.5}}, + ) + + # Extra score + with pytest.raises( + ValueError, + match="The scores produced by the evaluator must match the scores expected by the eval", + ): + run = EvalRun( + parent=valid_eval_config, + **{ + **valid_eval_run_data, + "scores": {"accuracy": 4.5, "critical": 1.0, "extra": 1.0}, + }, + ) + + # Missing score w matching count + with pytest.raises( + ValueError, + match="The scores produced by the evaluator must match the scores expected by the eval", + ): + run = EvalRun( + parent=valid_eval_config, + **{**valid_eval_run_data, "scores": {"accuracy": 4.5, "wrong": 1.0}}, + ) + + +def test_eval_run_custom_scores_not_allowed(valid_eval_config, valid_eval_run_data): + with pytest.raises( + ValueError, match="Custom scores are not supported in evaluators" + ): + eval = Eval( + name="Test Eval", + eval_set_filter_id="tag::tag1", + eval_configs_filter_id="tag::tag2", + output_scores=[ + EvalOutputScore( + name="custom", + type=TaskOutputRatingType.custom, + ) + ], + ) + + +def test_eval_run_eval_config_eval_validation(): + """Test that eval_config_eval and task_run_config_id validation works correctly""" + + # Case 1: Valid configuration - eval_config_eval=True and task_run_config_id=None + valid_run1 = EvalRun( + dataset_id="dataset123", + eval_config_eval=True, + task_run_config_id=None, + input="test input", + output="test output", + scores={"score": 1.0}, + ) + assert valid_run1.eval_config_eval is True + assert valid_run1.task_run_config_id is None + + # Case 2: Valid configuration - eval_config_eval=False and task_run_config_id is set + valid_run2 = EvalRun( + dataset_id="dataset123", + eval_config_eval=False, + task_run_config_id="config456", + input="test input", + output="test output", + scores={"score": 1.0}, + ) + assert valid_run2.eval_config_eval is False + assert valid_run2.task_run_config_id == "config456" + + # Case 3: Invalid configuration - eval_config_eval=True but task_run_config_id is set + with pytest.raises( + ValueError, match="task_run_config_id must be None if eval_config_eval is true" + ): + EvalRun( + dataset_id="dataset123", + eval_config_eval=True, + task_run_config_id="config456", + input="test input", + output="test output", + scores={"score": 1.0}, + ) + + # Case 4: Invalid configuration - eval_config_eval=False but task_run_config_id is None + with pytest.raises( + ValueError, match="task_run_config_id must be set if eval_config_eval is false" + ): + EvalRun( + dataset_id="dataset123", + eval_config_eval=False, + task_run_config_id=None, + input="test input", + output="test output", + scores={"score": 1.0}, + ) diff --git a/libs/core/kiln_ai/datamodel/test_example_models.py b/libs/core/kiln_ai/datamodel/test_example_models.py index 423fa208..a126f5f0 100644 --- a/libs/core/kiln_ai/datamodel/test_example_models.py +++ b/libs/core/kiln_ai/datamodel/test_example_models.py @@ -140,7 +140,7 @@ def test_structured_output_workflow(tmp_path): # Create runs runs = [] - for source in DataSourceType: + for source in [DataSourceType.human, DataSourceType.synthetic]: for _ in range(2): task_run = TaskRun( input="Generate info for John Doe", @@ -155,7 +155,7 @@ def test_structured_output_workflow(tmp_path): "adapter_name": "TestAdapter", "model_name": "GPT-4", 
"model_provider": "OpenAI", - "prompt_builder_name": "TestPromptBuilder", + "prompt_id": "simple_prompt_builder", }, ), parent=task, @@ -214,9 +214,9 @@ def test_structured_output_workflow(tmp_path): assert loaded_task.name == "Structured Output Task" assert len(loaded_task.requirements) == 2 - assert len(loaded_task.runs()) == 5 - loaded_runs = loaded_task.runs() + assert len(loaded_runs) == 5 + for task_run in loaded_runs: output = task_run.output assert output.rating is not None @@ -470,7 +470,7 @@ def test_valid_synthetic_task_output(): "adapter_name": "TestAdapter", "model_name": "GPT-4", "model_provider": "OpenAI", - "prompt_builder_name": "TestPromptBuilder", + "prompt_id": "simple_prompt_builder", }, ), ) @@ -478,7 +478,7 @@ def test_valid_synthetic_task_output(): assert output.source.properties["adapter_name"] == "TestAdapter" assert output.source.properties["model_name"] == "GPT-4" assert output.source.properties["model_provider"] == "OpenAI" - assert output.source.properties["prompt_builder_name"] == "TestPromptBuilder" + assert output.source.properties["prompt_id"] == "simple_prompt_builder" def test_invalid_synthetic_task_output_missing_keys(): @@ -507,23 +507,21 @@ def test_invalid_synthetic_task_output_empty_values(): "adapter_name": "TestAdapter", "model_name": "", "model_provider": "OpenAI", - "prompt_builder_name": "TestPromptBuilder", + "prompt_id": "simple_prompt_builder", }, ), ) def test_invalid_synthetic_task_output_non_string_values(): - with pytest.raises( - ValidationError, match="'prompt_builder_name' must be of type str" - ): + with pytest.raises(ValidationError, match="'prompt_id' must be of type str"): DataSource( type=DataSourceType.synthetic, properties={ "adapter_name": "TestAdapter", "model_name": "GPT-4", "model_provider": "OpenAI", - "prompt_builder_name": 123, + "prompt_id": 123, }, ) diff --git a/libs/core/kiln_ai/datamodel/test_json_schema.py b/libs/core/kiln_ai/datamodel/test_json_schema.py index 1f574aa7..f2300078 100644 --- a/libs/core/kiln_ai/datamodel/test_json_schema.py +++ b/libs/core/kiln_ai/datamodel/test_json_schema.py @@ -4,6 +4,7 @@ from kiln_ai.datamodel.json_schema import ( JsonObjectSchema, schema_from_json_str, + string_to_json_key, validate_schema, ) @@ -123,3 +124,25 @@ def test_triangle_schema(): validate_schema({"a": 1, "b": 2, "c": 3}, json_triangle_schema) with pytest.raises(Exception): validate_schema({"a": 1, "b": 2, "c": "3"}, json_triangle_schema) + + +@pytest.mark.parametrize( + "input_str,expected", + [ + ("hello world", "hello_world"), + ("Hello World", "hello_world"), + ("hello_world", "hello_world"), + ("HELLO WORLD", "hello_world"), + ("hello123", "hello123"), + ("hello-world", "helloworld"), + ("hello!@#$%^&*()world", "helloworld"), + (" hello world ", "hello__world"), + ("hello__world", "hello__world"), + ("", ""), + ("!@#$%", ""), + ("snake_case_string", "snake_case_string"), + ("camelCaseString", "camelcasestring"), + ], +) +def test_string_to_json_key(input_str: str, expected: str): + assert string_to_json_key(input_str) == expected diff --git a/libs/core/kiln_ai/datamodel/test_prompt_id.py b/libs/core/kiln_ai/datamodel/test_prompt_id.py new file mode 100644 index 00000000..cf5d2326 --- /dev/null +++ b/libs/core/kiln_ai/datamodel/test_prompt_id.py @@ -0,0 +1,129 @@ +import pytest +from pydantic import BaseModel, ValidationError + +from kiln_ai.datamodel import ( + PromptGenerators, + PromptId, +) +from kiln_ai.datamodel.prompt_id import is_frozen_prompt + + +# Test model to validate the PromptId type +class 
ModelTester(BaseModel): + prompt_id: PromptId + + +def test_valid_prompt_generator_names(): + """Test that valid prompt generator names are accepted""" + for generator in PromptGenerators: + model = ModelTester(prompt_id=generator.value) + assert model.prompt_id == generator.value + + +def test_valid_saved_prompt_id(): + """Test that valid saved prompt IDs are accepted""" + valid_id = "id::prompt_789" + model = ModelTester(prompt_id=valid_id) + assert model.prompt_id == valid_id + + +def test_valid_fine_tune_prompt_id(): + """Test that valid fine-tune prompt IDs are accepted""" + valid_id = "fine_tune_prompt::ft_123456" + model = ModelTester(prompt_id=valid_id) + assert model.prompt_id == valid_id + + +@pytest.mark.parametrize( + "invalid_id", + [ + pytest.param("id::project_123::task_456", id="missing_prompt_id"), + pytest.param("id::task_456::prompt_789", id="too_many_parts"), + pytest.param("id::", id="empty_parts"), + ], +) +def test_invalid_saved_prompt_id_format(invalid_id): + """Test that invalid saved prompt ID formats are rejected""" + with pytest.raises(ValidationError, match="Invalid saved prompt ID"): + ModelTester(prompt_id=invalid_id) + + +@pytest.mark.parametrize( + "invalid_id,expected_error", + [ + ("fine_tune_prompt::", "Invalid fine-tune prompt ID: fine_tune_prompt::"), + ("fine_tune_prompt", "Invalid prompt ID: fine_tune_prompt"), + ], +) +def test_invalid_fine_tune_prompt_id_format(invalid_id, expected_error): + """Test that invalid fine-tune prompt ID formats are rejected""" + with pytest.raises(ValidationError, match=expected_error): + ModelTester(prompt_id=invalid_id) + + +def test_completely_invalid_formats(): + """Test that completely invalid formats are rejected""" + invalid_ids = [ + "", # Empty string + "invalid_format", # Random string + "id:wrong_format", # Almost correct but wrong separator + "fine_tune:wrong_format", # Almost correct but wrong prefix + ":::", # Just separators + ] + + for invalid_id in invalid_ids: + with pytest.raises(ValidationError, match="Invalid prompt ID"): + ModelTester(prompt_id=invalid_id) + + +def test_prompt_generator_case_sensitivity(): + """Test that prompt generator names are case sensitive""" + # Take first generator and modify its case + first_generator = next(iter(PromptGenerators)).value + wrong_case = first_generator.upper() + if wrong_case == first_generator: + wrong_case = first_generator.lower() + + with pytest.raises(ValidationError): + ModelTester(prompt_id=wrong_case) + + +@pytest.mark.parametrize( + "valid_id", + [ + "task_run_config::project_123::task_456::config_123", # Valid task run config prompt ID + ], +) +def test_valid_task_run_config_prompt_id(valid_id): + """Test that valid eval prompt IDs are accepted""" + model = ModelTester(prompt_id=valid_id) + assert model.prompt_id == valid_id + + +@pytest.mark.parametrize( + "invalid_id,expected_error", + [ + ("task_run_config::", "Invalid task run config prompt ID"), + ("task_run_config::p1", "Invalid task run config prompt ID"), + ("task_run_config::p1::t1", "Invalid task run config prompt ID"), + ("task_run_config::p1::t1::c1::extra", "Invalid task run config prompt ID"), + ], +) +def test_invalid_eval_prompt_id_format(invalid_id, expected_error): + """Test that invalid eval prompt ID formats are rejected""" + with pytest.raises(ValidationError, match=expected_error): + ModelTester(prompt_id=invalid_id) + + +@pytest.mark.parametrize( + "id,should_be_frozen", + [ + ("simple_prompt_builder", False), + ("id::prompt_123", True), + ("task_run_config::p1::t1", True), 
+ ("fine_tune_prompt::ft_123", True), + ], +) +def test_is_frozen_prompt(id, should_be_frozen): + """Test that the is_frozen_prompt function works""" + assert is_frozen_prompt(id) == should_be_frozen diff --git a/libs/core/kiln_ai/datamodel/test_task.py b/libs/core/kiln_ai/datamodel/test_task.py new file mode 100644 index 00000000..cf109a5c --- /dev/null +++ b/libs/core/kiln_ai/datamodel/test_task.py @@ -0,0 +1,159 @@ +import pytest +from pydantic import ValidationError + +from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType +from kiln_ai.datamodel.prompt_id import PromptGenerators +from kiln_ai.datamodel.task import RunConfig, RunConfigProperties, Task, TaskRunConfig +from kiln_ai.datamodel.task_output import normalize_rating + + +def test_runconfig_valid_creation(): + task = Task(id="task1", name="Test Task", instruction="Do something") + + config = RunConfig( + task=task, + model_name="gpt-4", + model_provider_name="openai", + prompt_id=PromptGenerators.SIMPLE, + ) + + assert config.task == task + assert config.model_name == "gpt-4" + assert config.model_provider_name == "openai" + assert config.prompt_id == PromptGenerators.SIMPLE # Check default value + + +def test_runconfig_missing_required_fields(): + with pytest.raises(ValidationError) as exc_info: + RunConfig() + + errors = exc_info.value.errors() + assert ( + len(errors) == 4 + ) # task, model_name, model_provider_name, and prompt_id are required + assert any(error["loc"][0] == "task" for error in errors) + assert any(error["loc"][0] == "model_name" for error in errors) + assert any(error["loc"][0] == "model_provider_name" for error in errors) + assert any(error["loc"][0] == "prompt_id" for error in errors) + + +def test_runconfig_custom_prompt_id(): + task = Task(id="task1", name="Test Task", instruction="Do something") + + config = RunConfig( + task=task, + model_name="gpt-4", + model_provider_name="openai", + prompt_id=PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT, + ) + + assert config.prompt_id == PromptGenerators.SIMPLE_CHAIN_OF_THOUGHT + + +@pytest.fixture +def sample_task(): + return Task(name="Test Task", instruction="Test instruction") + + +@pytest.fixture +def sample_run_config_props(sample_task): + return RunConfigProperties( + model_name="gpt-4", + model_provider_name="openai", + prompt_id=PromptGenerators.SIMPLE, + ) + + +def test_task_run_config_valid_creation(sample_task, sample_run_config_props): + config = TaskRunConfig( + name="Test Config", + description="Test description", + run_config_properties=sample_run_config_props, + parent=sample_task, + ) + + assert config.name == "Test Config" + assert config.description == "Test description" + assert config.run_config_properties == sample_run_config_props + assert config.parent_task() == sample_task + + +def test_task_run_config_minimal_creation(sample_task, sample_run_config_props): + # Test creation with only required fields + config = TaskRunConfig( + name="Test Config", + run_config_properties=sample_run_config_props, + parent=sample_task, + ) + + assert config.name == "Test Config" + assert config.description is None + assert config.run_config_properties == sample_run_config_props + + +def test_task_run_config_missing_required_fields(sample_task): + # Test missing name + with pytest.raises(ValidationError) as exc_info: + TaskRunConfig( + run_config_properties=RunConfigProperties( + task=sample_task, model_name="gpt-4", model_provider_name="openai" + ), + parent=sample_task, + ) + assert "Field required" in str(exc_info.value) + + # Test missing 
run_config + with pytest.raises(ValidationError) as exc_info: + TaskRunConfig(name="Test Config", parent=sample_task) + assert "Field required" in str(exc_info.value) + + +def test_task_run_config_missing_task_in_run_config(sample_task): + with pytest.raises( + ValidationError, match="Input should be a valid dictionary or instance of Task" + ): + # Create a run config without a task + RunConfig( + model_name="gpt-4", + model_provider_name="openai", + task=None, # type: ignore + ) + + +@pytest.mark.parametrize( + "rating_type,rating,expected", + [ + (TaskOutputRatingType.five_star, 1, 0), + (TaskOutputRatingType.five_star, 2, 0.25), + (TaskOutputRatingType.five_star, 3, 0.5), + (TaskOutputRatingType.five_star, 4, 0.75), + (TaskOutputRatingType.five_star, 5, 1), + (TaskOutputRatingType.pass_fail, 0, 0), + (TaskOutputRatingType.pass_fail, 1, 1), + (TaskOutputRatingType.pass_fail, 0.5, 0.5), + (TaskOutputRatingType.pass_fail_critical, -1, 0), + (TaskOutputRatingType.pass_fail_critical, 0, 0.5), + (TaskOutputRatingType.pass_fail_critical, 1, 1), + (TaskOutputRatingType.pass_fail_critical, 0.5, 0.75), + ], +) +def test_normalize_rating(rating_type, rating, expected): + assert normalize_rating(rating, rating_type) == expected + + +@pytest.mark.parametrize( + "rating_type,rating", + [ + (TaskOutputRatingType.five_star, 0), + (TaskOutputRatingType.five_star, 6), + (TaskOutputRatingType.pass_fail, -0.5), + (TaskOutputRatingType.pass_fail, 1.5), + (TaskOutputRatingType.pass_fail_critical, -1.5), + (TaskOutputRatingType.pass_fail_critical, 1.5), + (TaskOutputRatingType.custom, 0), + (TaskOutputRatingType.custom, 99), + ], +) +def test_normalize_rating_errors(rating_type, rating): + with pytest.raises(ValueError): + normalize_rating(rating, rating_type) diff --git a/libs/server/kiln_server/prompt_api.py b/libs/server/kiln_server/prompt_api.py index a032ef6c..515c3697 100644 --- a/libs/server/kiln_server/prompt_api.py +++ b/libs/server/kiln_server/prompt_api.py @@ -1,19 +1,28 @@ +from datetime import datetime + from fastapi import FastAPI -from kiln_ai.datamodel import Prompt +from kiln_ai.datamodel import BasePrompt, Prompt, PromptId from pydantic import BaseModel from kiln_server.task_api import task_from_id +# This is a wrapper around the Prompt datamodel that adds an id field which represents the PromptID and not the data model ID. 
+class ApiPrompt(BasePrompt): + id: PromptId + created_at: datetime | None = None + created_by: str | None = None + + class PromptCreateRequest(BaseModel): name: str + description: str | None = None prompt: str chain_of_thought_instructions: str | None = None class PromptGenerator(BaseModel): id: str - ui_id: str short_description: str description: str name: str @@ -22,7 +31,7 @@ class PromptGenerator(BaseModel): class PromptResponse(BaseModel): generators: list[PromptGenerator] - prompts: list[Prompt] + prompts: list[ApiPrompt] def connect_prompt_api(app: FastAPI): @@ -34,6 +43,7 @@ async def create_prompt( prompt = Prompt( parent=parent_task, name=prompt_data.name, + description=prompt_data.description, prompt=prompt_data.prompt, chain_of_thought_instructions=prompt_data.chain_of_thought_instructions, ) @@ -44,64 +54,75 @@ async def create_prompt( async def get_prompts(project_id: str, task_id: str) -> PromptResponse: parent_task = task_from_id(project_id, task_id) + prompts: list[ApiPrompt] = [] + for prompt in parent_task.prompts(): + properties = prompt.model_dump(exclude={"id"}) + prompts.append(ApiPrompt(id=f"id::{prompt.id}", **properties)) + + # Add any task run config prompts to the list + task_run_configs = parent_task.run_configs() + for task_run_config in task_run_configs: + if task_run_config.prompt: + properties = task_run_config.prompt.model_dump(exclude={"id"}) + prompts.append( + ApiPrompt( + id=f"task_run_config::{project_id}::{task_id}::{task_run_config.id}", + **properties, + ) + ) + return PromptResponse( generators=_prompt_generators, - prompts=parent_task.prompts(), + prompts=prompts, ) +# User friendly descriptions of the prompt generators _prompt_generators = [ PromptGenerator( - id="basic", - ui_id="simple_prompt_builder", + id="simple_prompt_builder", name="Basic (Zero Shot)", short_description="Includes the instructions and requirements from your task definition.", description="A basic prompt generator. It will include the instructions and requirements from your task definition. It won't include any examples from your runs (zero-shot).", chain_of_thought=False, ), PromptGenerator( - id="few_shot", - ui_id="few_shot_prompt_builder", + id="few_shot_prompt_builder", name="Few-Shot", short_description="Includes up to 4 examples from your dataset.", description="A multi-shot prompt generator that includes up to 4 examples from your dataset (few-shot). It also includes the instructions and requirements from your task definition.", chain_of_thought=False, ), PromptGenerator( - id="many_shot", - ui_id="multi_shot_prompt_builder", + id="multi_shot_prompt_builder", name="Many-Shot", short_description="Includes up to 25 examples from your dataset.", description="A multi-shot prompt generator that includes up to 25 examples from your dataset (many-shot). It also includes the instructions and requirements from your task definition.", chain_of_thought=False, ), PromptGenerator( - id="repairs", - ui_id="repairs_prompt_builder", + id="repairs_prompt_builder", name="Repair Multi-Shot", short_description="Includes examples from your dataset, including human feedback about mistakes and how to correct them.", description="A multi-shot prompt that will include up to 25 examples from your dataset. This prompt will use repaired examples to show 1) the generated content which had issues, 2) the human feedback about what was incorrect, 3) the corrected and approved content. This gives the LLM examples of common errors to avoid. 
It also includes the instructions and requirements from your task definition.", chain_of_thought=False, ), PromptGenerator( - id="simple_chain_of_thought", - ui_id="simple_chain_of_thought_prompt_builder", + id="simple_chain_of_thought_prompt_builder", name="Chain of Thought", short_description="Gives the LLM time to 'think' before replying.", description="A chain of thought prompt generator that gives the LLM time to 'think' before replying. It will use the thinking_instruction from your task definition if it exists, or a standard 'step by step' instruction. The result will only include the final answer, not the 'thinking' tokens. The 'thinking' tokens will be available in the data model. It also includes the instructions and requirements from your task definition.", chain_of_thought=True, ), PromptGenerator( - id="few_shot_chain_of_thought", - ui_id="few_shot_chain_of_thought_prompt_builder", + id="few_shot_chain_of_thought_prompt_builder", name="Chain of Thought - Few Shot", short_description="Combines our 'Chain of Thought' generator with our 'Few-Shot' generator.", description="Combines our 'Chain of Thought' generator with our 'Few-Shot' generator, for both the thinking and the few shot examples.", chain_of_thought=True, ), PromptGenerator( - id="multi_shot_chain_of_thought", - ui_id="multi_shot_chain_of_thought_prompt_builder", + id="multi_shot_chain_of_thought_prompt_builder", name="Chain of Thought - Many Shot", short_description="Combines our 'Chain of Thought' generator with our 'Many-Shot' generator.", description="Combines our 'Chain of Thought' generator with our 'Many-Shot' generator, for both the thinking and the many shot examples.", diff --git a/libs/server/kiln_server/run_api.py b/libs/server/kiln_server/run_api.py index bd43c157..23250815 100644 --- a/libs/server/kiln_server/run_api.py +++ b/libs/server/kiln_server/run_api.py @@ -5,8 +5,14 @@ from fastapi import FastAPI, HTTPException from kiln_ai.adapters.adapter_registry import adapter_for_task from kiln_ai.adapters.ml_model_list import ModelProviderName -from kiln_ai.adapters.prompt_builders import prompt_builder_from_ui_name -from kiln_ai.datamodel import Task, TaskOutputRating, TaskOutputRatingType, TaskRun +from kiln_ai.adapters.model_adapters.base_adapter import AdapterConfig +from kiln_ai.datamodel import ( + PromptId, + Task, + TaskOutputRating, + TaskOutputRatingType, + TaskRun, +) from kiln_ai.datamodel.basemodel import ID_TYPE from pydantic import BaseModel, ConfigDict @@ -38,7 +44,7 @@ class RunTaskRequest(BaseModel): provider: str plaintext_input: str | None = None structured_input: Dict[str, Any] | None = None - ui_prompt_method: str | None = None + ui_prompt_method: PromptId | None = None tags: list[str] | None = None # Allows use of the model_name field (usually pydantic will reserve model_*) @@ -188,21 +194,12 @@ async def run_task( ) -> TaskRun: task = task_from_id(project_id, task_id) - prompt_builder = prompt_builder_from_ui_name( - request.ui_prompt_method or "basic", - task, - ) - if prompt_builder is None: - raise HTTPException( - status_code=400, - detail=f"Unknown prompt method: {request.ui_prompt_method}", - ) adapter = adapter_for_task( task, model_name=request.model_name, provider=model_provider_from_string(request.provider), - prompt_builder=prompt_builder, - tags=request.tags, + prompt_id=request.ui_prompt_method or "simple_prompt_builder", + base_adapter_config=AdapterConfig(default_tags=request.tags), ) input = request.plaintext_input diff --git 
a/libs/server/kiln_server/test_prompt_api.py b/libs/server/kiln_server/test_prompt_api.py index 68f62497..e3375e44 100644 --- a/libs/server/kiln_server/test_prompt_api.py +++ b/libs/server/kiln_server/test_prompt_api.py @@ -3,8 +3,7 @@ import pytest from fastapi import FastAPI from fastapi.testclient import TestClient -from kiln_ai.adapters.prompt_builders import prompt_builder_registry -from kiln_ai.datamodel import Project, Prompt, Task +from kiln_ai.datamodel import Project, Prompt, PromptGenerators, Task from kiln_server.custom_errors import connect_custom_errors from kiln_server.prompt_api import _prompt_generators, connect_prompt_api @@ -47,6 +46,7 @@ def test_create_prompt_success(client, project_and_task): prompt_data = { "name": "Test Prompt", "prompt": "This is a test prompt", + "description": "This is a test prompt description", "chain_of_thought_instructions": "Think step by step, explaining your reasoning.", } @@ -59,6 +59,7 @@ def test_create_prompt_success(client, project_and_task): assert response.status_code == 200 res = response.json() assert res["name"] == "Test Prompt" + assert res["description"] == "This is a test prompt description" assert res["prompt"] == "This is a test prompt" # Check that the prompt was saved to the task/file @@ -116,18 +117,22 @@ def test_prompt_generators_content(): from kiln_server.prompt_api import _prompt_generators # Test a few key generators - basic = next(g for g in _prompt_generators if g.id == "basic") + basic = next(g for g in _prompt_generators if g.id == "simple_prompt_builder") assert basic.chain_of_thought is False assert "zero-shot" in basic.description.lower() - cot = next(g for g in _prompt_generators if g.id == "simple_chain_of_thought") + cot = next( + g + for g in _prompt_generators + if g.id == "simple_chain_of_thought_prompt_builder" + ) assert cot.chain_of_thought is True assert "Chain of Thought" in cot.name -# If we fix the TODO about maintaining these in 2 places we can remove this test, but this ensures we don't mess it up until then -def test_all_ui_ids_are_covered(): - generator_keys = prompt_builder_registry.keys() - api_list = [g.ui_id for g in _prompt_generators] +# Check our nice UI list with descriptions covers all our generators +def test_all_ids_are_covered(): + generators = [e.value for e in PromptGenerators] + api_list = [g.id for g in _prompt_generators] - assert set(api_list) == set(generator_keys) + assert set(api_list) == set(generators) diff --git a/libs/server/kiln_server/test_run_api.py b/libs/server/kiln_server/test_run_api.py index 477b288e..e64ee3c4 100644 --- a/libs/server/kiln_server/test_run_api.py +++ b/libs/server/kiln_server/test_run_api.py @@ -84,7 +84,7 @@ def task_run_setup(tmp_path): "model_name": "gpt_4o", "model_provider": "ollama", "adapter_name": "kiln_langchain_adapter", - "prompt_builder_name": "simple_prompt_builder", + "prompt_id": "simple_prompt_builder", }, ), ), diff --git a/uv.lock b/uv.lock index 77f10d0e..6718115d 100644 --- a/uv.lock +++ b/uv.lock @@ -902,6 +902,7 @@ dependencies = [ { name = "pillow" }, { name = "pyinstaller" }, { name = "pystray" }, + { name = "scipy" }, ] [package.metadata] @@ -910,6 +911,7 @@ requires-dist = [ { name = "pillow", specifier = ">=11.0.0" }, { name = "pyinstaller", specifier = "==6.11.1" }, { name = "pystray", specifier = ">=0.19.5" }, + { name = "scipy", specifier = ">=1.15.2" }, ] [[package]] @@ -1985,6 +1987,62 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/e5/c0/b0fba8259b61c938c9733da9346b9f93e00881a9db22aafdd72f6ae0ec05/s3transfer-0.10.3-py3-none-any.whl", hash = "sha256:263ed587a5803c6c708d3ce44dc4dfedaab4c1a32e8329bab818933d79ddcf5d", size = 82625 }, ] +[[package]] +name = "scipy" +version = "1.15.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/b9/31ba9cd990e626574baf93fbc1ac61cf9ed54faafd04c479117517661637/scipy-1.15.2.tar.gz", hash = "sha256:cd58a314d92838f7e6f755c8a2167ead4f27e1fd5c1251fd54289569ef3495ec", size = 59417316 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/95/df/ef233fff6838fe6f7840d69b5ef9f20d2b5c912a8727b21ebf876cb15d54/scipy-1.15.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a2ec871edaa863e8213ea5df811cd600734f6400b4af272e1c011e69401218e9", size = 38692502 }, + { url = "https://files.pythonhosted.org/packages/5c/20/acdd4efb8a68b842968f7bc5611b1aeb819794508771ad104de418701422/scipy-1.15.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:6f223753c6ea76983af380787611ae1291e3ceb23917393079dcc746ba60cfb5", size = 30085508 }, + { url = "https://files.pythonhosted.org/packages/42/55/39cf96ca7126f1e78ee72a6344ebdc6702fc47d037319ad93221063e6cf4/scipy-1.15.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ecf797d2d798cf7c838c6d98321061eb3e72a74710e6c40540f0e8087e3b499e", size = 22359166 }, + { url = "https://files.pythonhosted.org/packages/51/48/708d26a4ab8a1441536bf2dfcad1df0ca14a69f010fba3ccbdfc02df7185/scipy-1.15.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:9b18aa747da280664642997e65aab1dd19d0c3d17068a04b3fe34e2559196cb9", size = 25112047 }, + { url = "https://files.pythonhosted.org/packages/dd/65/f9c5755b995ad892020381b8ae11f16d18616208e388621dfacc11df6de6/scipy-1.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:87994da02e73549dfecaed9e09a4f9d58a045a053865679aeb8d6d43747d4df3", size = 35536214 }, + { url = "https://files.pythonhosted.org/packages/de/3c/c96d904b9892beec978562f64d8cc43f9cca0842e65bd3cd1b7f7389b0ba/scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69ea6e56d00977f355c0f84eba69877b6df084516c602d93a33812aa04d90a3d", size = 37646981 }, + { url = "https://files.pythonhosted.org/packages/3d/74/c2d8a24d18acdeae69ed02e132b9bc1bb67b7bee90feee1afe05a68f9d67/scipy-1.15.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:888307125ea0c4466287191e5606a2c910963405ce9671448ff9c81c53f85f58", size = 37230048 }, + { url = "https://files.pythonhosted.org/packages/42/19/0aa4ce80eca82d487987eff0bc754f014dec10d20de2f66754fa4ea70204/scipy-1.15.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9412f5e408b397ff5641080ed1e798623dbe1ec0d78e72c9eca8992976fa65aa", size = 40010322 }, + { url = "https://files.pythonhosted.org/packages/d0/d2/f0683b7e992be44d1475cc144d1f1eeae63c73a14f862974b4db64af635e/scipy-1.15.2-cp310-cp310-win_amd64.whl", hash = "sha256:b5e025e903b4f166ea03b109bb241355b9c42c279ea694d8864d033727205e65", size = 41233385 }, + { url = "https://files.pythonhosted.org/packages/40/1f/bf0a5f338bda7c35c08b4ed0df797e7bafe8a78a97275e9f439aceb46193/scipy-1.15.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:92233b2df6938147be6fa8824b8136f29a18f016ecde986666be5f4d686a91a4", size = 38703651 }, + { url = "https://files.pythonhosted.org/packages/de/54/db126aad3874601048c2c20ae3d8a433dbfd7ba8381551e6f62606d9bd8e/scipy-1.15.2-cp311-cp311-macosx_12_0_arm64.whl", hash 
= "sha256:62ca1ff3eb513e09ed17a5736929429189adf16d2d740f44e53270cc800ecff1", size = 30102038 }, + { url = "https://files.pythonhosted.org/packages/61/d8/84da3fffefb6c7d5a16968fe5b9f24c98606b165bb801bb0b8bc3985200f/scipy-1.15.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4c6676490ad76d1c2894d77f976144b41bd1a4052107902238047fb6a473e971", size = 22375518 }, + { url = "https://files.pythonhosted.org/packages/44/78/25535a6e63d3b9c4c90147371aedb5d04c72f3aee3a34451f2dc27c0c07f/scipy-1.15.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:a8bf5cb4a25046ac61d38f8d3c3426ec11ebc350246a4642f2f315fe95bda655", size = 25142523 }, + { url = "https://files.pythonhosted.org/packages/e0/22/4b4a26fe1cd9ed0bc2b2cb87b17d57e32ab72c346949eaf9288001f8aa8e/scipy-1.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a8e34cf4c188b6dd004654f88586d78f95639e48a25dfae9c5e34a6dc34547e", size = 35491547 }, + { url = "https://files.pythonhosted.org/packages/32/ea/564bacc26b676c06a00266a3f25fdfe91a9d9a2532ccea7ce6dd394541bc/scipy-1.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28a0d2c2075946346e4408b211240764759e0fabaeb08d871639b5f3b1aca8a0", size = 37634077 }, + { url = "https://files.pythonhosted.org/packages/43/c2/bfd4e60668897a303b0ffb7191e965a5da4056f0d98acfb6ba529678f0fb/scipy-1.15.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:42dabaaa798e987c425ed76062794e93a243be8f0f20fff6e7a89f4d61cb3d40", size = 37231657 }, + { url = "https://files.pythonhosted.org/packages/4a/75/5f13050bf4f84c931bcab4f4e83c212a36876c3c2244475db34e4b5fe1a6/scipy-1.15.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6f5e296ec63c5da6ba6fa0343ea73fd51b8b3e1a300b0a8cae3ed4b1122c7462", size = 40035857 }, + { url = "https://files.pythonhosted.org/packages/b9/8b/7ec1832b09dbc88f3db411f8cdd47db04505c4b72c99b11c920a8f0479c3/scipy-1.15.2-cp311-cp311-win_amd64.whl", hash = "sha256:597a0c7008b21c035831c39927406c6181bcf8f60a73f36219b69d010aa04737", size = 41217654 }, + { url = "https://files.pythonhosted.org/packages/4b/5d/3c78815cbab499610f26b5bae6aed33e227225a9fa5290008a733a64f6fc/scipy-1.15.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c4697a10da8f8765bb7c83e24a470da5797e37041edfd77fd95ba3811a47c4fd", size = 38756184 }, + { url = "https://files.pythonhosted.org/packages/37/20/3d04eb066b471b6e171827548b9ddb3c21c6bbea72a4d84fc5989933910b/scipy-1.15.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:869269b767d5ee7ea6991ed7e22b3ca1f22de73ab9a49c44bad338b725603301", size = 30163558 }, + { url = "https://files.pythonhosted.org/packages/a4/98/e5c964526c929ef1f795d4c343b2ff98634ad2051bd2bbadfef9e772e413/scipy-1.15.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:bad78d580270a4d32470563ea86c6590b465cb98f83d760ff5b0990cb5518a93", size = 22437211 }, + { url = "https://files.pythonhosted.org/packages/1d/cd/1dc7371e29195ecbf5222f9afeedb210e0a75057d8afbd942aa6cf8c8eca/scipy-1.15.2-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b09ae80010f52efddb15551025f9016c910296cf70adbf03ce2a8704f3a5ad20", size = 25232260 }, + { url = "https://files.pythonhosted.org/packages/f0/24/1a181a9e5050090e0b5138c5f496fee33293c342b788d02586bc410c6477/scipy-1.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a6fd6eac1ce74a9f77a7fc724080d507c5812d61e72bd5e4c489b042455865e", size = 35198095 }, + { url = 
"https://files.pythonhosted.org/packages/c0/53/eaada1a414c026673eb983f8b4a55fe5eb172725d33d62c1b21f63ff6ca4/scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b871df1fe1a3ba85d90e22742b93584f8d2b8e6124f8372ab15c71b73e428b8", size = 37297371 }, + { url = "https://files.pythonhosted.org/packages/e9/06/0449b744892ed22b7e7b9a1994a866e64895363572677a316a9042af1fe5/scipy-1.15.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:03205d57a28e18dfd39f0377d5002725bf1f19a46f444108c29bdb246b6c8a11", size = 36872390 }, + { url = "https://files.pythonhosted.org/packages/6a/6f/a8ac3cfd9505ec695c1bc35edc034d13afbd2fc1882a7c6b473e280397bb/scipy-1.15.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:601881dfb761311045b03114c5fe718a12634e5608c3b403737ae463c9885d53", size = 39700276 }, + { url = "https://files.pythonhosted.org/packages/f5/6f/e6e5aff77ea2a48dd96808bb51d7450875af154ee7cbe72188afb0b37929/scipy-1.15.2-cp312-cp312-win_amd64.whl", hash = "sha256:e7c68b6a43259ba0aab737237876e5c2c549a031ddb7abc28c7b47f22e202ded", size = 40942317 }, + { url = "https://files.pythonhosted.org/packages/53/40/09319f6e0f276ea2754196185f95cd191cb852288440ce035d5c3a931ea2/scipy-1.15.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:01edfac9f0798ad6b46d9c4c9ca0e0ad23dbf0b1eb70e96adb9fa7f525eff0bf", size = 38717587 }, + { url = "https://files.pythonhosted.org/packages/fe/c3/2854f40ecd19585d65afaef601e5e1f8dbf6758b2f95b5ea93d38655a2c6/scipy-1.15.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:08b57a9336b8e79b305a143c3655cc5bdbe6d5ece3378578888d2afbb51c4e37", size = 30100266 }, + { url = "https://files.pythonhosted.org/packages/dd/b1/f9fe6e3c828cb5930b5fe74cb479de5f3d66d682fa8adb77249acaf545b8/scipy-1.15.2-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:54c462098484e7466362a9f1672d20888f724911a74c22ae35b61f9c5919183d", size = 22373768 }, + { url = "https://files.pythonhosted.org/packages/15/9d/a60db8c795700414c3f681908a2b911e031e024d93214f2d23c6dae174ab/scipy-1.15.2-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:cf72ff559a53a6a6d77bd8eefd12a17995ffa44ad86c77a5df96f533d4e6c6bb", size = 25154719 }, + { url = "https://files.pythonhosted.org/packages/37/3b/9bda92a85cd93f19f9ed90ade84aa1e51657e29988317fabdd44544f1dd4/scipy-1.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9de9d1416b3d9e7df9923ab23cd2fe714244af10b763975bea9e4f2e81cebd27", size = 35163195 }, + { url = "https://files.pythonhosted.org/packages/03/5a/fc34bf1aa14dc7c0e701691fa8685f3faec80e57d816615e3625f28feb43/scipy-1.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fb530e4794fc8ea76a4a21ccb67dea33e5e0e60f07fc38a49e821e1eae3b71a0", size = 37255404 }, + { url = "https://files.pythonhosted.org/packages/4a/71/472eac45440cee134c8a180dbe4c01b3ec247e0338b7c759e6cd71f199a7/scipy-1.15.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5ea7ed46d437fc52350b028b1d44e002646e28f3e8ddc714011aaf87330f2f32", size = 36860011 }, + { url = "https://files.pythonhosted.org/packages/01/b3/21f890f4f42daf20e4d3aaa18182dddb9192771cd47445aaae2e318f6738/scipy-1.15.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:11e7ad32cf184b74380f43d3c0a706f49358b904fa7d5345f16ddf993609184d", size = 39657406 }, + { url = "https://files.pythonhosted.org/packages/0d/76/77cf2ac1f2a9cc00c073d49e1e16244e389dd88e2490c91d84e1e3e4d126/scipy-1.15.2-cp313-cp313-win_amd64.whl", hash = "sha256:a5080a79dfb9b78b768cebf3c9dcbc7b665c5875793569f48bf0e2b1d7f68f6f", size = 40961243 }, + 
{ url = "https://files.pythonhosted.org/packages/4c/4b/a57f8ddcf48e129e6054fa9899a2a86d1fc6b07a0e15c7eebff7ca94533f/scipy-1.15.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:447ce30cee6a9d5d1379087c9e474628dab3db4a67484be1b7dc3196bfb2fac9", size = 38870286 }, + { url = "https://files.pythonhosted.org/packages/0c/43/c304d69a56c91ad5f188c0714f6a97b9c1fed93128c691148621274a3a68/scipy-1.15.2-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:c90ebe8aaa4397eaefa8455a8182b164a6cc1d59ad53f79943f266d99f68687f", size = 30141634 }, + { url = "https://files.pythonhosted.org/packages/44/1a/6c21b45d2548eb73be9b9bff421aaaa7e85e22c1f9b3bc44b23485dfce0a/scipy-1.15.2-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:def751dd08243934c884a3221156d63e15234a3155cf25978b0a668409d45eb6", size = 22415179 }, + { url = "https://files.pythonhosted.org/packages/74/4b/aefac4bba80ef815b64f55da06f62f92be5d03b467f2ce3668071799429a/scipy-1.15.2-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:302093e7dfb120e55515936cb55618ee0b895f8bcaf18ff81eca086c17bd80af", size = 25126412 }, + { url = "https://files.pythonhosted.org/packages/b1/53/1cbb148e6e8f1660aacd9f0a9dfa2b05e9ff1cb54b4386fe868477972ac2/scipy-1.15.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7cd5b77413e1855351cdde594eca99c1f4a588c2d63711388b6a1f1c01f62274", size = 34952867 }, + { url = "https://files.pythonhosted.org/packages/2c/23/e0eb7f31a9c13cf2dca083828b97992dd22f8184c6ce4fec5deec0c81fcf/scipy-1.15.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d0194c37037707b2afa7a2f2a924cf7bac3dc292d51b6a925e5fcb89bc5c776", size = 36890009 }, + { url = "https://files.pythonhosted.org/packages/03/f3/e699e19cabe96bbac5189c04aaa970718f0105cff03d458dc5e2b6bd1e8c/scipy-1.15.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:bae43364d600fdc3ac327db99659dcb79e6e7ecd279a75fe1266669d9a652828", size = 36545159 }, + { url = "https://files.pythonhosted.org/packages/af/f5/ab3838e56fe5cc22383d6fcf2336e48c8fe33e944b9037fbf6cbdf5a11f8/scipy-1.15.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f031846580d9acccd0044efd1a90e6f4df3a6e12b4b6bd694a7bc03a89892b28", size = 39136566 }, + { url = "https://files.pythonhosted.org/packages/0a/c8/b3f566db71461cabd4b2d5b39bcc24a7e1c119535c8361f81426be39bb47/scipy-1.15.2-cp313-cp313t-win_amd64.whl", hash = "sha256:fe8a9eb875d430d81755472c5ba75e84acc980e4a8f6204d402849234d3017db", size = 40477705 }, +] + [[package]] name = "setuptools" version = "75.3.0"
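For reference (not part of the patch): the normalize_rating parametrization in test_task.py above implies a simple linear mapping of each rating type onto [0, 1]. The sketch below is consistent with those parametrized cases; it is an assumed shape for illustration, not the actual kiln_ai.datamodel.task_output implementation.

from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType


def normalize_rating(rating: float, rating_type: TaskOutputRatingType) -> float:
    # Map each rating scale linearly onto [0, 1]; reject out-of-range values
    # and custom ratings, which have no fixed scale.
    if rating_type == TaskOutputRatingType.five_star:
        if not 1 <= rating <= 5:
            raise ValueError("five_star ratings must be between 1 and 5")
        return (rating - 1) / 4
    if rating_type == TaskOutputRatingType.pass_fail:
        if not 0 <= rating <= 1:
            raise ValueError("pass_fail ratings must be between 0 and 1")
        return rating
    if rating_type == TaskOutputRatingType.pass_fail_critical:
        if not -1 <= rating <= 1:
            raise ValueError("pass_fail_critical ratings must be between -1 and 1")
        return (rating + 1) / 2
    raise ValueError(f"Unsupported rating type: {rating_type}")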
[Web UI diff (Svelte prompt list component): the surrounding HTML markup did not survive extraction, so only the changed expressions are recoverable. The table header cells now read "Name & Description", "Type", and "Prompt Preview" in place of a plain "Name" column; each row renders {prompt.name} with {prompt.description} beneath it; a new Type cell labels the prompt "Custom" when prompt.id starts with "id::", "Fine Tuning Prompt" for "fine_tune_prompt::", "Eval Prompt" for "task_run_config::", and "Unknown" otherwise; and the preview truncation changes from prompt.prompt.slice(0, 100) + "..." to prompt.prompt.slice(0, 200) + "..." while the length > 100 condition is unchanged.]
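The prompt-type labels in the web UI mirror the PromptId prefixes validated in test_prompt_id.py earlier in this patch. For reference (not part of the patch), a minimal sketch of is_frozen_prompt consistent with the parametrized cases there; the real kiln_ai.datamodel.prompt_id code may differ.

def is_frozen_prompt(prompt_id: str) -> bool:
    # Saved, fine-tune, and task-run-config prompts are frozen snapshots;
    # generator names (e.g. "simple_prompt_builder") are rebuilt on each run.
    return prompt_id.startswith(("id::", "fine_tune_prompt::", "task_run_config::"))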