Skip to content

Commit

Permalink
Merge pull request #228 from Kiln-AI/eval_config_eval
Browse files Browse the repository at this point in the history
Eval config eval
  • Loading branch information
scosman authored Feb 26, 2025
2 parents f0d4144 + ee30223 commit 23cad80
Show file tree
Hide file tree
Showing 25 changed files with 2,449 additions and 378 deletions.
331 changes: 311 additions & 20 deletions app/desktop/studio_server/eval_api.py

Large diffs are not rendered by default.

392 changes: 379 additions & 13 deletions app/desktop/studio_server/test_eval_api.py

Large diffs are not rendered by default.

242 changes: 241 additions & 1 deletion app/web_ui/src/lib/api_schema.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -742,6 +742,23 @@ export interface paths {
patch?: never;
trace?: never;
};
"/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}": {
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
/** Get Eval Config */
get: operations["get_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__get"];
put?: never;
post?: never;
delete?: never;
options?: never;
head?: never;
patch?: never;
trace?: never;
};
"/api/projects/{project_id}/tasks/{task_id}/task_run_config": {
parameters: {
query?: never;
Expand Down Expand Up @@ -793,6 +810,40 @@ export interface paths {
patch?: never;
trace?: never;
};
"/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}": {
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
get?: never;
put?: never;
/** Set Default Eval Config */
post: operations["set_default_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__set_current_eval_config__eval_config_id__post"];
delete?: never;
options?: never;
head?: never;
patch?: never;
trace?: never;
};
"/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval": {
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
/** Run Eval Config Eval */
get: operations["run_eval_config_eval_api_projects__project_id__tasks__task_id__eval__eval_id__run_eval_config_eval_get"];
put?: never;
post?: never;
delete?: never;
options?: never;
head?: never;
patch?: never;
trace?: never;
};
"/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_config/{run_config_id}/results": {
parameters: {
query?: never;
Expand Down Expand Up @@ -827,6 +878,23 @@ export interface paths {
patch?: never;
trace?: never;
};
"/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_configs_score_summary": {
parameters: {
query?: never;
header?: never;
path?: never;
cookie?: never;
};
/** Get Eval Configs Score Summary */
get: operations["get_eval_configs_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_configs_score_summary_get"];
put?: never;
post?: never;
delete?: never;
options?: never;
head?: never;
patch?: never;
trace?: never;
};
}
export type webhooks = Record<string, never>;
export interface components {
Expand Down Expand Up @@ -1313,6 +1381,38 @@ export interface components {
/** Model Type */
readonly model_type: string;
};
/** EvalConfigCompareSummary */
EvalConfigCompareSummary: {
/** Results */
results: {
[key: string]: {
[key: string]: components["schemas"]["EvalConfigScoreSummary"];
};
};
/** Eval Config Percent Complete */
eval_config_percent_complete: {
[key: string]: number;
};
/** Dataset Size */
dataset_size: number;
/** Fully Rated Count */
fully_rated_count: number;
/** Partially Rated Count */
partially_rated_count: number;
/** Not Rated Count */
not_rated_count: number;
};
/** EvalConfigScoreSummary */
EvalConfigScoreSummary: {
/** Mean Absolute Error */
mean_absolute_error: number;
/** Mean Normalized Absolute Error */
mean_normalized_absolute_error: number;
/** Mean Squared Error */
mean_squared_error: number;
/** Mean Normalized Squared Error */
mean_normalized_squared_error: number;
};
/**
* EvalConfigType
* @enum {string}
Expand Down Expand Up @@ -1381,9 +1481,15 @@ export interface components {
dataset_id: string | null;
/**
* Task Run Config Id
* @description The ID of the TaskRunConfig that was run. Must belong to the same Task as this eval.
* @description The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config.
*/
task_run_config_id: string | null;
/**
* Eval Config Eval
* @description Whether this eval run evaluates the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.
* @default false
*/
eval_config_eval: boolean;
/**
* Input
* @description The input to the task. JSON formatted for structured input, plaintext for unstructured input.
Expand Down Expand Up @@ -4031,6 +4137,40 @@ export interface operations {
};
};
};
get_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__get: {
parameters: {
query?: never;
header?: never;
path: {
project_id: string;
task_id: string;
eval_id: string;
eval_config_id: string;
};
cookie?: never;
};
requestBody?: never;
responses: {
/** @description Successful Response */
200: {
headers: {
[name: string]: unknown;
};
content: {
"application/json": components["schemas"]["EvalConfig"];
};
};
/** @description Validation Error */
422: {
headers: {
[name: string]: unknown;
};
content: {
"application/json": components["schemas"]["HTTPValidationError"];
};
};
};
};
create_task_run_config_api_projects__project_id__tasks__task_id__task_run_config_post: {
parameters: {
query?: never;
Expand Down Expand Up @@ -4141,6 +4281,73 @@ export interface operations {
};
};
};
set_default_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__set_current_eval_config__eval_config_id__post: {
parameters: {
query?: never;
header?: never;
path: {
project_id: string;
task_id: string;
eval_id: string;
eval_config_id: string;
};
cookie?: never;
};
requestBody?: never;
responses: {
/** @description Successful Response */
200: {
headers: {
[name: string]: unknown;
};
content: {
"application/json": components["schemas"]["Eval"];
};
};
/** @description Validation Error */
422: {
headers: {
[name: string]: unknown;
};
content: {
"application/json": components["schemas"]["HTTPValidationError"];
};
};
};
};
run_eval_config_eval_api_projects__project_id__tasks__task_id__eval__eval_id__run_eval_config_eval_get: {
parameters: {
query?: never;
header?: never;
path: {
project_id: string;
task_id: string;
eval_id: string;
};
cookie?: never;
};
requestBody?: never;
responses: {
/** @description Successful Response */
200: {
headers: {
[name: string]: unknown;
};
content: {
"application/json": unknown;
};
};
/** @description Validation Error */
422: {
headers: {
[name: string]: unknown;
};
content: {
"application/json": components["schemas"]["HTTPValidationError"];
};
};
};
};
get_eval_run_results_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_config__run_config_id__results_get: {
parameters: {
query?: never;
Expand Down Expand Up @@ -4210,4 +4417,37 @@ export interface operations {
};
};
};
get_eval_configs_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_configs_score_summary_get: {
parameters: {
query?: never;
header?: never;
path: {
project_id: string;
task_id: string;
eval_id: string;
};
cookie?: never;
};
requestBody?: never;
responses: {
/** @description Successful Response */
200: {
headers: {
[name: string]: unknown;
};
content: {
"application/json": components["schemas"]["EvalConfigCompareSummary"];
};
};
/** @description Validation Error */
422: {
headers: {
[name: string]: unknown;
};
content: {
"application/json": components["schemas"]["HTTPValidationError"];
};
};
};
};
}
3 changes: 3 additions & 0 deletions app/web_ui/src/lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export type Task = components["schemas"]["Task"]
export type TaskRun = components["schemas"]["TaskRun-Input"]
export type TaskRequirement = components["schemas"]["TaskRequirement"]
export type TaskOutputRating = components["schemas"]["TaskOutputRating-Output"]
export type TaskOutputRatingType = components["schemas"]["TaskOutputRatingType"]
export type RequirementRating = components["schemas"]["RequirementRating"]
export type RatingType = components["schemas"]["TaskOutputRatingType"]
export type AvailableModels = components["schemas"]["AvailableModels"]
Expand All @@ -27,3 +28,5 @@ export type EvalConfig = components["schemas"]["EvalConfig"]
export type TaskRunConfig = components["schemas"]["TaskRunConfig"]
export type EvalResultSummary = components["schemas"]["EvalResultSummary"]
export type EvalRunResult = components["schemas"]["EvalRunResult"]
export type EvalConfigCompareSummary =
components["schemas"]["EvalConfigCompareSummary"]
13 changes: 13 additions & 0 deletions app/web_ui/src/lib/utils/formatters.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import { type EvalConfigType } from "$lib/types"

export function formatDate(dateString: string | undefined): string {
if (!dateString) {
return "Unknown"
Expand Down Expand Up @@ -40,3 +42,14 @@ export function formatDate(dateString: string | undefined): string {
.replace(" PM", "pm")
.replace(",", "")
}

/**
 * Convert an eval config type identifier into the label shown in the UI.
 *
 * @param eval_config_type - The eval config type identifier (e.g. "g_eval").
 * @returns The display label for a recognised identifier; any other value is
 *   returned unchanged so that an unrecognised type still renders as text.
 */
export function eval_config_to_ui_name(
  eval_config_type: EvalConfigType,
): string {
  // A Map only ever yields its own entries. Indexing a plain object literal
  // would also resolve inherited members (e.g. "toString", "constructor")
  // and hand back a function instead of a string for such inputs.
  const ui_names = new Map<string, string>([
    ["g_eval", "G-Eval"],
    ["llm_as_judge", "LLM as Judge"],
  ])
  return ui_names.get(eval_config_type) ?? eval_config_type
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,14 @@

<AppPage
title="Evals"
subtitle="Evaluate models, prompts, and more."
subtitle="Evaluate task performance of various models, prompts, fine-tunes, and more."
sub_subtitle={is_empty ? undefined : "Read the Docs"}
sub_subtitle_link="https://docs.getkiln.ai/docs/evaluationsTODO"
action_buttons={is_empty
? []
: [
{
label: "Create Evaluator",
label: "New Evaluator",
href: `/evals/${project_id}/${task_id}/create_evaluator`,
primary: true,
},
Expand Down
Loading

0 comments on commit 23cad80

Please sign in to comment.