Merge pull request #228 from Kiln-AI/eval_config_eval

Eval config eval
Kiln-AI · Feb 26, 2025 · 23cad80 · 23cad80
2 parents f0d4144 + ee30223
commit 23cad80
Show file tree

Hide file tree

Showing 25 changed files with 2,449 additions and 378 deletions.
diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py
diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py
diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts
@@ -742,6 +742,23 @@ export interface paths {
         patch?: never;
         trace?: never;
     };
+    "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        /** Get Eval Config */
+        get: operations["get_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__get"];
+        put?: never;
+        post?: never;
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
     "/api/projects/{project_id}/tasks/{task_id}/task_run_config": {
         parameters: {
             query?: never;
@@ -793,6 +810,40 @@ export interface paths {
         patch?: never;
         trace?: never;
     };
+    "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        get?: never;
+        put?: never;
+        /** Set Default Eval Config */
+        post: operations["set_default_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__set_current_eval_config__eval_config_id__post"];
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
+    "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        /** Run Eval Config Eval */
+        get: operations["run_eval_config_eval_api_projects__project_id__tasks__task_id__eval__eval_id__run_eval_config_eval_get"];
+        put?: never;
+        post?: never;
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
     "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_config/{run_config_id}/results": {
         parameters: {
             query?: never;
@@ -827,6 +878,23 @@ export interface paths {
         patch?: never;
         trace?: never;
     };
+    "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_configs_score_summary": {
+        parameters: {
+            query?: never;
+            header?: never;
+            path?: never;
+            cookie?: never;
+        };
+        /** Get Eval Configs Score Summary */
+        get: operations["get_eval_configs_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_configs_score_summary_get"];
+        put?: never;
+        post?: never;
+        delete?: never;
+        options?: never;
+        head?: never;
+        patch?: never;
+        trace?: never;
+    };
 }
 export type webhooks = Record<string, never>;
 export interface components {
@@ -1313,6 +1381,38 @@ export interface components {
             /** Model Type */
             readonly model_type: string;
         };
+        /** EvalConfigCompareSummary */
+        EvalConfigCompareSummary: {
+            /** Results */
+            results: {
+                [key: string]: {
+                    [key: string]: components["schemas"]["EvalConfigScoreSummary"];
+                };
+            };
+            /** Eval Config Percent Complete */
+            eval_config_percent_complete: {
+                [key: string]: number;
+            };
+            /** Dataset Size */
+            dataset_size: number;
+            /** Fully Rated Count */
+            fully_rated_count: number;
+            /** Partially Rated Count */
+            partially_rated_count: number;
+            /** Not Rated Count */
+            not_rated_count: number;
+        };
+        /** EvalConfigScoreSummary */
+        EvalConfigScoreSummary: {
+            /** Mean Absolute Error */
+            mean_absolute_error: number;
+            /** Mean Normalized Absolute Error */
+            mean_normalized_absolute_error: number;
+            /** Mean Squared Error */
+            mean_squared_error: number;
+            /** Mean Normalized Squared Error */
+            mean_normalized_squared_error: number;
+        };
         /**
          * EvalConfigType
          * @enum {string}
@@ -1381,9 +1481,15 @@ export interface components {
             dataset_id: string | null;
             /**
              * Task Run Config Id
-             * @description The ID of the TaskRunConfig that was run. Must belong to the same Task as this eval.
+             * @description The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config.
              */
             task_run_config_id: string | null;
+            /**
+             * Eval Config Eval
+             * @description Whether this eval run evaluates the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.
+             * @default false
+             */
+            eval_config_eval: boolean;
             /**
              * Input
              * @description The input to the task. JSON formatted for structured input, plaintext for unstructured input.
@@ -4031,6 +4137,40 @@ export interface operations {
             };
         };
     };
+    get_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__get: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path: {
+                project_id: string;
+                task_id: string;
+                eval_id: string;
+                eval_config_id: string;
+            };
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["EvalConfig"];
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
     create_task_run_config_api_projects__project_id__tasks__task_id__task_run_config_post: {
         parameters: {
             query?: never;
@@ -4141,6 +4281,73 @@ export interface operations {
             };
         };
     };
+    set_default_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__set_current_eval_config__eval_config_id__post: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path: {
+                project_id: string;
+                task_id: string;
+                eval_id: string;
+                eval_config_id: string;
+            };
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["Eval"];
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
+    run_eval_config_eval_api_projects__project_id__tasks__task_id__eval__eval_id__run_eval_config_eval_get: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path: {
+                project_id: string;
+                task_id: string;
+                eval_id: string;
+            };
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": unknown;
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
     get_eval_run_results_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_config__run_config_id__results_get: {
         parameters: {
             query?: never;
@@ -4210,4 +4417,37 @@ export interface operations {
             };
         };
     };
+    get_eval_configs_score_summary_api_projects__project_id__tasks__task_id__eval__eval_id__eval_configs_score_summary_get: {
+        parameters: {
+            query?: never;
+            header?: never;
+            path: {
+                project_id: string;
+                task_id: string;
+                eval_id: string;
+            };
+            cookie?: never;
+        };
+        requestBody?: never;
+        responses: {
+            /** @description Successful Response */
+            200: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["EvalConfigCompareSummary"];
+                };
+            };
+            /** @description Validation Error */
+            422: {
+                headers: {
+                    [name: string]: unknown;
+                };
+                content: {
+                    "application/json": components["schemas"]["HTTPValidationError"];
+                };
+            };
+        };
+    };
 }
diff --git a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts
@@ -6,6 +6,7 @@ export type Task = components["schemas"]["Task"]
 export type TaskRun = components["schemas"]["TaskRun-Input"]
 export type TaskRequirement = components["schemas"]["TaskRequirement"]
 export type TaskOutputRating = components["schemas"]["TaskOutputRating-Output"]
+export type TaskOutputRatingType = components["schemas"]["TaskOutputRatingType"]
 export type RequirementRating = components["schemas"]["RequirementRating"]
 export type RatingType = components["schemas"]["TaskOutputRatingType"]
 export type AvailableModels = components["schemas"]["AvailableModels"]
@@ -27,3 +28,5 @@ export type EvalConfig = components["schemas"]["EvalConfig"]
 export type TaskRunConfig = components["schemas"]["TaskRunConfig"]
 export type EvalResultSummary = components["schemas"]["EvalResultSummary"]
 export type EvalRunResult = components["schemas"]["EvalRunResult"]
+export type EvalConfigCompareSummary =
+  components["schemas"]["EvalConfigCompareSummary"]
diff --git a/app/web_ui/src/lib/utils/formatters.ts b/app/web_ui/src/lib/utils/formatters.ts
@@ -1,3 +1,5 @@
+import { type EvalConfigType } from "$lib/types"
+
 export function formatDate(dateString: string | undefined): string {
   if (!dateString) {
     return "Unknown"
@@ -40,3 +42,14 @@ export function formatDate(dateString: string | undefined): string {
     .replace(" PM", "pm")
     .replace(",", "")
 }
+/**
+ * Converts an eval config type into the label shown in the UI.
+ *
+ * Known types map to fixed labels ("g_eval" -> "G-Eval",
+ * "llm_as_judge" -> "LLM as Judge"). Any type missing from the lookup
+ * falls through the `||` and is returned unchanged.
+ *
+ * @param eval_config_type - The eval config type to label.
+ * @returns The display label, or the raw type value when no label is defined.
+ */
+export function eval_config_to_ui_name(
+  eval_config_type: EvalConfigType,
+): string {
+  return (
+    {
+      g_eval: "G-Eval",
+      llm_as_judge: "LLM as Judge",
+    }[eval_config_type] || eval_config_type
+  )
+}
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/+page.svelte
@@ -52,14 +52,14 @@
 
 <AppPage
   title="Evals"
-  subtitle="Evaluate models, prompts, and more."
+  subtitle="Evaluate task performance of various models, prompts, fine-tunes, and more."
   sub_subtitle={is_empty ? undefined : "Read the Docs"}
   sub_subtitle_link="https://docs.getkiln.ai/docs/evaluationsTODO"
   action_buttons={is_empty
     ? []
     : [
         {
-          label: "Create Evaluator",
+          label: "New Evaluator",
           href: `/evals/${project_id}/${task_id}/create_evaluator`,
           primary: true,
         },