- Allow setting the current eval config for an eval through the UI (see the usage sketch below)
- Improve strings/messaging
- Allow creating eval configs from /eval_configs with correct redirect
- Fix a bug where eval runs without task_run_configs were causing lookup errors.
scosman committed Feb 26, 2025
1 parent 1133e1a commit 50811b1
Showing 5 changed files with 183 additions and 12 deletions.
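
For illustration, the new endpoint added in eval_api.py (see the diff below) can also be called directly over HTTP. A minimal sketch against a locally running Kiln studio server follows; the base URL and port, the example IDs, and the use of httpx are illustrative assumptions, not part of this commit:

import httpx

# Hypothetical server address and IDs, for illustration only.
BASE_URL = "http://localhost:8757"
url = (
    f"{BASE_URL}/api/projects/project1/tasks/task1"
    "/eval/eval1/set_current_eval_config/eval_config1"
)

# The endpoint takes no request body; it persists current_config_id on the
# eval and returns the updated Eval as JSON.
response = httpx.post(url)
response.raise_for_status()
print(response.json()["current_config_id"])  # expected: "eval_config1"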
18 changes: 18 additions & 0 deletions app/desktop/studio_server/eval_api.py
@@ -389,6 +389,21 @@ async def run_eval_config(

    return await run_eval_runner_with_status(eval_runner)

@app.post(
    "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}"
)
async def set_default_eval_config(
    project_id: str,
    task_id: str,
    eval_id: str,
    eval_config_id: str,
) -> Eval:
    eval = eval_from_id(project_id, task_id, eval_id)
    eval.current_config_id = eval_config_id
    eval.save_to_file()

    return eval

@app.get(
    "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval"
)
@@ -470,6 +485,9 @@ async def get_eval_config_score_summary(

    # important: readonly makes this much faster
    for eval_run in eval_config.runs(readonly=True):
        if eval_run.task_run_config_id is None:
            # This eval_run is not associated with a run_config, so we can't count it
            continue
        run_config_id = str(eval_run.task_run_config_id)

        # Check if we should count this eval_run. Not every eval_run has to go into the stats:
34 changes: 34 additions & 0 deletions app/desktop/studio_server/test_eval_api.py
@@ -1000,3 +1000,37 @@ async def test_run_eval_config_eval(
    assert eval_runner.eval_configs[0].id == mock_eval_config.id
    assert eval_runner.run_configs is None
    assert eval_runner.eval_run_type == "eval_config_eval"


@pytest.mark.asyncio
async def test_set_current_eval_config(
    client, mock_task_from_id, mock_task, mock_eval, mock_eval_config
):
    """Test setting the current eval config for an evaluation."""
    mock_task_from_id.return_value = mock_task

    # Get the eval before updating to verify the change
    response = client.get("/api/projects/project1/tasks/task1/eval/eval1")
    assert response.status_code == 200
    eval_before = response.json()

    # The current_config_id might be None or different initially
    initial_config_id = eval_before.get("current_config_id")
    assert initial_config_id is None

    # Set the current eval config
    with patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id:
        mock_eval_from_id.return_value = mock_eval
        response = client.post(
            "/api/projects/project1/tasks/task1/eval/eval1/set_current_eval_config/eval_config1"
        )
        assert response.status_code == 200
        updated_eval = response.json()

        # Verify the current_config_id was updated
        assert updated_eval["current_config_id"] == "eval_config1"
        assert updated_eval["id"] == "eval1"

        # Verify the change persists by fetching the eval again
        eval_from_disk = mock_task.evals()[0]
        assert eval_from_disk.current_config_id == "eval_config1"
51 changes: 51 additions & 0 deletions app/web_ui/src/lib/api_schema.d.ts
@@ -810,6 +810,23 @@ export interface paths {
        patch?: never;
        trace?: never;
    };
    "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}": {
        parameters: {
            query?: never;
            header?: never;
            path?: never;
            cookie?: never;
        };
        get?: never;
        put?: never;
        /** Set Default Eval Config */
        post: operations["set_default_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__set_current_eval_config__eval_config_id__post"];
        delete?: never;
        options?: never;
        head?: never;
        patch?: never;
        trace?: never;
    };
    "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval": {
        parameters: {
            query?: never;
@@ -4260,6 +4277,40 @@ export interface operations {
            };
        };
    };
    set_default_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__set_current_eval_config__eval_config_id__post: {
        parameters: {
            query?: never;
            header?: never;
            path: {
                project_id: string;
                task_id: string;
                eval_id: string;
                eval_config_id: string;
            };
            cookie?: never;
        };
        requestBody?: never;
        responses: {
            /** @description Successful Response */
            200: {
                headers: {
                    [name: string]: unknown;
                };
                content: {
                    "application/json": components["schemas"]["Eval"];
                };
            };
            /** @description Validation Error */
            422: {
                headers: {
                    [name: string]: unknown;
                };
                content: {
                    "application/json": components["schemas"]["HTTPValidationError"];
                };
            };
        };
    };
    run_eval_config_eval_api_projects__project_id__tasks__task_id__eval__eval_id__run_eval_config_eval_get: {
        parameters: {
            query?: never;
@@ -204,9 +204,16 @@
        throw error
      }
      complete = true
      goto(
        `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}?selected_eval_config=${data.id}`,
      )
      const next_page = $page.url.searchParams.get("next_page")
      if (next_page === "eval_configs") {
        goto(
          `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}/eval_configs`,
        )
      } else {
        goto(
          `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}?selected_eval_config=${data.id}`,
        )
      }
    } catch (e) {
      create_evaluator_error = createKilnError(e)
    } finally {
@@ -51,9 +51,10 @@
      load_model_info(),
      load_available_prompts(),
      load_available_models(),
      // Get this first, as we want to know "current" for sorting
      get_eval(),
    ])
    // These can be parallel
    get_eval()
    get_eval_config()
    get_score_summary()
  })
@@ -102,7 +103,12 @@
      if (error) {
        throw error
      }
      eval_configs = data
      // sort with current on top
      eval_configs = data.sort((a, b) => {
        if (evaluator && a.id === evaluator.current_config_id) return -1
        if (evaluator && b.id === evaluator.current_config_id) return 1
        return 0
      })
    } catch (error) {
      eval_configs_error = createKilnError(error)
    } finally {
@@ -180,17 +186,17 @@
    const warnings: string[] = []
    if (score_summary.dataset_size === 0) {
      warnings.push(
        "No items in your eval-config dataset. Generate some runs in your dataset tab, and tag them to add them to your eval-config dataset.",
        "There are zero items in your config eval dataset. Generate some runs in your dataset tab, and tag them to add them to your eval-config dataset.",
      )
    }
    if (score_summary.not_rated_count > 0) {
      warnings.push(
        `${score_summary.not_rated_count} item(s) in your eval-config dataset are not rated at all. Add human ratings to these items in the dataset tab.`,
        `${score_summary.not_rated_count} item(s) in your config eval dataset are not rated at all. Add human ratings to these items in the dataset tab.`,
      )
    }
    if (score_summary.partially_rated_count > 0) {
      warnings.push(
        `${score_summary.partially_rated_count} item(s) in your eval-config dataset are only partially rated. Add human ratings to these items in the dataset tab for each score.`,
        `${score_summary.partially_rated_count} item(s) in your config eval dataset are only partially rated. Add human ratings to these items for every score.`,
      )
    }
@@ -209,11 +215,47 @@
    return warnings
  }
  async function set_current_eval_config(
    eval_config_id: string | null | undefined,
  ) {
    if (!eval_config_id) {
      return
    }
    try {
      const { data, error } = await client.POST(
        "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}",
        {
          params: {
            path: {
              project_id: $page.params.project_id,
              task_id: $page.params.task_id,
              eval_id: $page.params.eval_id,
              eval_config_id: eval_config_id,
            },
          },
        },
      )
      if (error) {
        throw error
      }
      // Update the evaluator with the latest
      evaluator = data
    } catch (error) {
      eval_error = createKilnError(error)
    }
  }
</script>

<AppPage
  title="Compare Eval Configs"
  subtitle="Find the evaluator that best matches human-ratings"
  action_buttons={[
    {
      label: "Add Eval Config",
      href: `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}/create_eval_config?next_page=eval_configs`,
    },
  ]}
>
  {#if loading}
    <div class="w-full min-h-[50vh] flex justify-center items-center">
@@ -242,16 +284,22 @@
</div>
{/each}
</div>
{#if score_summary && score_summary.dataset_size > 0 && score_summary.dataset_size < 25}
<Warning
warning_message={`There are only ${score_summary.dataset_size} items in your eval-config dataset. This is generally too small to get a good sense of how well your eval-configs perform.`}
warning_color="warning"
tight={true}
/>
{/if}
</div>
</div>
<div class="mt-16">
{#if eval_configs?.length}
<div class="flex flex-col lg:flex-row gap-4 lg:gap-8 mb-6">
<div class="grow">
<div class="text-xl font-bold">Correlation to Human Scores</div>
<div class="text-xl font-bold">Correlation to Human Ratings</div>
<div class="text-xs text-gray-500">
How each eval config correlates to human scores (ratings from the
dataset tab).
How each eval config correlates to human ratings.
</div>
{#if score_summary_error}
<div class="text-error text-sm">
@@ -279,13 +327,14 @@
</div>

<!-- Warn the user if some evals are incomplete -->

{#if incomplete_warning(score_summary).length}
<div class="mt-6 mb-4">
<Warning
warning_message={`There are issues you should resolve before analyzing this data.`}
tight={true}
/>
<ul class="list-disc list-inside text-error">
<ul class="list-disc list-inside text-error pl-2 pt-2">
{#each incomplete_warning(score_summary) as warning}
<li>{warning}</li>
{/each}
@@ -370,6 +419,18 @@
<!-- We have results, but not for this run config -->
<div class="text-sm text-error">0% complete</div>
{/if}
{#if eval_config.id == evaluator.current_config_id}
<div class="badge badge-primary mt-2">Default</div>
{:else}
<button
class="link text-sm text-gray-500"
on:click={() => {
set_current_eval_config(eval_config.id)
}}
>
Set as default
</button>
{/if}
</td>
<td>
<div class="max-w-[600px] min-w-[200px]">
