- Allow setting the current eval config for an eval through the UI (see the usage sketch below)
- Improve strings/messaging
- Allow creating eval configs from /eval_configs with correct redirect
- Fix a bug where eval runs without task_run_configs were causing lookup errors.
scosman committed Feb 26, 2025
1 parent 1133e1a commit 50811b1
Showing 5 changed files with 183 additions and 12 deletions.
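
For illustration, the new endpoint added in eval_api.py (see the diff below) can also be called directly over HTTP. A minimal sketch against a locally running Kiln studio server follows; the base URL and port, the example IDs, and the use of httpx are illustrative assumptions, not part of this commit:

import httpx

# Hypothetical server address and IDs, for illustration only.
BASE_URL = "http://localhost:8757"
url = (
    f"{BASE_URL}/api/projects/project1/tasks/task1"
    "/eval/eval1/set_current_eval_config/eval_config1"
)

# The endpoint takes no request body; it persists current_config_id on the
# eval and returns the updated Eval as JSON.
response = httpx.post(url)
response.raise_for_status()
print(response.json()["current_config_id"])  # expected: "eval_config1"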
18 changes: 18 additions & 0 deletions app/desktop/studio_server/eval_api.py
@@ -389,6 +389,21 @@ async def run_eval_config(

    return await run_eval_runner_with_status(eval_runner)

@app.post(
    "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}"
)
async def set_default_eval_config(
    project_id: str,
    task_id: str,
    eval_id: str,
    eval_config_id: str,
) -> Eval:
    eval = eval_from_id(project_id, task_id, eval_id)
    eval.current_config_id = eval_config_id
    eval.save_to_file()

    return eval

@app.get(
    "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval"
)
@@ -470,6 +485,9 @@ async def get_eval_config_score_summary(

    # important: readonly makes this much faster
    for eval_run in eval_config.runs(readonly=True):
        if eval_run.task_run_config_id is None:
            # This eval_run is not associated with a run_config, so we can't count it
            continue
        run_config_id = str(eval_run.task_run_config_id)

        # Check if we should count this eval_run. Not every eval_run has to go into the stats:
34 changes: 34 additions & 0 deletions app/desktop/studio_server/test_eval_api.py
@@ -1000,3 +1000,37 @@ async def test_run_eval_config_eval(
    assert eval_runner.eval_configs[0].id == mock_eval_config.id
    assert eval_runner.run_configs is None
    assert eval_runner.eval_run_type == "eval_config_eval"


@pytest.mark.asyncio
async def test_set_current_eval_config(
    client, mock_task_from_id, mock_task, mock_eval, mock_eval_config
):
    """Test setting the current eval config for an evaluation."""
    mock_task_from_id.return_value = mock_task

    # Get the eval before updating to verify the change
    response = client.get("/api/projects/project1/tasks/task1/eval/eval1")
    assert response.status_code == 200
    eval_before = response.json()

    # The current_config_id might be None or different initially
    initial_config_id = eval_before.get("current_config_id")
    assert initial_config_id is None

    # Set the current eval config
    with patch("app.desktop.studio_server.eval_api.eval_from_id") as mock_eval_from_id:
        mock_eval_from_id.return_value = mock_eval
        response = client.post(
            "/api/projects/project1/tasks/task1/eval/eval1/set_current_eval_config/eval_config1"
        )
        assert response.status_code == 200
        updated_eval = response.json()

        # Verify the current_config_id was updated
        assert updated_eval["current_config_id"] == "eval_config1"
        assert updated_eval["id"] == "eval1"

        # Verify the change persists by fetching the eval again
        eval_from_disk = mock_task.evals()[0]
        assert eval_from_disk.current_config_id == "eval_config1"
51 changes: 51 additions & 0 deletions app/web_ui/src/lib/api_schema.d.ts
@@ -810,6 +810,23 @@ export interface paths {
        patch?: never;
        trace?: never;
    };
    "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}": {
        parameters: {
            query?: never;
            header?: never;
            path?: never;
            cookie?: never;
        };
        get?: never;
        put?: never;
        /** Set Default Eval Config */
        post: operations["set_default_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__set_current_eval_config__eval_config_id__post"];
        delete?: never;
        options?: never;
        head?: never;
        patch?: never;
        trace?: never;
    };
    "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval": {
        parameters: {
            query?: never;
@@ -4260,6 +4277,40 @@ export interface operations {
            };
        };
    };
    set_default_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__set_current_eval_config__eval_config_id__post: {
        parameters: {
            query?: never;
            header?: never;
            path: {
                project_id: string;
                task_id: string;
                eval_id: string;
                eval_config_id: string;
            };
            cookie?: never;
        };
        requestBody?: never;
        responses: {
            /** @description Successful Response */
            200: {
                headers: {
                    [name: string]: unknown;
                };
                content: {
                    "application/json": components["schemas"]["Eval"];
                };
            };
            /** @description Validation Error */
            422: {
                headers: {
                    [name: string]: unknown;
                };
                content: {
                    "application/json": components["schemas"]["HTTPValidationError"];
                };
            };
        };
    };
    run_eval_config_eval_api_projects__project_id__tasks__task_id__eval__eval_id__run_eval_config_eval_get: {
        parameters: {
            query?: never;
@@ -204,9 +204,16 @@
        throw error
      }
      complete = true
      goto(
        `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}?selected_eval_config=${data.id}`,
      )
      const next_page = $page.url.searchParams.get("next_page")
      if (next_page === "eval_configs") {
        goto(
          `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}/eval_configs`,
        )
      } else {
        goto(
          `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}?selected_eval_config=${data.id}`,
        )
      }
    } catch (e) {
      create_evaluator_error = createKilnError(e)
    } finally {
@@ -51,9 +51,10 @@
      load_model_info(),
      load_available_prompts(),
      load_available_models(),
      // Get this first, as we want to know "current" for sorting
      get_eval(),
    ])
    // These can be parallel
    get_eval()
    get_eval_config()
    get_score_summary()
  })
@@ -102,7 +103,12 @@
      if (error) {
        throw error
      }
      eval_configs = data
      // sort with current on top
      eval_configs = data.sort((a, b) => {
        if (evaluator && a.id === evaluator.current_config_id) return -1
        if (evaluator && b.id === evaluator.current_config_id) return 1
        return 0
      })
    } catch (error) {
      eval_configs_error = createKilnError(error)
    } finally {
@@ -180,17 +186,17 @@
    const warnings: string[] = []
    if (score_summary.dataset_size === 0) {
      warnings.push(
        "No items in your eval-config dataset. Generate some runs in your dataset tab, and tag them to add them to your eval-config dataset.",
        "There are zero items in your config eval dataset. Generate some runs in your dataset tab, and tag them to add them to your eval-config dataset.",
      )
    }
    if (score_summary.not_rated_count > 0) {
      warnings.push(
        `${score_summary.not_rated_count} item(s) in your eval-config dataset are not rated at all. Add human ratings to these items in the dataset tab.`,
        `${score_summary.not_rated_count} item(s) in your config eval dataset are not rated at all. Add human ratings to these items in the dataset tab.`,
      )
    }
    if (score_summary.partially_rated_count > 0) {
      warnings.push(
        `${score_summary.partially_rated_count} item(s) in your eval-config dataset are only partially rated. Add human ratings to these items in the dataset tab for each score.`,
        `${score_summary.partially_rated_count} item(s) in your config eval dataset are only partially rated. Add human ratings to these items for every score.`,
      )
    }
@@ -209,11 +215,47 @@
    return warnings
  }
  async function set_current_eval_config(
    eval_config_id: string | null | undefined,
  ) {
    if (!eval_config_id) {
      return
    }
    try {
      const { data, error } = await client.POST(
        "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/set_current_eval_config/{eval_config_id}",
        {
          params: {
            path: {
              project_id: $page.params.project_id,
              task_id: $page.params.task_id,
              eval_id: $page.params.eval_id,
              eval_config_id: eval_config_id,
            },
          },
        },
      )
      if (error) {
        throw error
      }
      // Update the evaluator with the latest
      evaluator = data
    } catch (error) {
      eval_error = createKilnError(error)
    }
  }
</script>

<AppPage
  title="Compare Eval Configs"
  subtitle="Find the evaluator that best matches human-ratings"
  action_buttons={[
    {
      label: "Add Eval Config",
      href: `/evals/${$page.params.project_id}/${$page.params.task_id}/${$page.params.eval_id}/create_eval_config?next_page=eval_configs`,
    },
  ]}
>
  {#if loading}
    <div class="w-full min-h-[50vh] flex justify-center items-center">
@@ -242,16 +284,22 @@
</div>
{/each}
</div>
{#if score_summary && score_summary.dataset_size > 0 && score_summary.dataset_size < 25}
<Warning
warning_message={`There are only ${score_summary.dataset_size} items in your eval-config dataset. This is generally too small to get a good sense of how well your eval-configs perform.`}
warning_color="warning"
tight={true}
/>
{/if}
</div>
</div>
<div class="mt-16">
{#if eval_configs?.length}
<div class="flex flex-col lg:flex-row gap-4 lg:gap-8 mb-6">
<div class="grow">
<div class="text-xl font-bold">Correlation to Human Scores</div>
<div class="text-xl font-bold">Correlation to Human Ratings</div>
<div class="text-xs text-gray-500">
How each eval config correlates to human scores (ratings from the
dataset tab).
How each eval config correlates to human ratings.
</div>
{#if score_summary_error}
<div class="text-error text-sm">
@@ -279,13 +327,14 @@
</div>

<!-- Warn the user if some evals are incomplete -->

{#if incomplete_warning(score_summary).length}
<div class="mt-6 mb-4">
<Warning
warning_message={`There are issues you should resolve before analyzing this data.`}
tight={true}
/>
<ul class="list-disc list-inside text-error">
<ul class="list-disc list-inside text-error pl-2 pt-2">
{#each incomplete_warning(score_summary) as warning}
<li>{warning}</li>
{/each}
@@ -370,6 +419,18 @@
<!-- We have results, but not for this run config -->
<div class="text-sm text-error">0% complete</div>
{/if}
{#if eval_config.id == evaluator.current_config_id}
<div class="badge badge-primary mt-2">Default</div>
{:else}
<button
class="link text-sm text-gray-500"
on:click={() => {
set_current_eval_config(eval_config.id)
}}
>
Set as default
</button>
{/if}
</td>
<td>
<div class="max-w-[600px] min-w-[200px]">
