Commit

More improve copy/UI.
scosman committed Feb 26, 2025
1 parent 43eb784 commit 36c064d
Showing 8 changed files with 135 additions and 99 deletions.
@@ -52,14 +52,14 @@

<AppPage
title="Evals"
subtitle="Evaluate models, prompts, and more."
subtitle="Evaluate the quality of models, prompts, fine-tunes, and more."
sub_subtitle={is_empty ? undefined : "Read the Docs"}
sub_subtitle_link="https://docs.getkiln.ai/docs/evaluationsTODO"
action_buttons={is_empty
? []
: [
{
label: "Create Evaluator",
label: "New Evaluator",
href: `/evals/${project_id}/${task_id}/create_evaluator`,
primary: true,
},
@@ -27,9 +27,9 @@
import PromptTypeSelector from "../../../../run/prompt_type_selector.svelte"
import Warning from "$lib/ui/warning.svelte"
import { string_to_json_key } from "$lib/utils/json_schema_editor/json_schema_templates"
import InfoTooltip from "$lib/ui/info_tooltip.svelte"
import RunEval from "./run_eval.svelte"
import { eval_config_to_ui_name } from "$lib/utils/formatters"
import OutputTypeTablePreview from "./output_type_table_preview.svelte"
$: project_id = $page.params.project_id
$: task_id = $page.params.task_id
@@ -249,6 +249,10 @@
value: evaluator.description,
})
}
properties.push({
name: "ID",
value: evaluator.id || "unknown",
})
let outputs = []
for (const output of evaluator.output_scores) {
outputs.push(output.name + " (" + output.type + ")")
@@ -264,11 +268,11 @@
eval_set_size = " (" + score_summary.dataset_size + " items)"
}
properties.push({
name: "Eval Set",
name: "Eval Dataset",
value: evaluator.eval_set_filter_id + eval_set_size,
})
properties.push({
name: "Config Eval Set",
name: "Eval Method Dataset",
value: evaluator.eval_configs_filter_id,
})
return properties
@@ -297,7 +301,7 @@
const properties: UiProperty[] = []
properties.push({
name: "Type",
name: "Algorithm",
value: eval_config_to_ui_name(eval_config.config_type),
})
properties.push({
@@ -308,7 +312,7 @@
),
})
properties.push({
name: "Eval Provider",
name: "Model Provider",
value: provider_name_from_id(
eval_config.model.properties["model_provider"] + "",
),
@@ -415,7 +419,7 @@
subtitle={evaluator?.name}
action_buttons={[
{
label: "Compare Eval Configs",
label: "Compare Evaluation Methods",
href: `/evals/${project_id}/${task_id}/${eval_id}/eval_configs`,
},
]}
@@ -447,10 +451,19 @@
</div>
{/each}
</div>
{#if score_summary && score_summary.dataset_size > 0 && score_summary.dataset_size < 25}
<div class="mt-4">
<Warning
warning_message={`There are only ${score_summary.dataset_size} item(s) in your eval dataset. This is generally too small to get a good sense of how well your task run methods perform.`}
warning_color="warning"
tight={true}
/>
</div>
{/if}
</div>
<div class="grow basis-1/2 flex flex-col gap-4">
<div>
<div class="text-xl font-bold">Evaluator Config</div>
<div class="text-xl font-bold">Evaluation Method</div>
<div class="text-sm text-gray-500 mb-2">
How the task outputs will be evaluated.
</div>
@@ -475,7 +488,7 @@
{property.value}
</div>
{/each}
<div class="flex items-center">Quality</div>
<div class="flex items-center">Eval Method Quality</div>
<div class="flex items-center text-gray-500 overflow-x-hidden">
<a
href={`/evals/${project_id}/${task_id}/${eval_id}/eval_configs`}
@@ -491,9 +504,16 @@
{#if task_run_configs?.length}
<div class="flex flex-col lg:flex-row gap-4 lg:gap-8 mb-6">
<div class="grow">
<div class="text-xl font-bold">Results Summary</div>
<div class="text-xl font-bold">Compare Run Methods</div>

<div class="text-xs text-gray-500">
How various task run configs perform on the selected evaluator{current_eval_config
Compare to find the best method of running your task (various
prompts, models, fine-tunes, etc).
</div>
<div class="text-xs text-gray-500 pt-2">
Scores are generated by running the 'run method' on each item of
your Eval Dataset, generating task outputs, then evaluating those
outputs with the selected evaluation method{current_eval_config
? ` (${current_eval_config.name})`
: ""}.
</div>
@@ -504,20 +524,19 @@
</div>
{/if}
</div>
<div>
<div class="shrink-0">
{#if eval_state === "not_started"}
<button
class="btn btn-mid mr-2"
on:click={() => {
add_task_config_dialog?.show()
}}>Add Run Config</button
}}>Add Run Method</button
>
{/if}
<RunEval
bind:eval_state
bind:run_url={run_eval_url}
on_run_complete={() => {
console.log("run complete")
get_score_summary()
}}
/>
@@ -532,7 +551,7 @@
data-tip="Running evals will update any missing dataset items, without re-running complete items. If some evals consistently fail, check the logs; it is likely that the model is failing on the task or the eval."
>
<Warning
warning_message={`Some evals are incomplete and should be excluded from analysis. Run evals to complete their dataset.`}
warning_message={`Some evals are incomplete and should be excluded from analysis. Click 'Run Eval' to generate missing results.`}
tight={true}
/>
</button>
@@ -544,36 +563,15 @@
<thead>
<tr>
<th>
<div>Run Config</div>
<div>Run Method</div>
<div class="font-normal">How task output is generated</div>
</th>
{#each evaluator.output_scores as output_score}
<th class="text-center">
{output_score.name}
<div class="font-normal">
{#if output_score.type === "five_star"}
1 to 5
<span class="ml-[-5px]">
<InfoTooltip
tooltip_text="1 to 5 stars, where 5 is best"
/>
</span>
{:else if output_score.type === "pass_fail"}
pass/fail
<span class="ml-[-5px]">
<InfoTooltip tooltip_text="0 is fail and 1 is pass" />
</span>
{:else if output_score.type === "pass_fail_critical"}
pass/fail/critical
<span class="ml-[-5px]">
<InfoTooltip
tooltip_text="-1 is critical failure, 0 is fail, and 1 is pass"
/>
</span>
{:else}
{output_score.type}
{/if}
</div>
<OutputTypeTablePreview
output_score_type={output_score.type}
/>
</th>
{/each}
</tr>
@@ -648,9 +646,9 @@
<div
class="font-light text-sm max-w-[400px] mx-auto flex flex-col gap-2 mt-8"
>
<div class="font-medium text-lg">Create a Run Config</div>
<div class="font-medium text-lg">Create a Run Method</div>
<div>
A task run config defines how the task is run, such as which model
A task run method defines how the task is run, such as which model
and prompt to use. Create one to run this evaluator.
</div>
<button
@@ -669,7 +667,7 @@

<Dialog
bind:this={add_task_config_dialog}
title="Add a Task Run Config"
title="Add a Task Run Method"
action_buttons={[
{
label: "Cancel",
@@ -683,10 +681,10 @@
]}
>
<h4 class="text-sm text-gray-500">
Create a task run config, defining a way to run this task (model+prompt).
Define a method of running this task (model+prompt).
</h4>
<h4 class="text-sm text-gray-500 mt-1">
Your evaluator can compare multiple run configs to find the best one for
Your evaluator can compare multiple run methods to find the best one for
running this task.
</h4>
<div class="flex flex-col gap-2 pt-6">
@@ -11,6 +11,7 @@
import { onMount, tick } from "svelte"
import { page } from "$app/stores"
import { string_to_json_key } from "$lib/utils/json_schema_editor/json_schema_templates"
import { eval_config_to_ui_name } from "$lib/utils/formatters"
import {
model_info,
load_model_info,
@@ -20,6 +21,7 @@
load_available_prompts,
load_available_models,
} from "$lib/stores"
import OutputTypeTablePreview from "../../../output_type_table_preview.svelte"
let results: EvalRunResult | null = null
let results_error: KilnError | null = null
@@ -73,7 +75,7 @@
return {}
}
return {
Name: run_config.name,
"Run Method Name": run_config.name,
Model: model_name(
run_config.run_config_properties?.model_name,
$model_info,
@@ -82,7 +84,7 @@
run_config.run_config_properties?.model_provider_name,
),
Prompt: prompt_name_from_id(run_config.run_config_properties?.prompt_id),
"Input Source": evaluator.eval_set_filter_id,
"Task Inputs Dataset": evaluator.eval_set_filter_id,
}
}
@@ -94,14 +96,14 @@
return {}
}
return {
Name: evaluator.name,
"Eval Config Name": eval_config.name,
"Eval Type": eval_config.config_type,
"Eval Model": model_name(
"Eval Name": evaluator.name,
"Eval Method Name": eval_config.name,
Algorithm: eval_config_to_ui_name(eval_config.config_type),
Model: model_name(
eval_config.model.properties["model_name"] + "",
$model_info,
),
"Eval Provider": provider_name_from_id(
"Model Provider": provider_name_from_id(
eval_config.model.properties["model_provider"] + "",
),
}
@@ -110,7 +112,7 @@

<AppPage
title="Eval Results"
subtitle="Evaluating a task run config, with an evaluator."
subtitle="Evaluating a task run method with an evaluation method."
>
{#if results_loading}
<div class="w-full min-h-[50vh] flex justify-center items-center">
Expand All @@ -131,15 +133,15 @@
>
<div class="font-medium">Eval Results Empty</div>
<div class="text-error text-sm">
No results found for this run config.
No results found for this run method.
</div>
</div>
{:else if results}
<div class="flex flex-col xl:flex-row gap-8 xl:gap-16 mb-8">
<div class="grow basis-1/2">
<div class="text-xl font-bold">Task Run Config</div>
<div class="text-xl font-bold">Task Run Method</div>
<div class="text-sm text-gray-500 mb-4">
How the outputs were generated.
How the task outputs were generated.
</div>
<div
class="grid grid-cols-[auto,1fr] gap-y-2 gap-x-4 text-sm 2xl:text-base"
@@ -153,9 +155,9 @@
</div>
</div>
<div class="grow basis-1/2">
<div class="text-xl font-bold">Evaluator</div>
<div class="text-xl font-bold">Evaluation Method</div>
<div class="text-sm text-gray-500 mb-4">
How the outputs were evaluated.
How the task outputs were evaluated.
</div>
<div
class="grid grid-cols-[auto,1fr] gap-y-2 gap-x-4 text-sm 2xl:text-base"
@@ -176,7 +178,10 @@
<th>Input</th>
<th>Output</th>
{#each results.eval.output_scores as score}
<th class="text-center">{score.name}</th>
<th class="text-center">
{score.name}
<OutputTypeTablePreview output_score_type={score.type} />
</th>
{/each}
</tr>
</thead>
@@ -224,8 +224,9 @@

<div class="max-w-[1400px]">
<AppPage
title="Add an Evaluator Config"
subtitle="Eval configs specify how an eval is run (models, prompts, etc). Multiple configs can be added to the same evaluator."
title="Add an Evaluation Method"
subtitle="An evaluation method specifies how an eval is run (algorithm, model, prompt, etc)."
sub_subtitle="Multiple evaluation methods can be added to the same evaluator, then compared to find the most accurate."
>
{#if loading}
<div class="w-full min-h-[50vh] flex justify-center items-center">
@@ -311,8 +312,9 @@
</div>
<div class="text-xs text-gray-500">
<div>
Include a short description of what this task does for the
evaluator to use as context.
Include a short description of what this task does. The
evaluator will use this for context. Keep it short, ideally one
sentence. Include more detailed requirements in steps below.
</div>
</div>
</div>