Commit

More improve copy/UI.
scosman committed Feb 26, 2025
1 parent 43eb784 commit 36c064d
Showing 8 changed files with 135 additions and 99 deletions.
@@ -52,14 +52,14 @@

<AppPage
title="Evals"
subtitle="Evaluate models, prompts, and more."
subtitle="Evaluate the quality of models, prompts, fine-tunes, and more."
sub_subtitle={is_empty ? undefined : "Read the Docs"}
sub_subtitle_link="https://docs.getkiln.ai/docs/evaluationsTODO"
action_buttons={is_empty
? []
: [
{
label: "Create Evaluator",
label: "New Evaluator",
href: `/evals/${project_id}/${task_id}/create_evaluator`,
primary: true,
},
@@ -27,9 +27,9 @@
import PromptTypeSelector from "../../../../run/prompt_type_selector.svelte"
import Warning from "$lib/ui/warning.svelte"
import { string_to_json_key } from "$lib/utils/json_schema_editor/json_schema_templates"
import InfoTooltip from "$lib/ui/info_tooltip.svelte"
import RunEval from "./run_eval.svelte"
import { eval_config_to_ui_name } from "$lib/utils/formatters"
import OutputTypeTablePreview from "./output_type_table_preview.svelte"
$: project_id = $page.params.project_id
$: task_id = $page.params.task_id
@@ -249,6 +249,10 @@
value: evaluator.description,
})
}
properties.push({
name: "ID",
value: evaluator.id || "unknown",
})
let outputs = []
for (const output of evaluator.output_scores) {
outputs.push(output.name + " (" + output.type + ")")
@@ -264,11 +268,11 @@
eval_set_size = " (" + score_summary.dataset_size + " items)"
}
properties.push({
name: "Eval Set",
name: "Eval Dataset",
value: evaluator.eval_set_filter_id + eval_set_size,
})
properties.push({
name: "Config Eval Set",
name: "Eval Method Dataset",
value: evaluator.eval_configs_filter_id,
})
return properties
@@ -297,7 +301,7 @@
const properties: UiProperty[] = []
properties.push({
name: "Type",
name: "Algorithm",
value: eval_config_to_ui_name(eval_config.config_type),
})
properties.push({
@@ -308,7 +312,7 @@
),
})
properties.push({
name: "Eval Provider",
name: "Model Provider",
value: provider_name_from_id(
eval_config.model.properties["model_provider"] + "",
),
@@ -415,7 +419,7 @@
subtitle={evaluator?.name}
action_buttons={[
{
label: "Compare Eval Configs",
label: "Compare Evaluation Methods",
href: `/evals/${project_id}/${task_id}/${eval_id}/eval_configs`,
},
]}
@@ -447,10 +451,19 @@
</div>
{/each}
</div>
{#if score_summary && score_summary.dataset_size > 0 && score_summary.dataset_size < 25}
<div class="mt-4">
<Warning
warning_message={`There are only ${score_summary.dataset_size} item(s) in your eval dataset. This is generally too small to get a good sense of how well your task run methods perform.`}
warning_color="warning"
tight={true}
/>
</div>
{/if}
</div>
<div class="grow basis-1/2 flex flex-col gap-4">
<div>
<div class="text-xl font-bold">Evaluator Config</div>
<div class="text-xl font-bold">Evaluation Method</div>
<div class="text-sm text-gray-500 mb-2">
How the task outputs will be evaluated.
</div>
@@ -475,7 +488,7 @@
{property.value}
</div>
{/each}
<div class="flex items-center">Quality</div>
<div class="flex items-center">Eval Method Quality</div>
<div class="flex items-center text-gray-500 overflow-x-hidden">
<a
href={`/evals/${project_id}/${task_id}/${eval_id}/eval_configs`}
@@ -491,9 +504,16 @@
{#if task_run_configs?.length}
<div class="flex flex-col lg:flex-row gap-4 lg:gap-8 mb-6">
<div class="grow">
<div class="text-xl font-bold">Results Summary</div>
<div class="text-xl font-bold">Compare Run Methods</div>

<div class="text-xs text-gray-500">
How various task run configs perform on the selected evaluator{current_eval_config
Compare to find the best method of running your task (various
prompts, models, fine-tunes, etc).
</div>
<div class="text-xs text-gray-500 pt-2">
Scores are generated by running the 'run method' on each item of
your Eval Dataset, generating task outputs, then evaluating those
outputs with the selected evaluation method{current_eval_config
? ` (${current_eval_config.name})`
: ""}.
</div>
@@ -504,20 +524,19 @@
</div>
{/if}
</div>
<div>
<div class="shrink-0">
{#if eval_state === "not_started"}
<button
class="btn btn-mid mr-2"
on:click={() => {
add_task_config_dialog?.show()
}}>Add Run Config</button
}}>Add Run Method</button
>
{/if}
<RunEval
bind:eval_state
bind:run_url={run_eval_url}
on_run_complete={() => {
console.log("run complete")
get_score_summary()
}}
/>
@@ -532,7 +551,7 @@
data-tip="Running evals will update any missing dataset items, without re-running complete items. If some evals consistently fail, check the logs; it is likely that the model is failing on the task or the eval."
>
<Warning
warning_message={`Some evals are incomplete and should be excluded from analysis. Run evals to complete their dataset.`}
warning_message={`Some evals are incomplete and should be excluded from analysis. Click 'Run Eval' to generate missing results.`}
tight={true}
/>
</button>
@@ -544,36 +563,15 @@
<thead>
<tr>
<th>
<div>Run Config</div>
<div>Run Method</div>
<div class="font-normal">How task output is generated</div>
</th>
{#each evaluator.output_scores as output_score}
<th class="text-center">
{output_score.name}
<div class="font-normal">
{#if output_score.type === "five_star"}
1 to 5
<span class="ml-[-5px]">
<InfoTooltip
tooltip_text="1 to 5 stars, where 5 is best"
/>
</span>
{:else if output_score.type === "pass_fail"}
pass/fail
<span class="ml-[-5px]">
<InfoTooltip tooltip_text="0 is fail and 1 is pass" />
</span>
{:else if output_score.type === "pass_fail_critical"}
pass/fail/critical
<span class="ml-[-5px]">
<InfoTooltip
tooltip_text="-1 is critical failure, 0 is fail, and 1 is pass"
/>
</span>
{:else}
{output_score.type}
{/if}
</div>
<OutputTypeTablePreview
output_score_type={output_score.type}
/>
</th>
{/each}
</tr>
@@ -648,9 +646,9 @@
<div
class="font-light text-sm max-w-[400px] mx-auto flex flex-col gap-2 mt-8"
>
<div class="font-medium text-lg">Create a Run Config</div>
<div class="font-medium text-lg">Create a Run Method</div>
<div>
A task run config defines how the task is run, such as which model
A task run method defines how the task is run, such as which model
and prompt to use. Create one to run this evaluator.
</div>
<button
@@ -669,7 +667,7 @@

<Dialog
bind:this={add_task_config_dialog}
title="Add a Task Run Config"
title="Add a Task Run Method"
action_buttons={[
{
label: "Cancel",
@@ -683,10 +681,10 @@
]}
>
<h4 class="text-sm text-gray-500">
Create a task run config, defining a way to run this task (model+prompt).
Define a method of running this task (model+prompt).
</h4>
<h4 class="text-sm text-gray-500 mt-1">
Your evaluator can compare multiple run configs to find the best one for
Your evaluator can compare multiple run methods to find the best one for
running this task.
</h4>
<div class="flex flex-col gap-2 pt-6">
@@ -11,6 +11,7 @@
import { onMount, tick } from "svelte"
import { page } from "$app/stores"
import { string_to_json_key } from "$lib/utils/json_schema_editor/json_schema_templates"
import { eval_config_to_ui_name } from "$lib/utils/formatters"
import {
model_info,
load_model_info,
@@ -20,6 +21,7 @@
load_available_prompts,
load_available_models,
} from "$lib/stores"
import OutputTypeTablePreview from "../../../output_type_table_preview.svelte"
let results: EvalRunResult | null = null
let results_error: KilnError | null = null
@@ -73,7 +75,7 @@
return {}
}
return {
Name: run_config.name,
"Run Method Name": run_config.name,
Model: model_name(
run_config.run_config_properties?.model_name,
$model_info,
@@ -82,7 +84,7 @@
run_config.run_config_properties?.model_provider_name,
),
Prompt: prompt_name_from_id(run_config.run_config_properties?.prompt_id),
"Input Source": evaluator.eval_set_filter_id,
"Task Inputs Dataset": evaluator.eval_set_filter_id,
}
}
@@ -94,14 +96,14 @@
return {}
}
return {
Name: evaluator.name,
"Eval Config Name": eval_config.name,
"Eval Type": eval_config.config_type,
"Eval Model": model_name(
"Eval Name": evaluator.name,
"Eval Method Name": eval_config.name,
Algorithm: eval_config_to_ui_name(eval_config.config_type),
Model: model_name(
eval_config.model.properties["model_name"] + "",
$model_info,
),
"Eval Provider": provider_name_from_id(
"Model Provider": provider_name_from_id(
eval_config.model.properties["model_provider"] + "",
),
}
@@ -110,7 +112,7 @@

<AppPage
title="Eval Results"
subtitle="Evaluating a task run config, with an evaluator."
subtitle="Evaluating a task run method with an evaluation method."
>
{#if results_loading}
<div class="w-full min-h-[50vh] flex justify-center items-center">
Expand All @@ -131,15 +133,15 @@
>
<div class="font-medium">Eval Results Empty</div>
<div class="text-error text-sm">
No results found for this run config.
No results found for this run method.
</div>
</div>
{:else if results}
<div class="flex flex-col xl:flex-row gap-8 xl:gap-16 mb-8">
<div class="grow basis-1/2">
<div class="text-xl font-bold">Task Run Config</div>
<div class="text-xl font-bold">Task Run Method</div>
<div class="text-sm text-gray-500 mb-4">
How the outputs were generated.
How the task outputs were generated.
</div>
<div
class="grid grid-cols-[auto,1fr] gap-y-2 gap-x-4 text-sm 2xl:text-base"
@@ -153,9 +155,9 @@
</div>
</div>
<div class="grow basis-1/2">
<div class="text-xl font-bold">Evaluator</div>
<div class="text-xl font-bold">Evaluation Method</div>
<div class="text-sm text-gray-500 mb-4">
How the outputs were evaluated.
How the task outputs were evaluated.
</div>
<div
class="grid grid-cols-[auto,1fr] gap-y-2 gap-x-4 text-sm 2xl:text-base"
@@ -176,7 +178,10 @@
<th>Input</th>
<th>Output</th>
{#each results.eval.output_scores as score}
<th class="text-center">{score.name}</th>
<th class="text-center">
{score.name}
<OutputTypeTablePreview output_score_type={score.type} />
</th>
{/each}
</tr>
</thead>
@@ -224,8 +224,9 @@

<div class="max-w-[1400px]">
<AppPage
title="Add an Evaluator Config"
subtitle="Eval configs specify how an eval is run (models, prompts, etc). Multiple configs can be added to the same evaluator."
title="Add an Evaluation Method"
subtitle="An evaluation method specifies how an eval is run (algorithm, model, prompt, etc)."
sub_subtitle="Multiple evaluation methods can be added to the same evaluator, then compared to find the most accurate."
>
{#if loading}
<div class="w-full min-h-[50vh] flex justify-center items-center">
@@ -311,8 +312,9 @@
</div>
<div class="text-xs text-gray-500">
<div>
Include a short description of what this task does for the
evaluator to use as context.
Include a short description of what this task does. The
evaluator will use this for context. Keep it short, ideally one
sentence. Include more detailed requirements in steps below.
</div>
</div>
</div>