Commit

Save intermediate outputs from evals, and display them
scosman committed Feb 28, 2025
1 parent 9e31b8c commit e514d63
Showing 8 changed files with 123 additions and 24 deletions.
15 changes: 11 additions & 4 deletions app/web_ui/src/lib/api_schema.d.ts
@@ -995,11 +995,11 @@ export interface components {
/** Mean Normalized Squared Error */
mean_normalized_squared_error: number;
/** Spearman Correlation */
- spearman_correlation: number;
+ spearman_correlation: number | null;
/** Pearson Correlation */
- pearson_correlation: number;
+ pearson_correlation: number | null;
/** Kendalltau Correlation */
- kendalltau_correlation: number;
+ kendalltau_correlation: number | null;
};
/**
* CreateDatasetSplitRequest
@@ -1511,6 +1511,13 @@ export interface components {
* @description The output of the task. JSON formatted for structured output, plaintext for unstructured output.
*/
output: string;
+ /**
+ * Intermediate Outputs
+ * @description The intermediate outputs of the task.
+ */
+ intermediate_outputs?: {
+ [key: string]: string;
+ } | null;
/**
* Scores
* @description The scores of the evaluator (specifically the EvalConfig this object is a child of).
@@ -1813,7 +1820,7 @@
* Where models have instruct and raw versions, instruct is default and raw is specified.
* @enum {string}
*/
ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b" | "deepseek_r1_distill_qwen_1p5b" | "deepseek_r1_distill_qwen_7b" | "deepseek_r1_distill_llama_8b";
ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b" | "deepseek_r1_distill_qwen_1p5b" | "deepseek_r1_distill_qwen_7b" | "deepseek_r1_distill_llama_8b" | "dolphin_2_9_8x22b";
/**
* ModelProviderName
* @description Enumeration of supported AI model providers.
1 change: 1 addition & 0 deletions app/web_ui/src/lib/types.ts
@@ -30,3 +30,4 @@ export type EvalResultSummary = components["schemas"]["EvalResultSummary"]
export type EvalRunResult = components["schemas"]["EvalRunResult"]
export type EvalConfigCompareSummary =
components["schemas"]["EvalConfigCompareSummary"]
+ export type EvalRun = components["schemas"]["EvalRun"]
@@ -6,6 +6,7 @@
EvalRunResult,
Eval,
EvalConfig,
+ EvalRun,
TaskRunConfig,
} from "$lib/types"
import { client } from "$lib/api_client"
@@ -29,6 +30,8 @@
let results_error: KilnError | null = null
let results_loading = true
let peek_dialog: Dialog | null = null
+ let thinking_dialog: Dialog | null = null
+ let displayed_result: EvalRun | null = null
onMount(async () => {
peek_dialog?.show()
@@ -179,8 +182,8 @@
<table class="table">
<thead>
<tr>
- <th>Input</th>
- <th>Output</th>
+ <th>Input & Output</th>
+ <th>Thinking</th>
{#each results.eval.output_scores as score}
<th class="text-center">
{score.name}
@@ -192,8 +195,47 @@
<tbody>
{#each results.results as result}
<tr>
- <td> {result.input} </td>
- <td> {result.output} </td>
+ <td>
+ <div class="font-medium">Input:</div>
+ <div>
+ {result.input}
+ </div>
+ <div class="font-medium mt-4">Output:</div>
+ <div>
+ {result.output}
+ </div>
+ </td>
+ <td>
+ {#if result.intermediate_outputs?.reasoning || result.intermediate_outputs?.chain_of_thought}
+ <div class="max-w-[600px] min-w-[200px]">
+ <div class="max-h-[140px] overflow-y-hidden relative">
+ {result.intermediate_outputs?.reasoning ||
+ result.intermediate_outputs?.chain_of_thought ||
+ "N/A"}
+ <div class="absolute bottom-0 left-0 w-full">
+ <div
+ class="h-36 bg-gradient-to-t from-white to-transparent"
+ ></div>
+ <div
+ class="text-center bg-white font-medium font-sm text-gray-500"
+ >
+ <button
+ class="text-gray-500"
+ on:click={() => {
+ displayed_result = result
+ thinking_dialog?.show()
+ }}
+ >
+ See all
+ </button>
+ </div>
+ </div>
+ </div>
+ </div>
+ {:else}
+ N/A
+ {/if}
+ </td>
{#each results.eval.output_scores as score}
{@const score_value =
result.scores[string_to_json_key(score.name)]}
@@ -245,3 +287,20 @@
</div>
</div>
</Dialog>

+ <Dialog
+ bind:this={thinking_dialog}
+ title="Thinking Output"
+ action_buttons={[
+ {
+ label: "Close",
+ isCancel: true,
+ },
+ ]}
+ >
+ <div class="font-light text-sm whitespace-pre-wrap">
+ {displayed_result?.intermediate_outputs?.reasoning ||
+ displayed_result?.intermediate_outputs?.chain_of_thought ||
+ "N/A"}
+ </div>
+ </Dialog>
16 changes: 11 additions & 5 deletions libs/core/kiln_ai/adapters/eval/base_eval.py
@@ -1,5 +1,6 @@
import json
from abc import abstractmethod
+ from typing import Dict

from kiln_ai.adapters.adapter_registry import adapter_for_task
from kiln_ai.adapters.ml_model_list import ModelProviderName
@@ -40,7 +41,9 @@ def model_and_provider(self) -> tuple[str, ModelProviderName]:

return model_name, ModelProviderName(provider)

- async def run_task_and_eval(self, input: str) -> tuple[TaskRun, EvalScores]:
+ async def run_task_and_eval(
+ self, input: str
+ ) -> tuple[TaskRun, EvalScores, Dict[str, str] | None]:
if self.run_config is None:
raise ValueError("Run config is required for run_task_and_eval")

@@ -59,14 +62,17 @@ async def run_task_and_eval(self, input: str) -> tuple[TaskRun, EvalScores]:
# we don't save by default here. We'll save manually after validating the output
run_output = await run_adapter.invoke(parsed_input)

- eval_output = await self.run_eval(run_output)
+ eval_output, intermediate_outputs = await self.run_eval(run_output)
validate_schema(eval_output, self.score_schema)

- return run_output, eval_output
+ return run_output, eval_output, intermediate_outputs

@abstractmethod
- # Runs the eval on the given task run and returns a dictionary of scores which should conform to the score schema
- async def run_eval(self, task_run: TaskRun) -> EvalScores:
+ # Runs the eval on the given task run
+ # Returns a dictionary of scores which should conform to the score schema, and a dictionary of intermediate outputs
+ async def run_eval(
+ self, task_run: TaskRun
+ ) -> tuple[EvalScores, Dict[str, str] | None]:
pass

@classmethod
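For orientation: with this change, a concrete evaluator returns a `(scores, intermediate_outputs)` tuple rather than scores alone. Below is a minimal sketch of a conforming subclass; the `LengthHeuristicEval` class, its `verbosity` score key, and the plain `Dict[str, float]` stand-in for `EvalScores` are illustrative assumptions, not code from this commit.

```python
from typing import Dict

from kiln_ai.adapters.eval.base_eval import BaseEval
from kiln_ai.datamodel.task_run import TaskRun


class LengthHeuristicEval(BaseEval):
    """Toy evaluator showing the new two-part return value (hypothetical example)."""

    async def run_eval(
        self, task_run: TaskRun
    ) -> tuple[Dict[str, float], Dict[str, str] | None]:
        output = task_run.output.output
        # Scores must still conform to the eval's score schema.
        scores = {"verbosity": min(len(output) / 1000.0, 1.0)}
        # New: optional intermediate outputs. The web UI looks for
        # "reasoning" or "chain_of_thought" keys when rendering.
        intermediate = {"reasoning": f"Output was {len(output)} characters long."}
        return scores, intermediate
```

The runner then forwards whatever dictionary the evaluator returns into `EvalRun.intermediate_outputs`, as the `eval_runner.py` changes below show.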
17 changes: 12 additions & 5 deletions libs/core/kiln_ai/adapters/eval/eval_runner.py
@@ -1,4 +1,5 @@
import asyncio
+ import logging
from dataclasses import dataclass
from typing import AsyncGenerator, Dict, List, Literal, Set

@@ -10,6 +11,8 @@
from kiln_ai.datamodel.task import TaskRunConfig
from kiln_ai.datamodel.task_run import TaskRun

+ logger = logging.getLogger(__name__)
+

@dataclass
class EvalJob:
@@ -227,15 +230,18 @@ async def run_job(self, job: EvalJob) -> bool:

task_output: str | None = None
scores: EvalScores | None = None
+ intermediate_outputs: Dict[str, str] | None = None
if job.type == "eval_config_eval":
# Eval config eval, we use the saved input from the task run, not invoking the task again
- scores = await evaluator.run_eval(job.item)
+ scores, intermediate_outputs = await evaluator.run_eval(job.item)
task_output = job.item.output.output
else:
# Task run eval, we invoke the task again to get a fresh output
- result_task_run, scores = await evaluator.run_task_and_eval(
- job.item.input
- )
+ (
+ result_task_run,
+ scores,
+ intermediate_outputs,
+ ) = await evaluator.run_task_and_eval(job.item.input)
task_output = result_task_run.output.output

# Save the job result
Expand All @@ -249,10 +255,11 @@ async def run_job(self, job: EvalJob) -> bool:
scores=scores,
input=job.item.input,
output=task_output,
+ intermediate_outputs=intermediate_outputs,
)
eval_run.save_to_file()

return True
except Exception as e:
print(f"Error running eval job for dataset item {job.item.id}: {e}")
logger.error(f"Error running eval job for dataset item {job.item.id}: {e}")
return False
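Side note on the `print` → `logger.error` switch above: job failures now go through Python's standard `logging` module, so they are only visible if the host application configures logging. A minimal, assumed local setup (not part of this commit):

```python
import logging

# Route eval-runner errors (and anything else at ERROR level) to the console.
logging.basicConfig(level=logging.ERROR)
```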
10 changes: 7 additions & 3 deletions libs/core/kiln_ai/adapters/eval/g_eval.py
@@ -88,7 +88,9 @@ def __init__(self, eval_config: EvalConfig, run_config: RunConfig | None):

self.geval_task = GEvalTask(eval_config)

- async def run_eval(self, task_run: TaskRun) -> EvalScores:
+ async def run_eval(
+ self, task_run: TaskRun
+ ) -> tuple[EvalScores, Dict[str, str] | None]:
"""
Run this G-Eval on the given task run.
"""
@@ -128,9 +130,11 @@ async def run_eval(self, task_run: TaskRun) -> EvalScores:
_, run_output = await adapter.invoke_returning_run_output(input)

if self.eval_config.config_type == EvalConfigType.llm_as_judge:
- return self.build_llm_as_judge_score(run_output)
+ return self.build_llm_as_judge_score(
+ run_output
+ ), run_output.intermediate_outputs
else:
- return self.build_g_eval_score(run_output)
+ return self.build_g_eval_score(run_output), run_output.intermediate_outputs

def build_llm_as_judge_score(self, run_output: RunOutput) -> EvalScores:
"""
17 changes: 14 additions & 3 deletions libs/core/kiln_ai/adapters/eval/test_eval_runner.py
@@ -1,3 +1,4 @@
+ from typing import Dict
from unittest.mock import AsyncMock, patch

import pytest
@@ -484,12 +485,17 @@ async def test_run_job_success_task_run_eval(
input="test input",
input_source=data_source,
output=TaskOutput(output="evaluated output"),
+ intermediate_outputs={"intermediate_output": "intermediate output"},
)
mock_scores = {"accuracy": 0.95}

class MockEvaluator(BaseEval):
async def run_task_and_eval(self, input_text):
- return mock_result_run, mock_scores
+ return (
+ mock_result_run,
+ mock_scores,
+ {"intermediate_output": "intermediate output"},
+ )

with patch(
"kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
@@ -508,6 +514,9 @@ async def run_task_and_eval(self, input_text):
assert saved_run.scores == mock_scores
assert saved_run.input == "test input"
assert saved_run.output == "evaluated output"
+ assert saved_run.intermediate_outputs == {
+ "intermediate_output": "intermediate output"
+ }
assert saved_run.parent_eval_config().id == mock_eval_config.id
assert saved_run.eval_config_eval is False

@@ -544,8 +553,10 @@ class MockEvaluator(BaseEval):
async def run_task_and_eval(self, input_text):
raise ValueError("Attempted to run task and eval for a config eval")

- async def run_eval(self, task_run: TaskRun) -> EvalScores:
- return mock_scores
+ async def run_eval(
+ self, task_run: TaskRun
+ ) -> tuple[EvalScores, Dict[str, str] | None]:
+ return mock_scores, {"intermediate_output": "intermediate output"}

with patch(
"kiln_ai.adapters.eval.eval_runner.eval_adapter_from_type",
4 changes: 4 additions & 0 deletions libs/core/kiln_ai/datamodel/eval.py
@@ -101,6 +101,10 @@ class EvalRun(KilnParentedModel):
output: str = Field(
description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
)
+ intermediate_outputs: Dict[str, str] | None = Field(
+ default=None,
+ description="The intermediate outputs of the task.",
+ )
scores: EvalScores = Field(
description="The scores of the evaluator (specifically the EvalConfig this object is a child of)."
)
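A rough consumer-side sketch, assuming `EvalRun` is importable from `kiln_ai.datamodel.eval` (implied by this hunk but not shown): reading the new field with the same fallback order the web UI uses.

```python
from kiln_ai.datamodel.eval import EvalRun


def thinking_text(run: EvalRun) -> str:
    """Prefer 'reasoning', then 'chain_of_thought', else a placeholder, matching the UI."""
    outputs = run.intermediate_outputs or {}
    return outputs.get("reasoning") or outputs.get("chain_of_thought") or "N/A"
```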
