diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py
index fa32f6b6..f71c1612 100644
--- a/app/desktop/studio_server/eval_api.py
+++ b/app/desktop/studio_server/eval_api.py
@@ -22,7 +22,7 @@
     EvalConfigType,
     EvalOutputScore,
     EvalRun,
-    EvalTemplate,
+    EvalTemplateId,
 )
 from kiln_ai.datamodel.json_schema import string_to_json_key
 from kiln_ai.datamodel.prompt_id import is_frozen_prompt
@@ -47,7 +47,7 @@ def eval_from_id(project_id: str, task_id: str, eval_id: str) -> Eval:

     raise HTTPException(
         status_code=404,
-        detail=f"Task not found. ID: {task_id}",
+        detail=f"Eval not found. ID: {eval_id}",
     )


@@ -79,9 +79,10 @@ def task_run_config_from_id(
     )


-# JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better
 async def run_eval_runner_with_status(eval_runner: EvalRunner) -> StreamingResponse:
-    # Async messages via server side events (SSE)
+    # Yields async messages designed to be used with server sent events (SSE)
+    # https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events
     async def event_generator():
         async for progress in eval_runner.run():
             data = {
@@ -103,7 +103,7 @@ async def event_generator():
 class CreateEvaluatorRequest(BaseModel):
     name: str
     description: str
-    template: EvalTemplate | None
+    template: EvalTemplateId | None
     output_scores: list[EvalOutputScore]
     eval_set_filter_id: DatasetFilterId
     eval_configs_filter_id: DatasetFilterId
@@ -142,18 +142,18 @@ class EvalRunResult(BaseModel):

 class EvalResultSummary(BaseModel):
     # run_config_id -> output_score_id -> ScoreSummary
-    results: Dict[str, Dict[str, ScoreSummary]]
+    results: Dict[ID_TYPE, Dict[str, ScoreSummary]]
     # run_config_id -> percent of the dataset that has been processed
-    run_config_percent_complete: Dict[str, float]
+    run_config_percent_complete: Dict[ID_TYPE, float]
     # The total size of the dataset used for the eval
     dataset_size: int


 class EvalConfigCompareSummary(BaseModel):
     # Summary of results. eval_config_id -> output_score_id -> CorrelationResult
-    results: Dict[str, Dict[str, CorrelationResult]]
+    results: Dict[ID_TYPE, Dict[str, CorrelationResult]]
     # eval_config_id -> percent of the dataset that has been processed (run with eval scores)
-    eval_config_percent_complete: Dict[str, float]
+    eval_config_percent_complete: Dict[ID_TYPE, float]
     # The total size of the dataset used for the eval config comparisons (eval.eval_configs_filter_id set size)
     dataset_size: int
     # The number of dataset items which are fully rated, partially rated, or not rated at all.
@@ -180,9 +180,10 @@ def human_score_from_task_run(
     if score_key == "overall_rating":
         human_score = task_run.output.rating.value
     else:
-        req_rating = task_run.output.rating.requirement_ratings.get(
-            score_key_to_task_requirement_id[score_key], None
-        )
+        req_id = score_key_to_task_requirement_id.get(score_key, None)
+        if req_id is None:
+            return None
+        req_rating = task_run.output.rating.requirement_ratings.get(req_id, None)
         if req_rating is not None:
             human_score = req_rating.value

@@ -199,7 +200,6 @@ def count_human_evals(
     partially_rated_count: int = 0
     not_rated_count: int = 0
     for dataset_item in items:
-        # Check it has all scores
         has_all_scores = True
         has_any_scores = False
         for output_score in eval.output_scores:
@@ -346,8 +346,9 @@ async def create_eval_config(
         eval_config.save_to_file()
         return eval_config

+    # JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better
     @app.get(
-        "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run"
+        "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_task_run_eval"
     )
     async def run_eval_config(
         project_id: str,
@@ -397,6 +398,7 @@ async def set_default_eval_config(
         return eval

+    # JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better
     @app.get(
         "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval"
     )
@@ -440,6 +442,7 @@ async def get_eval_run_results(
             run_config=run_config,
         )

+    # This compares run_configs to each other on a given eval_config. Compare to below which compares eval_configs to each other.
     @app.get(
         "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/score_summary"
     )
@@ -463,29 +466,27 @@ async def get_eval_config_score_summary(
         )

         # save a copy of the expected dataset ids for each run config, we'll update each as we process each eval run
-        remaining_expected_dataset_ids: Dict[str, Set[ID_TYPE]] = {
-            str(run_config.id): set(expected_dataset_ids)
-            for run_config in task_runs_configs
+        remaining_expected_dataset_ids: Dict[ID_TYPE, Set[ID_TYPE]] = {
+            run_config.id: set(expected_dataset_ids) for run_config in task_runs_configs
         }

         # Track how often we are missing scores in a eval_config. Should be 0 for a complete eval_config
-        partial_incomplete_counts: Dict[str, int] = {
-            str(run_config.id): 0 for run_config in task_runs_configs
+        partial_incomplete_counts: Dict[ID_TYPE, int] = {
+            run_config.id: 0 for run_config in task_runs_configs
         }

-        # task_run_config_id -> output_score_id -> score/total
-        total_scores: Dict[str, Dict[str, float]] = {}
-        score_counts: Dict[str, Dict[str, int]] = {}
+        # task_run_config_id -> output_score_json_key -> score/total for calculating the mean score
+        total_scores: Dict[ID_TYPE, Dict[str, float]] = {}
+        score_counts: Dict[ID_TYPE, Dict[str, int]] = {}

-        # important: readonly makes this much faster
         for eval_run in eval_config.runs(readonly=True):
             if eval_run.task_run_config_id is None:
-                # This eval_run is not associated with a run_config, so we can't count it
+                # This eval_run is not associated with a run_config, so we should not count it
                 continue
-            run_config_id = str(eval_run.task_run_config_id)
+            run_config_id = eval_run.task_run_config_id

             # Check if we should count this eval_run. Not every eval_run has to go into the stats:
             # - a dataset_id can be removed from the dataset filter (removed a tag)
-            # - this dataset_id was already counted (not great there are dupes, but really shouldn't be double counted)
+            # - this dataset_id was already counted (not great there are dupes, but shouldn't be double counted if there are)
             if eval_run.dataset_id not in remaining_expected_dataset_ids[run_config_id]:
                 continue
             else:
@@ -513,25 +514,25 @@ async def get_eval_config_score_summary(
                     partial_incomplete_counts[run_config_id] += 1

         # Convert to score summaries
-        results: Dict[str, Dict[str, ScoreSummary]] = {}
+        results: Dict[ID_TYPE, Dict[str, ScoreSummary]] = {}
         for run_config_id, output_scores in total_scores.items():
             results[run_config_id] = {}
             for output_score_id, score in output_scores.items():
-                if score_counts[run_config_id][output_score_id] > 0:
+                count = score_counts[run_config_id][output_score_id]
+                if count > 0:
                     results[run_config_id][output_score_id] = ScoreSummary(
-                        mean_score=score / score_counts[run_config_id][output_score_id]
+                        mean_score=score / count
                     )

         # Calculate the percent of the dataset that has been processed
-        run_config_percent_complete: Dict[str, float] = {}
+        run_config_percent_complete: Dict[ID_TYPE, float] = {}
         for run_config in task_runs_configs:
-            run_config_id = str(run_config.id)
             # Partial incomplete (missing scores), and fully incomplete (no eval_run)
-            incomplete_count = partial_incomplete_counts[run_config_id] + len(
-                remaining_expected_dataset_ids[run_config_id]
+            incomplete_count = partial_incomplete_counts[run_config.id] + len(
+                remaining_expected_dataset_ids[run_config.id]
             )
             percent_incomplete = incomplete_count / len(expected_dataset_ids)
-            run_config_percent_complete[str(run_config.id)] = 1 - percent_incomplete
+            run_config_percent_complete[run_config.id] = 1 - percent_incomplete

         return EvalResultSummary(
             results=results,
@@ -573,18 +574,15 @@ async def get_eval_configs_score_summary(
                 not_rated_count=0,
             )

-        # save a copy of the expected dataset ids for each eval config, we'll update each as we process each eval run
-        remaining_expected_dataset_ids: Dict[str, Set[ID_TYPE]] = {
-            str(eval_config.id): set(expected_dataset_ids)
-            for eval_config in eval_configs
+        # save a copy of the expected dataset ids for each eval config id, we'll update each as we process each eval run
+        remaining_expected_dataset_ids: Dict[ID_TYPE, Set[ID_TYPE]] = {
+            eval_config.id: set(expected_dataset_ids) for eval_config in eval_configs
         }

-        # eval_config_id -> output_score_id -> correlation calculator
-        correlation_calculators: Dict[str, Dict[str, CorrelationCalculator]] = {}
+        # eval_config_id -> output_score_json_key -> correlation calculator
+        correlation_calculators: Dict[ID_TYPE, Dict[str, CorrelationCalculator]] = {}

-        # important: readonly makes this much faster
         for eval_config in eval_configs:
-            eval_config_id = str(eval_config.id)
             for eval_run in eval_config.runs(readonly=True):
                 dataset_item = expected_dataset_items.get(eval_run.dataset_id, None)
                 if dataset_item is None:
@@ -593,14 +591,14 @@ async def get_eval_configs_score_summary(
                     continue

                 # Check if we should count this eval_run. Not every eval_run has to go into the stats:
-                # Example: this dataset_id was already counted (not great there are dupes, but really shouldn't be double counted)
+                # Example: this dataset_id was already counted (not great there are dupes, but shouldn't be double counted if there are)
                 if (
                     eval_run.dataset_id
-                    not in remaining_expected_dataset_ids[eval_config_id]
+                    not in remaining_expected_dataset_ids[eval_config.id]
                 ):
                     continue
                 else:
-                    remaining_expected_dataset_ids[eval_config_id].remove(
+                    remaining_expected_dataset_ids[eval_config.id].remove(
                         eval_run.dataset_id
                     )

@@ -617,13 +615,15 @@ async def get_eval_configs_score_summary(
                         # This score doesn't have both a human eval and eval score, so we can't compare
                         continue

-                    if eval_config_id not in correlation_calculators:
-                        correlation_calculators[eval_config_id] = {}
+                    if eval_config.id not in correlation_calculators:
+                        correlation_calculators[eval_config.id] = {}

-                    if score_key not in correlation_calculators[eval_config_id]:
-                        correlation_calculators[eval_config_id][score_key] = (
-                            CorrelationCalculator()
-                        )
+                    calculator = correlation_calculators[eval_config.id].get(
+                        score_key, None
+                    )
+                    if calculator is None:
+                        calculator = CorrelationCalculator()
+                        correlation_calculators[eval_config.id][score_key] = calculator

                     normalized_eval_score = normalize_rating(
                         eval_score, output_score.type
@@ -631,7 +631,7 @@ async def get_eval_configs_score_summary(
                     normalized_human_score = normalize_rating(
                         human_score, output_score.type
                     )
-                    correlation_calculators[eval_config_id][score_key].add_score(
+                    calculator.add_score(
                         CorrelationScore(
                             measured_score=eval_score,
                             human_score=human_score,
@@ -641,27 +641,26 @@ async def get_eval_configs_score_summary(
                     )

         # Convert to score summaries
-        results: Dict[str, Dict[str, CorrelationResult]] = {}
+        results: Dict[ID_TYPE, Dict[str, CorrelationResult]] = {}
         for eval_config_id in correlation_calculators.keys():
             results[eval_config_id] = {}
             for score_key in correlation_calculators[eval_config_id].keys():
-                if not correlation_calculators[eval_config_id][score_key]:
+                calculator = correlation_calculators[eval_config_id].get(
+                    score_key, None
+                )
+                if calculator is None:
                     # No scores to calculate correlation for this pair
                     continue
-                correlation_result = correlation_calculators[eval_config_id][
-                    score_key
-                ].calculate_correlation()
+                correlation_result = calculator.calculate_correlation()
                 results[eval_config_id][score_key] = correlation_result

         # Calculate the percent of the dataset that has been processed
-        eval_config_percent_complete: Dict[str, float] = {}
+        eval_config_percent_complete: Dict[ID_TYPE, float] = {}
         for eval_config in eval_configs:
-            eval_config_id = str(eval_config.id)
-
-            # Partial incomplete (missing scores), and fully incomplete (no eval_run)
-            incomplete_count = len(remaining_expected_dataset_ids[eval_config_id])
+            incomplete_count = len(remaining_expected_dataset_ids[eval_config.id])
             percent_incomplete = incomplete_count / len(expected_dataset_ids)
-            eval_config_percent_complete[str(eval_config.id)] = 1 - percent_incomplete
+            eval_config_percent_complete[eval_config.id] = 1 - percent_incomplete

         # Count how many dataset items have human evals
         fully_rated_count, partially_rated_count, not_rated_count = count_human_evals(
diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py
index 175dec2a..58a6e2fc 100644
--- a/app/desktop/studio_server/test_eval_api.py
+++ b/app/desktop/studio_server/test_eval_api.py
@@ -27,7 +27,7 @@
     EvalConfigType,
     EvalOutputScore,
     EvalRun,
-    EvalTemplate,
+    EvalTemplateId,
 )
 from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig

@@ -87,7 +87,7 @@ def mock_eval(mock_task):
         id="eval1",
         name="Test Eval",
         description="Test Description",
-        template=EvalTemplate.bias,
+        template=EvalTemplateId.bias,
         output_scores=[
             EvalOutputScore(name="score1", description="desc1", type="five_star"),
             EvalOutputScore(
@@ -177,7 +177,7 @@ def test_get_eval_not_found(client, mock_task, mock_task_from_id):
     response = client.get("/api/projects/project1/tasks/task1/eval/non_existent")

     assert response.status_code == 404
-    assert response.json()["detail"] == "Task not found. ID: task1"
+    assert response.json()["detail"] == "Eval not found. ID: non_existent"


 @pytest.fixture
@@ -428,7 +428,7 @@ async def mock_run():

     # Make request with specific run_config_ids
     response = client.get(
-        "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run",
+        "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run_task_run_eval",
         params={"run_config_ids": ["run_config1", "run_config2"]},
     )

@@ -465,7 +465,7 @@ async def test_run_eval_config_no_run_configs_error(
     # Make request with no run_config_ids and all_run_configs=False
     response = client.get(
-        "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run"
+        "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run_task_run_eval"
     )

     assert response.status_code == 400
diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts
index 0990e615..b2d369b7 100644
--- a/app/web_ui/src/lib/api_schema.d.ts
+++ b/app/web_ui/src/lib/api_schema.d.ts
@@ -793,7 +793,7 @@ export interface paths {
     patch?: never;
     trace?: never;
   };
-  "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run": {
+  "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_task_run_eval": {
     parameters: {
       query?: never;
       header?: never;
       cookie?: never;
     };
     /** Run Eval Config */
-    get: operations["run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_get"];
+    get: operations["run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_task_run_eval_get"];
     put?: never;
     post?: never;
     delete?: never;
@@ -1031,7 +1031,7 @@ export interface components {
       name: string;
       /** Description */
       description: string;
-      template: components["schemas"]["EvalTemplate"] | null;
+      template: components["schemas"]["EvalTemplateId"] | null;
       /** Output Scores */
       output_scores: components["schemas"]["EvalOutputScore"][];
       /** Eval Set Filter Id */
@@ -1330,7 +1330,7 @@ export interface components {
       */
      description?: string | null;
      /** @description The template selected when creating this eval. Useful for suggesting eval steps and output scores. */
-      template?: components["schemas"]["EvalTemplate"] | null;
+      template?: components["schemas"]["EvalTemplateId"] | null;
      /**
       * Current Config Id
       * @description The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.
       */
@@ -1540,11 +1540,11 @@ export interface components {
       run_config: components["schemas"]["TaskRunConfig"];
     };
     /**
-     * EvalTemplate
+     * EvalTemplateId
      * @description An eval template is a pre-defined eval that can be used as a starting point for a new eval.
      * @enum {string}
      */
-    EvalTemplate: "kiln_requirements" | "toxicity" | "bias" | "maliciousness" | "factual_correctness" | "jailbreak";
+    EvalTemplateId: "kiln_requirements" | "toxicity" | "bias" | "maliciousness" | "factual_correctness" | "jailbreak";
     /**
      * FineTuneParameter
      * @description A parameter for a fine-tune. Hyperparameters, etc.
@@ -1818,7 +1818,7 @@
      * Where models have instruct and raw versions, instruct is default and raw is specified.
      * @enum {string}
      */
-    ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b" | "deepseek_r1_distill_qwen_1p5b" | "deepseek_r1_distill_qwen_7b" | "deepseek_r1_distill_llama_8b" | "dolphin_2_9_8x22b";
+    ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "claude_3_7_sonnet" | "claude_3_7_sonnet_thinking" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b" | "deepseek_r1_distill_qwen_1p5b" | "deepseek_r1_distill_qwen_7b" | "deepseek_r1_distill_llama_8b" | "dolphin_2_9_8x22b";
     /**
      * ModelProviderName
      * @description Enumeration of supported AI model providers.
@@ -4262,7 +4262,7 @@
         };
       };
     };
-  run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_get: {
+  run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_task_run_eval_get: {
     parameters: {
       query?: {
         run_config_ids?: string[];
diff --git a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts
index 4ee5b6f0..8419f6d7 100644
--- a/app/web_ui/src/lib/types.ts
+++ b/app/web_ui/src/lib/types.ts
@@ -21,7 +21,7 @@ export type RunSummary = components["schemas"]["RunSummary"]
 export type PromptResponse = components["schemas"]["PromptResponse"]
 export type FinetuneDataStrategy = components["schemas"]["FinetuneDataStrategy"]
 export type EvalOutputScore = components["schemas"]["EvalOutputScore"]
-export type EvalTemplate = components["schemas"]["EvalTemplate"]
+export type EvalTemplateId = components["schemas"]["EvalTemplateId"]
 export type Eval = components["schemas"]["Eval"]
 export type EvalConfigType = components["schemas"]["EvalConfigType"]
 export type EvalConfig = components["schemas"]["EvalConfig"]
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte
index 760b8d7e..f9687c0d 100644
--- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte
+++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte
@@ -336,7 +336,7 @@
     | "running"
     | "complete"
     | "complete_with_errors" = "not_started"
-  $: run_eval_url = `${base_url}/api/projects/${project_id}/tasks/${task_id}/eval/${eval_id}/eval_config/${current_eval_config_id}/run?all_run_configs=true`
+  $: run_eval_url = `${base_url}/api/projects/${project_id}/tasks/${task_id}/eval/${eval_id}/eval_config/${current_eval_config_id}/run_task_run_eval?all_run_configs=true`

   let task_run_config_model_name = ""
   let task_run_config_provider_name = ""
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte
index 7a7496fb..399b2ed1 100644
--- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte
+++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte
@@ -9,7 +9,7 @@
   import { onMount } from "svelte"
   import Warning from "$lib/ui/warning.svelte"
   import AvailableModelsDropdown from "../../../../../run/available_models_dropdown.svelte"
-  import type { Eval, EvalTemplate, Task, EvalConfigType } from "$lib/types"
+  import type { Eval, EvalTemplateId, Task, EvalConfigType } from "$lib/types"
   import { tick } from "svelte"
   import { load_task } from "$lib/stores"
   import { goto } from "$app/navigation"
@@ -18,7 +18,7 @@
   let task_description: string = ""
   let eval_steps: string[] = []

-  type EvalTemplateWithoutKiln = Exclude<EvalTemplate, "kiln_requirements">
+  type EvalTemplateWithoutKiln = Exclude<EvalTemplateId, "kiln_requirements">

   const eval_steps_static_templates: Record<EvalTemplateWithoutKiln, string[]> = {
     toxicity: [
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte
index 87688a4a..de0c034b 100644
--- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte
+++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte
@@ -1,7 +1,7 @@