diff --git a/app/desktop/studio_server/eval_api.py b/app/desktop/studio_server/eval_api.py
index fa32f6b6..f71c1612 100644
--- a/app/desktop/studio_server/eval_api.py
+++ b/app/desktop/studio_server/eval_api.py
@@ -22,7 +22,7 @@
     EvalConfigType,
     EvalOutputScore,
     EvalRun,
-    EvalTemplate,
+    EvalTemplateId,
 )
 from kiln_ai.datamodel.json_schema import string_to_json_key
 from kiln_ai.datamodel.prompt_id import is_frozen_prompt
@@ -47,7 +47,7 @@ def eval_from_id(project_id: str, task_id: str, eval_id: str) -> Eval:

     raise HTTPException(
         status_code=404,
-        detail=f"Task not found. ID: {task_id}",
+        detail=f"Eval not found. ID: {eval_id}",
     )


@@ -79,9 +79,10 @@ def task_run_config_from_id(
     )


-# JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better
 async def run_eval_runner_with_status(eval_runner: EvalRunner) -> StreamingResponse:
-    # Async messages via server side events (SSE)
+    # Yields async messages designed to be used with server sent events (SSE)
+    # https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events
     async def event_generator():
         async for progress in eval_runner.run():
             data = {
@@ -103,7 +103,7 @@ async def event_generator():
 class CreateEvaluatorRequest(BaseModel):
     name: str
     description: str
-    template: EvalTemplate | None
+    template: EvalTemplateId | None
     output_scores: list[EvalOutputScore]
     eval_set_filter_id: DatasetFilterId
     eval_configs_filter_id: DatasetFilterId
@@ -142,18 +142,18 @@ class EvalRunResult(BaseModel):

 class EvalResultSummary(BaseModel):
     # run_config_id -> output_score_id -> ScoreSummary
-    results: Dict[str, Dict[str, ScoreSummary]]
+    results: Dict[ID_TYPE, Dict[str, ScoreSummary]]
     # run_config_id -> percent of the dataset that has been processed
-    run_config_percent_complete: Dict[str, float]
+    run_config_percent_complete: Dict[ID_TYPE, float]
     # The total size of the dataset used for the eval
     dataset_size: int


 class EvalConfigCompareSummary(BaseModel):
     # Summary of results. eval_config_id -> output_score_id -> CorrelationResult
-    results: Dict[str, Dict[str, CorrelationResult]]
+    results: Dict[ID_TYPE, Dict[str, CorrelationResult]]
     # eval_config_id -> percent of the dataset that has been processed (run with eval scores)
-    eval_config_percent_complete: Dict[str, float]
+    eval_config_percent_complete: Dict[ID_TYPE, float]
     # The total size of the dataset used for the eval config comparisons (eval.eval_configs_filter_id set size)
     dataset_size: int
     # The number of dataset items which are fully rated, partially rated, or not rated at all.
@@ -180,9 +180,10 @@ def human_score_from_task_run(
     if score_key == "overall_rating":
         human_score = task_run.output.rating.value
     else:
-        req_rating = task_run.output.rating.requirement_ratings.get(
-            score_key_to_task_requirement_id[score_key], None
-        )
+        req_id = score_key_to_task_requirement_id.get(score_key, None)
+        if req_id is None:
+            return None
+        req_rating = task_run.output.rating.requirement_ratings.get(req_id, None)
         if req_rating is not None:
             human_score = req_rating.value

@@ -199,7 +200,6 @@ def count_human_evals(
     partially_rated_count: int = 0
     not_rated_count: int = 0
     for dataset_item in items:
-        # Check it has all scores
         has_all_scores = True
         has_any_scores = False
         for output_score in eval.output_scores:
@@ -346,8 +346,9 @@ async def create_eval_config(
         eval_config.save_to_file()
         return eval_config

+    # JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better
     @app.get(
-        "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run"
+        "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_task_run_eval"
     )
     async def run_eval_config(
         project_id: str,
@@ -397,6 +398,7 @@ async def set_default_eval_config(
         return eval

+    # JS SSE client (EventSource) doesn't work with POST requests, so we use GET, even though post would be better
     @app.get(
         "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/run_eval_config_eval"
     )
@@ -440,6 +442,7 @@ async def get_eval_run_results(
             run_config=run_config,
         )

+    # This compares run_configs to each other on a given eval_config. Compare to below which compares eval_configs to each other.
     @app.get(
         "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/score_summary"
     )
@@ -463,29 +466,27 @@ async def get_eval_config_score_summary(
         )

         # save a copy of the expected dataset ids for each run config, we'll update each as we process each eval run
-        remaining_expected_dataset_ids: Dict[str, Set[ID_TYPE]] = {
-            str(run_config.id): set(expected_dataset_ids)
-            for run_config in task_runs_configs
+        remaining_expected_dataset_ids: Dict[ID_TYPE, Set[ID_TYPE]] = {
+            run_config.id: set(expected_dataset_ids) for run_config in task_runs_configs
         }

         # Track how often we are missing scores in a eval_config. Should be 0 for a complete eval_config
-        partial_incomplete_counts: Dict[str, int] = {
-            str(run_config.id): 0 for run_config in task_runs_configs
+        partial_incomplete_counts: Dict[ID_TYPE, int] = {
+            run_config.id: 0 for run_config in task_runs_configs
         }

-        # task_run_config_id -> output_score_id -> score/total
-        total_scores: Dict[str, Dict[str, float]] = {}
-        score_counts: Dict[str, Dict[str, int]] = {}
+        # task_run_config_id -> output_score_json_key -> score/total for calculating the mean score
+        total_scores: Dict[ID_TYPE, Dict[str, float]] = {}
+        score_counts: Dict[ID_TYPE, Dict[str, int]] = {}

-        # important: readonly makes this much faster
         for eval_run in eval_config.runs(readonly=True):
             if eval_run.task_run_config_id is None:
-                # This eval_run is not associated with a run_config, so we can't count it
+                # This eval_run is not associated with a run_config, so we should not count it
                 continue
-            run_config_id = str(eval_run.task_run_config_id)
+            run_config_id = eval_run.task_run_config_id

             # Check if we should count this eval_run. Not every eval_run has to go into the stats:
             # - a dataset_id can be removed from the dataset filter (removed a tag)
-            # - this dataset_id was already counted (not great there are dupes, but really shouldn't be double counted)
+            # - this dataset_id was already counted (not great there are dupes, but shouldn't be double counted if there are)
             if eval_run.dataset_id not in remaining_expected_dataset_ids[run_config_id]:
                 continue
             else:
@@ -513,25 +514,25 @@ async def get_eval_config_score_summary(
                     partial_incomplete_counts[run_config_id] += 1

         # Convert to score summaries
-        results: Dict[str, Dict[str, ScoreSummary]] = {}
+        results: Dict[ID_TYPE, Dict[str, ScoreSummary]] = {}
         for run_config_id, output_scores in total_scores.items():
             results[run_config_id] = {}
             for output_score_id, score in output_scores.items():
-                if score_counts[run_config_id][output_score_id] > 0:
+                count = score_counts[run_config_id][output_score_id]
+                if count > 0:
                     results[run_config_id][output_score_id] = ScoreSummary(
-                        mean_score=score / score_counts[run_config_id][output_score_id]
+                        mean_score=score / count
                     )

         # Calculate the percent of the dataset that has been processed
-        run_config_percent_complete: Dict[str, float] = {}
+        run_config_percent_complete: Dict[ID_TYPE, float] = {}
         for run_config in task_runs_configs:
-            run_config_id = str(run_config.id)
             # Partial incomplete (missing scores), and fully incomplete (no eval_run)
-            incomplete_count = partial_incomplete_counts[run_config_id] + len(
-                remaining_expected_dataset_ids[run_config_id]
+            incomplete_count = partial_incomplete_counts[run_config.id] + len(
+                remaining_expected_dataset_ids[run_config.id]
             )
             percent_incomplete = incomplete_count / len(expected_dataset_ids)
-            run_config_percent_complete[str(run_config.id)] = 1 - percent_incomplete
+            run_config_percent_complete[run_config.id] = 1 - percent_incomplete

         return EvalResultSummary(
             results=results,
@@ -573,18 +574,15 @@ async def get_eval_configs_score_summary(
                 not_rated_count=0,
             )

-        # save a copy of the expected dataset ids for each eval config, we'll update each as we process each eval run
-        remaining_expected_dataset_ids: Dict[str, Set[ID_TYPE]] = {
-            str(eval_config.id): set(expected_dataset_ids)
-            for eval_config in eval_configs
+        # save a copy of the expected dataset ids for each eval config id, we'll update each as we process each eval run
+        remaining_expected_dataset_ids: Dict[ID_TYPE, Set[ID_TYPE]] = {
+            eval_config.id: set(expected_dataset_ids) for eval_config in eval_configs
         }

-        # eval_config_id -> output_score_id -> correlation calculator
-        correlation_calculators: Dict[str, Dict[str, CorrelationCalculator]] = {}
+        # eval_config_id -> output_score_json_key -> correlation calculator
+        correlation_calculators: Dict[ID_TYPE, Dict[str, CorrelationCalculator]] = {}

-        # important: readonly makes this much faster
         for eval_config in eval_configs:
-            eval_config_id = str(eval_config.id)
             for eval_run in eval_config.runs(readonly=True):
                 dataset_item = expected_dataset_items.get(eval_run.dataset_id, None)
                 if dataset_item is None:
@@ -593,14 +591,14 @@ async def get_eval_configs_score_summary(
                     continue

                 # Check if we should count this eval_run. Not every eval_run has to go into the stats:
-                # Example: this dataset_id was already counted (not great there are dupes, but really shouldn't be double counted)
+                # Example: this dataset_id was already counted (not great there are dupes, but shouldn't be double counted if there are)
                 if (
                     eval_run.dataset_id
-                    not in remaining_expected_dataset_ids[eval_config_id]
+                    not in remaining_expected_dataset_ids[eval_config.id]
                 ):
                     continue
                 else:
-                    remaining_expected_dataset_ids[eval_config_id].remove(
+                    remaining_expected_dataset_ids[eval_config.id].remove(
                         eval_run.dataset_id
                     )

@@ -617,13 +615,15 @@ async def get_eval_configs_score_summary(
                         # This score doesn't have both a human eval and eval score, so we can't compare
                         continue

-                    if eval_config_id not in correlation_calculators:
-                        correlation_calculators[eval_config_id] = {}
+                    if eval_config.id not in correlation_calculators:
+                        correlation_calculators[eval_config.id] = {}

-                    if score_key not in correlation_calculators[eval_config_id]:
-                        correlation_calculators[eval_config_id][score_key] = (
-                            CorrelationCalculator()
-                        )
+                    calculator = correlation_calculators[eval_config.id].get(
+                        score_key, None
+                    )
+                    if calculator is None:
+                        calculator = CorrelationCalculator()
+                        correlation_calculators[eval_config.id][score_key] = calculator

                     normalized_eval_score = normalize_rating(
                         eval_score, output_score.type
@@ -631,7 +631,7 @@ async def get_eval_configs_score_summary(
                     normalized_human_score = normalize_rating(
                         human_score, output_score.type
                     )
-                    correlation_calculators[eval_config_id][score_key].add_score(
+                    calculator.add_score(
                         CorrelationScore(
                             measured_score=eval_score,
                             human_score=human_score,
@@ -641,27 +641,26 @@ async def get_eval_configs_score_summary(
                     )

         # Convert to score summaries
-        results: Dict[str, Dict[str, CorrelationResult]] = {}
+        results: Dict[ID_TYPE, Dict[str, CorrelationResult]] = {}
         for eval_config_id in correlation_calculators.keys():
             results[eval_config_id] = {}
             for score_key in correlation_calculators[eval_config_id].keys():
-                if not correlation_calculators[eval_config_id][score_key]:
+                calculator = correlation_calculators[eval_config_id].get(
+                    score_key, None
+                )
+                if calculator is None:
                     # No scores to calculate correlation for this pair
                     continue
-                correlation_result = correlation_calculators[eval_config_id][
-                    score_key
-                ].calculate_correlation()
+                correlation_result = calculator.calculate_correlation()
                 results[eval_config_id][score_key] = correlation_result

         # Calculate the percent of the dataset that has been processed
-        eval_config_percent_complete: Dict[str, float] = {}
+        eval_config_percent_complete: Dict[ID_TYPE, float] = {}
         for eval_config in eval_configs:
-            eval_config_id = str(eval_config.id)
-
-            # Partial incomplete (missing scores), and fully incomplete (no eval_run)
-            incomplete_count = len(remaining_expected_dataset_ids[eval_config_id])
+            incomplete_count = len(remaining_expected_dataset_ids[eval_config.id])
             percent_incomplete = incomplete_count / len(expected_dataset_ids)
-            eval_config_percent_complete[str(eval_config.id)] = 1 - percent_incomplete
+            eval_config_percent_complete[eval_config.id] = 1 - percent_incomplete

         # Count how many dataset items have human evals
         fully_rated_count, partially_rated_count, not_rated_count = count_human_evals(
diff --git a/app/desktop/studio_server/test_eval_api.py b/app/desktop/studio_server/test_eval_api.py
index 175dec2a..58a6e2fc 100644
--- a/app/desktop/studio_server/test_eval_api.py
+++ b/app/desktop/studio_server/test_eval_api.py
@@ -27,7 +27,7 @@
     EvalConfigType,
     EvalOutputScore,
     EvalRun,
-    EvalTemplate,
+    EvalTemplateId,
 )
 from kiln_ai.datamodel.task import RunConfigProperties, TaskRunConfig

@@ -87,7 +87,7 @@ def mock_eval(mock_task):
         id="eval1",
         name="Test Eval",
         description="Test Description",
-        template=EvalTemplate.bias,
+        template=EvalTemplateId.bias,
         output_scores=[
             EvalOutputScore(name="score1", description="desc1", type="five_star"),
             EvalOutputScore(
@@ -177,7 +177,7 @@ def test_get_eval_not_found(client, mock_task, mock_task_from_id):
     response = client.get("/api/projects/project1/tasks/task1/eval/non_existent")

     assert response.status_code == 404
-    assert response.json()["detail"] == "Task not found. ID: task1"
+    assert response.json()["detail"] == "Eval not found. ID: non_existent"


 @pytest.fixture
@@ -428,7 +428,7 @@ async def mock_run():

     # Make request with specific run_config_ids
     response = client.get(
-        "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run",
+        "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run_task_run_eval",
         params={"run_config_ids": ["run_config1", "run_config2"]},
     )

@@ -465,7 +465,7 @@ async def test_run_eval_config_no_run_configs_error(
     # Make request with no run_config_ids and all_run_configs=False
     response = client.get(
-        "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run"
+        "/api/projects/project1/tasks/task1/eval/eval1/eval_config/eval_config1/run_task_run_eval"
     )

     assert response.status_code == 400
diff --git a/app/web_ui/src/lib/api_schema.d.ts b/app/web_ui/src/lib/api_schema.d.ts
index 0990e615..b2d369b7 100644
--- a/app/web_ui/src/lib/api_schema.d.ts
+++ b/app/web_ui/src/lib/api_schema.d.ts
@@ -793,7 +793,7 @@ export interface paths {
     patch?: never;
     trace?: never;
   };
-  "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run": {
+  "/api/projects/{project_id}/tasks/{task_id}/eval/{eval_id}/eval_config/{eval_config_id}/run_task_run_eval": {
     parameters: {
       query?: never;
       header?: never;
       cookie?: never;
     };
     /** Run Eval Config */
-    get: operations["run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_get"];
+    get: operations["run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_task_run_eval_get"];
     put?: never;
     post?: never;
     delete?: never;
@@ -1031,7 +1031,7 @@ export interface components {
       name: string;
       /** Description */
       description: string;
-      template: components["schemas"]["EvalTemplate"] | null;
+      template: components["schemas"]["EvalTemplateId"] | null;
       /** Output Scores */
       output_scores: components["schemas"]["EvalOutputScore"][];
       /** Eval Set Filter Id */
@@ -1330,7 +1330,7 @@ export interface components {
       */
      description?: string | null;
      /** @description The template selected when creating this eval. Useful for suggesting eval steps and output scores. */
-      template?: components["schemas"]["EvalTemplate"] | null;
+      template?: components["schemas"]["EvalTemplateId"] | null;
      /**
       * Current Config Id
       * @description The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.
       */
@@ -1540,11 +1540,11 @@ export interface components {
       run_config: components["schemas"]["TaskRunConfig"];
     };
     /**
-     * EvalTemplate
+     * EvalTemplateId
      * @description An eval template is a pre-defined eval that can be used as a starting point for a new eval.
      * @enum {string}
      */
-    EvalTemplate: "kiln_requirements" | "toxicity" | "bias" | "maliciousness" | "factual_correctness" | "jailbreak";
+    EvalTemplateId: "kiln_requirements" | "toxicity" | "bias" | "maliciousness" | "factual_correctness" | "jailbreak";
     /**
      * FineTuneParameter
      * @description A parameter for a fine-tune. Hyperparameters, etc.
@@ -1818,7 +1818,7 @@
      * Where models have instruct and raw versions, instruct is default and raw is specified.
      * @enum {string}
      */
-    ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b" | "deepseek_r1_distill_qwen_1p5b" | "deepseek_r1_distill_qwen_7b" | "deepseek_r1_distill_llama_8b" | "dolphin_2_9_8x22b";
+    ModelName: "llama_3_1_8b" | "llama_3_1_70b" | "llama_3_1_405b" | "llama_3_2_1b" | "llama_3_2_3b" | "llama_3_2_11b" | "llama_3_2_90b" | "llama_3_3_70b" | "gpt_4o_mini" | "gpt_4o" | "phi_3_5" | "phi_4" | "mistral_large" | "mistral_nemo" | "gemma_2_2b" | "gemma_2_9b" | "gemma_2_27b" | "claude_3_5_haiku" | "claude_3_5_sonnet" | "claude_3_7_sonnet" | "claude_3_7_sonnet_thinking" | "gemini_1_5_flash" | "gemini_1_5_flash_8b" | "gemini_1_5_pro" | "gemini_2_0_flash" | "nemotron_70b" | "mixtral_8x7b" | "qwen_2p5_7b" | "qwen_2p5_72b" | "deepseek_3" | "deepseek_r1" | "mistral_small_3" | "deepseek_r1_distill_qwen_32b" | "deepseek_r1_distill_llama_70b" | "deepseek_r1_distill_qwen_14b" | "deepseek_r1_distill_qwen_1p5b" | "deepseek_r1_distill_qwen_7b" | "deepseek_r1_distill_llama_8b" | "dolphin_2_9_8x22b";
     /**
      * ModelProviderName
      * @description Enumeration of supported AI model providers.
@@ -4262,7 +4262,7 @@
         };
       };
     };
-  run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_get: {
+  run_eval_config_api_projects__project_id__tasks__task_id__eval__eval_id__eval_config__eval_config_id__run_task_run_eval_get: {
     parameters: {
       query?: {
         run_config_ids?: string[];
diff --git a/app/web_ui/src/lib/types.ts b/app/web_ui/src/lib/types.ts
index 4ee5b6f0..8419f6d7 100644
--- a/app/web_ui/src/lib/types.ts
+++ b/app/web_ui/src/lib/types.ts
@@ -21,7 +21,7 @@ export type RunSummary = components["schemas"]["RunSummary"]
 export type PromptResponse = components["schemas"]["PromptResponse"]
 export type FinetuneDataStrategy = components["schemas"]["FinetuneDataStrategy"]
 export type EvalOutputScore = components["schemas"]["EvalOutputScore"]
-export type EvalTemplate = components["schemas"]["EvalTemplate"]
+export type EvalTemplateId = components["schemas"]["EvalTemplateId"]
 export type Eval = components["schemas"]["Eval"]
 export type EvalConfigType = components["schemas"]["EvalConfigType"]
 export type EvalConfig = components["schemas"]["EvalConfig"]
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte
index 760b8d7e..f9687c0d 100644
--- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte
+++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/+page.svelte
@@ -336,7 +336,7 @@
     | "running"
     | "complete"
     | "complete_with_errors" = "not_started"
-  $: run_eval_url = `${base_url}/api/projects/${project_id}/tasks/${task_id}/eval/${eval_id}/eval_config/${current_eval_config_id}/run?all_run_configs=true`
+  $: run_eval_url = `${base_url}/api/projects/${project_id}/tasks/${task_id}/eval/${eval_id}/eval_config/${current_eval_config_id}/run_task_run_eval?all_run_configs=true`

   let task_run_config_model_name = ""
   let task_run_config_provider_name = ""
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte
index 7a7496fb..399b2ed1 100644
--- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte
+++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/[eval_id]/create_eval_config/+page.svelte
@@ -9,7 +9,7 @@
   import { onMount } from "svelte"
   import Warning from "$lib/ui/warning.svelte"
   import AvailableModelsDropdown from "../../../../../run/available_models_dropdown.svelte"
-  import type { Eval, EvalTemplate, Task, EvalConfigType } from "$lib/types"
+  import type { Eval, EvalTemplateId, Task, EvalConfigType } from "$lib/types"
   import { tick } from "svelte"
   import { load_task } from "$lib/stores"
   import { goto } from "$app/navigation"
@@ -18,7 +18,7 @@
   let task_description: string = ""
   let eval_steps: string[] = []

-  type EvalTemplateWithoutKiln = Exclude<EvalTemplate, "kiln_requirements">
+  type EvalTemplateWithoutKiln = Exclude<EvalTemplateId, "kiln_requirements">

   const eval_steps_static_templates: Record<EvalTemplateWithoutKiln, string[]> = {
     toxicity: [
diff --git a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte
index 87688a4a..de0c034b 100644
--- a/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte
+++ b/app/web_ui/src/routes/(app)/evals/[project_id]/[task_id]/create_evaluator/+page.svelte
@@ -1,7 +1,7 @@