Skip to content

Commit

Permalink
Improve synthetic data gen to follow human guidance for output genera…
Browse files Browse the repository at this point in the history
…tion, not just topic+input generation.

Add Dolphin, an uncensored model, to help with generating datasets for toxicity and bias.
  • Loading branch information
scosman committed Feb 28, 2025
1 parent cfb732c commit 7f19ffe
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 3 deletions.
11 changes: 11 additions & 0 deletions app/desktop/studio_server/data_gen_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
DataGenCategoriesTaskInput,
DataGenSampleTask,
DataGenSampleTaskInput,
wrap_task_with_guidance,
)
from kiln_ai.datamodel import DataSource, DataSourceType, PromptId, TaskRun
from kiln_server.run_api import model_provider_from_string
Expand Down Expand Up @@ -62,6 +63,10 @@ class DataGenSaveSamplesApiInput(BaseModel):
prompt_method: PromptId = Field(
description="The prompt method used to generate the output"
)
human_guidance: str | None = Field(
description="Optional human guidance for generation",
default=None,
)


def connect_data_gen_api(app: FastAPI):
Expand Down Expand Up @@ -121,6 +126,12 @@ async def save_sample(
) -> TaskRun:
task = task_from_id(project_id, task_id)

    # Wrap the task instructions with human guidance, if provided
if sample.human_guidance is not None and sample.human_guidance.strip() != "":
task.instruction = wrap_task_with_guidance(
task.instruction, sample.human_guidance
)

tags = ["synthetic"]
if session_id:
tags.append(f"synthetic_session_{session_id}")
Expand Down
5 changes: 5 additions & 0 deletions app/web_ui/src/lib/api_schema.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1184,6 +1184,11 @@ export interface components {
* @description The prompt method used to generate the output
*/
prompt_method: string;
/**
* Human Guidance
* @description Optional human guidance for generation
*/
human_guidance?: string | null;
};
/**
* DataSource
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -500,12 +500,12 @@
<div class="text-xl font-bold">Compare Run Methods</div>

<div class="text-xs text-gray-500">
Compare to find the best method of running your task (various
prompts, models, fine-tunes, etc).
Find the best method of running your task including various
prompts, models, fine-tunes, and more.
</div>
<div class="text-xs text-gray-500 pt-2">
Scores are generated by running the 'run method' on each item of
your Eval Dataset, generatring task outputs, then evaluating those
your eval dataset, generating task outputs, then evaluating those
outputs with the selected evaluation method{current_eval_config
? ` (${current_eval_config.name})`
: ""}.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import FormContainer from "$lib/utils/form_container.svelte"
import { type SampleData } from "./gen_model"
import FormElement from "$lib/utils/form_element.svelte"
import Warning from "$lib/ui/warning.svelte"
let session_id = Math.floor(Math.random() * 1000000000000).toString()
Expand Down Expand Up @@ -284,6 +285,10 @@
const formatted_input = task?.input_json_schema
? JSON.parse(sample.input)
: sample.input
const save_sample_guidance =
guidance_enabled && human_guidance.length > 0
? human_guidance
: undefined
const {
error: post_error,
data,
Expand All @@ -308,6 +313,7 @@
output_provider: provider,
prompt_method,
topic_path: topic_path || [],
human_guidance: save_sample_guidance,
},
},
)
Expand Down Expand Up @@ -485,6 +491,18 @@
{/if}
</div>
</div>
{#if guidance_enabled && human_guidance.length > 0}
{#if prompt_method.includes("::")}
<Warning
warning_message="Human guidance is enabled, but you've selected a custom prompt with a fixed string. Human guidance will not be applied."
/>
{:else}
<Warning
warning_message="Human guidance is enabled. Your guidance will be passed to the model and used to influence output."
warning_color="warning"
/>
{/if}
{/if}
<AvailableModelsDropdown
requires_structured_output={task?.output_json_schema ? true : false}
bind:model
Expand Down
18 changes: 18 additions & 0 deletions libs/core/kiln_ai/adapters/data_gen/data_gen_task.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,3 +183,21 @@ def __init__(self, target_task: Task, num_samples: int = 8):
input_json_schema=json.dumps(DataGenSampleTaskInput.model_json_schema()),
output_json_schema=list_json_schema_for_task(target_task),
)


def wrap_task_with_guidance(original_instruction: str, guidance: str) -> str:
    """Wrap the original task instruction with additional human guidance.

    The guidance is appended as a "Special Instructions" section; the model
    is told to follow both sets of instructions, but to prioritize the
    additional (human) instructions when they conflict with the originals.

    Args:
        original_instruction: The original task instruction to wrap.
        guidance: The human-provided guidance to append.

    Returns:
        The combined prompt string: the original instruction, followed by
        the guidance wrapped in <additional_instructions> tags.
    """
    # NOTE: the exact literal is part of the prompt contract — do not reformat.
    return f"""{original_instruction}
# Special Instructions
The above instructions are the original instructions for this task. For this execution, we've been given additional instructions. Follow both, but prioritize the additional instructions when they conflict. The additional instructions are:
<additional_instructions>
{guidance}
</additional_instructions>
"""
24 changes: 24 additions & 0 deletions libs/core/kiln_ai/adapters/ml_model_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class ModelFamily(str, Enum):
mixtral = "mixtral"
qwen = "qwen"
deepseek = "deepseek"
dolphin = "dolphin"


# Where models have instruct and raw versions, instruct is default and raw is specified
Expand Down Expand Up @@ -88,6 +89,7 @@ class ModelName(str, Enum):
deepseek_r1_distill_qwen_1p5b = "deepseek_r1_distill_qwen_1p5b"
deepseek_r1_distill_qwen_7b = "deepseek_r1_distill_qwen_7b"
deepseek_r1_distill_llama_8b = "deepseek_r1_distill_llama_8b"
dolphin_2_9_8x22b = "dolphin_2_9_8x22b"


class ModelParserID(str, Enum):
Expand Down Expand Up @@ -962,4 +964,26 @@ class KilnModel(BaseModel):
),
],
),
# Dolphin 2.9 Mixtral 8x22B
KilnModel(
family=ModelFamily.dolphin,
name=ModelName.dolphin_2_9_8x22b,
friendly_name="Dolphin 2.9 8x22B",
providers=[
KilnModelProvider(
name=ModelProviderName.ollama,
structured_output_mode=StructuredOutputMode.json_schema,
supports_data_gen=True,
provider_options={"model": "dolphin-mixtral:8x22b"},
),
KilnModelProvider(
name=ModelProviderName.openrouter,
provider_options={
"model": "cognitivecomputations/dolphin-mixtral-8x22b"
},
supports_data_gen=True,
structured_output_mode=StructuredOutputMode.json_instruction_and_object,
),
],
),
]

0 comments on commit 7f19ffe

Please sign in to comment.