diff --git a/adalflow/CHANGELOG.md b/adalflow/CHANGELOG.md index df1140ce..1b0bef63 100644 --- a/adalflow/CHANGELOG.md +++ b/adalflow/CHANGELOG.md @@ -1,16 +1,23 @@ -## [0.2.7] - 2024-09-23 +## [0.2.7] - 2025-01-16 -### Improved -- Better diagnose report for `Trainer.diagnose`. -- Multi-hop RAG with handling of Cycle. - -## [0.2.7] - TO Be Released ### Added - `Memory` is completed with `call` and `add_dialog_turn` methods. - Integrated `LanceDB` in the `Retriever` +- Multi-modal (image input and generation) in `OpenAIClient` along with tests. +- `ComponentList` to support a list of components registered in a component. Added `test_componentlist` to test the `ComponentList`. + ### Improved +- Better diagnose report for `Trainer.diagnose`. - `BedrockAPIClient` added more details on setup, yet it is still in experimental stage. - `AzureAPIClient` added more details on setup, yet it is still in experimental stage. +- `Retriever` class: + - Support data id (field). +- `GradComponent`: Support pass-through gradient for the `forward` method. + +Optimization +- Aggregated all backward engine prompts in `backward_engine_prompt`. +- Added `TGDData` for the optimizer to support reasoning at proposing new prompt. +- Added `sequential_order` in the `Trainer` to support the sequential training order. Reorganized the trainer code. ## [0.2.6] - 2024-11-25 ### Improved - Add default `max_tokens=512` to the `AnthropicAPIClient` to avoid the error when the user does not provide the `max_tokens` in the prompt. 
diff --git a/adalflow/adalflow/__init__.py b/adalflow/adalflow/__init__.py index 4c9b45ba..79310aab 100644 --- a/adalflow/adalflow/__init__.py +++ b/adalflow/adalflow/__init__.py @@ -1,6 +1,6 @@ -__version__ = "0.2.6" +__version__ = "0.2.7" -from adalflow.core.component import Component, fun_to_component +from adalflow.core.component import Component from adalflow.core.container import Sequential, ComponentList from adalflow.core.base_data_class import DataClass, DataClassFormatType, required_field @@ -24,6 +24,9 @@ FloatParser, ListParser, BooleanParser, + Parser, + func_to_parser, + FuncParser, ) from adalflow.core.retriever import Retriever from adalflow.components.output_parsers import ( @@ -101,6 +104,9 @@ "FloatParser", "ListParser", "BooleanParser", + "Parser", + "func_to_parser", + "FuncParser", # Output Parsers with dataclass formatting "YamlOutputParser", "JsonOutputParser", diff --git a/adalflow/adalflow/components/agent/react.py b/adalflow/adalflow/components/agent/react.py index 92428e53..ad287036 100644 --- a/adalflow/adalflow/components/agent/react.py +++ b/adalflow/adalflow/components/agent/react.py @@ -1,14 +1,18 @@ """Implementation and optimization of React agent.""" from typing import List, Union, Callable, Optional, Any, Dict -from copy import deepcopy +from dataclasses import dataclass, field +from adalflow.core.base_data_class import DataClass import logging +import traceback from adalflow.core.generator import Generator -from adalflow.core.component import Component +from adalflow.optim.grad_component import GradComponent2 +from adalflow.optim.parameter import Parameter, ParameterType from adalflow.core.func_tool import FunctionTool, AsyncCallable from adalflow.core.tool_manager import ToolManager +from adalflow.core.component import Component from adalflow.components.output_parsers import JsonOutputParser from adalflow.core.types import ( StepOutput, @@ -17,6 +21,7 @@ FunctionOutput, FunctionExpression, ) +from adalflow.optim.grad_component 
import fun_to_grad_component from adalflow.core.model_client import ModelClient from adalflow.utils.logger import printc @@ -25,58 +30,148 @@ __all__ = ["DEFAULT_REACT_AGENT_SYSTEM_PROMPT", "ReActAgent"] -# TODO: test react agent -DEFAULT_REACT_AGENT_SYSTEM_PROMPT = r""" -{# role/task description #} -You are a helpful assistant. -Answer the user's query using the tools provided below with minimal steps and maximum accuracy. -{# REACT instructions #} -Each step you will read the previous Thought, Action, and Observation(execution result of the action) and then provide the next Thought and Action. +react_agent_task_desc = r""" + +You are an excellent task planner. +Answer the input query using the tools provided below with maximum accuracy. + +Each step you will read the previous thought, Action(name, kwargs), and Observation(execution result of the action) and then provide the next Thought and Action. + +Follow function docstring to best call the tool. +- For simple queries: Directly call the ``finish`` action and provide the answer. +- For complex queries: + - Step 1: Read the user query and divide it into multisteps. Start with the first tool/subquery. + - Call one tool at a time to solve each subquery/subquestion. \ + - At step 'finish', give the final answer based on all previous steps. +REMEMBER: +- Action MUST call one of the tools. It CANNOT be empty. +- You will ALWAYS END WITH 'finish' tool to finish the task directly with answer or failure message. +- When the tool is a class method and when class_instance exists, use . to call instead (NOT the CLASS NAME) + +""" + +# - In this case, you are working as a multi-hop retriever and your answer in finish MUST be verbatim short factoid responses from retrieved context. +# - Answer with only the exact answer phrase, not a full sentence or paragraph. + +DEFAULT_REACT_AGENT_SYSTEM_PROMPT = r""" +{{react_agent_task_desc}} +- You cant use more than {{max_steps}} steps. 
At the {{max_steps}}th current step, must finish with answer. + {# Tools #} {% if tools %} - -You available tools are: + +Tools and instructions: {% for tool in tools %} {{ loop.index }}. {{tool}} ------------------------ {% endfor %} - + +{% endif %} +{# Context Variables #} +{% if context_variables is not none %} + +You have access to context_variables with the following keys: +{% for key, value in context_variables.items() %} +{{ key }} +------------------------ +{% endfor %} +You can either pass context_variables or context_variables['key'] to the tools depending on the tool's requirements. + {% endif %} {# output format and examples for output format #} - + {{output_format_str}} - - -{# Task specification to teach the agent how to think using 'divide and conquer' strategy #} -- For simple queries: Directly call the ``finish`` action and provide the answer. -- For complex queries: - - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery. - - Call one available tool at a time to solve each subquery/subquestion. \ - - At step 'finish', join all subqueries answers and finish the task. -Remember: -- Action must call one of the above tools with name. It can not be empty. -- You will always end with 'finish' action to finish the task. The answer can be the final answer or failure message. - - + +{% if examples %} + +Examples: +{% for example in examples %} +{{example}} +------------------------ +{% endfor %} + +{% endif %} + ----------------- -User query: + +Input query: {{ input_str }} +_____________________ +Current Step/Max Step: {{step_history|length + 1}} / {{max_steps}} {# Step History #} {% if step_history %} Your previous steps: {% for history in step_history %} Step {{ loop.index }}. 
-"Thought": "{{history.action.thought}}", -"Action": "{{history.action.action}}", -"Observation": "{{history.observation}}" +{% if history.action %} +"thought": "{{history.action.thought}}", +"name": "{{history.action.name}}, +"kwargs": {{history.action.kwargs}}", +{% endif %} +"observation": "{{history.observation}}" + ------------------------ {% endfor %} {% endif %} -You:""" + +""" + + +class CombineStepHistory(GradComponent2): + def __init__(self): + super().__init__(desc="Extract the final answer from the step history.") + + def call( + self, + step_history: List[StepOutput], + react_agent_task_desc: str, + id: Optional[str] = None, + ) -> str: + if not step_history: + return "" + answer = step_history[-1].observation + return answer + + +# class FunctionOutputToStepOutput(GradComponent2): +# def __init__(self): +# super().__init__(desc="Convert the FunctionOutput to StepOutput") + +# def call( +# self, +# action_str: FunctionExpression, +# step: int, +# result: FunctionOutput, +# func: Function, +# id: Optional[str] = None, +# ) -> StepOutput: +# """Convert the action string to StepOutput.""" +# step_output = StepOutput(step=step) +# if not isinstance(action_str, FunctionExpression): +# raise ValueError(f"Expected FunctionExpression, but got {type(action_str)}") +# step_output.action = action_str +# step_output.function = func + +# step_output.observation = result.output +# return step_output + + +@dataclass +class ReActOutput(DataClass): + r"""Similar to GeneratorOutput, but with additional step history and final answer.""" + + id: Optional[str] = field( + default=None, metadata={"desc": "The unique id of the output"} + ) + step_history: List[StepOutput] = field( + metadata={"desc": "The history of steps."}, default_factory=list + ) + + answer: Any = field(metadata={"desc": "The final answer."}, default=None) class ReActAgent(Component): @@ -135,12 +230,16 @@ def __init__( max_steps: int = 10, add_llm_as_fallback: bool = True, # TODO: the examples are just 
for specifying the output format, not end to end input-output examples, need further optimization - examples: List[FunctionExpression] = [], + # examples: List[FunctionExpression] = [], + examples: Union[List[FunctionExpression], List[str]] = [], *, # the following arguments are mainly for the planner model_client: ModelClient, model_kwargs: Dict = {}, + # template for the planner template: Optional[str] = None, # allow users to customize the template + context_variables: Optional[Dict] = None, # context variables + debug: bool = False, ): super().__init__() template = template or DEFAULT_REACT_AGENT_SYSTEM_PROMPT @@ -148,23 +247,55 @@ def __init__( self.max_steps = max_steps self.add_llm_as_fallback = add_llm_as_fallback + self.context_variables = context_variables + self.debug = debug - self._init_tools(tools, model_client, model_kwargs) + processed_tools = self._init_tools(tools, model_client, model_kwargs) + self.tool_manager: ToolManager = ToolManager( + tools=processed_tools, + additional_context={"context_variables": self.context_variables}, + ) - ouput_data_class = FunctionExpression - example = FunctionExpression.from_function( - thought="I have finished the task.", - func=self._finish, - answer="final answer: 'answer'", + ouput_data_class = Function + example = Function( + thought="Based on all the subtasks, I am able to answer the question. 
Following the finish doc string, and ....", + name="finish", + kwargs={"answer": "final answer"}, ) self._examples = examples + [example] output_parser = JsonOutputParser( - data_class=ouput_data_class, examples=self._examples, return_data_class=True + data_class=ouput_data_class, + examples=self._examples, + return_data_class=True, + include_fields=[ + "thought", + "name", + "kwargs", + ], ) + # output_parser = DataClassParser(return_data_class=True, data_class=Function) prompt_kwargs = { "tools": self.tool_manager.yaml_definitions, "output_format_str": output_parser.format_instructions(), + "react_agent_task_desc": Parameter( + name="react_agent_task_desc", + # data=react_agent_task_desc, + data="You are an excellent task planner. Answer the input query using the tools provided below with maximum accuracy.\n\nEach step you will read the previous thought, Action(name, kwargs), and Observation(execution result of the action) and then provide the next Thought and Action.\n\n\nFollow function docstring to best call the tool.\n- For simple queries: Directly call the 'finish' action and answer with a concise 'yes' or 'no' when it fits.\n- For complex queries:\n - Step 1: Understand the main subject(s) and context of the user query accurately.\n - Step 2: Break down the query into multisteps, starting with the first tool/subquery.\n - Ensure each step accurately reflects the subjects under consideration.\n - Continuously verify your extracted information and logic for factual accuracy using concise comparisons.\n - At step 'finish', conclude with a precise final answer.\nREMEMBER:\n- Action MUST call one of the tools. It CANNOT be empty.\n- You will ALWAYS END WITH 'finish' tool to conclude the task directly with an answer or failure message.\n- When the tool is a class method and when class_instance exists, use . 
to call instead (NOT the CLASS NAME).\n", + role_desc="Task instruction for the agent to plan steps to solve a question in sequential and multi-steps to get the final answer. \ + For optimizer: you need to adapt this to the current specific task.", + param_type=ParameterType.PROMPT, + requires_opt=True, + ), + # "examples": Parameter( + # name="examples", + # data=None, + # role_desc="Examples for the ReAct agent.", + # param_type=ParameterType.DEMOS, + # requires_opt=True, + # ), + "context_variables": self.context_variables, + "max_steps": self.max_steps, } self.planner = Generator( template=template, @@ -172,9 +303,13 @@ def __init__( output_processors=output_parser, model_client=model_client, model_kwargs=model_kwargs, + use_cache=True, ) - self.step_history: List[StepOutput] = [] + # added this component to the computation graph + # self.append_step_history = AppendStepHistory() + self.combine_step_history = CombineStepHistory() + # self.function_output_to_step_output = FunctionOutputToStepOutput() def _init_tools( self, @@ -182,15 +317,15 @@ def _init_tools( model_client: ModelClient, model_kwargs: Dict, ): - r"""Initialize the tools.""" - tools = deepcopy(tools) + r"""Initialize the tools. Using reference or else(copy or deepcopy) we can not set the training/eval mode for each tool.""" + processed_tools = [] _additional_llm_tool = ( Generator(model_client=model_client, model_kwargs=model_kwargs) if self.add_llm_as_fallback else None ) - def llm_tool(input: str) -> str: + def llm_tool(input: str, **kwargs) -> str: """I answer any input query with llm's world knowledge. 
Use me as a fallback tool or when the query is simple.""" try: output: GeneratorOutput = _additional_llm_tool( @@ -199,107 +334,583 @@ def llm_tool(input: str) -> str: response = output.data if output else None return response except Exception as e: - log.error(f"Error using the generator: {e}") - print(f"Error using the generator: {e}") + log.error(f"Error using the llm_tool: {e}") + print(f"Error using the llm_tool: {e}") return None - def finish(answer: str) -> str: - """Finish the task with answer.""" + # always add **kwargs for us to track the id, __doc__ as the predecessors. + from adalflow.optim.grad_component import fun_to_grad_component + + @fun_to_grad_component( + desc="Finish", + doc_string=Parameter( + # data="Finish the task with verbatim short factoid answer.", + data="Ensure factual accuracy by precisely identifying each item in the step history, avoiding incorrect associations. Construct the final answer with brevity, directly addressing the query without unnecessary details.", + param_type=ParameterType.PROMPT, + requires_opt=True, + role_desc="Instruct the agent on how to create the final answer from the step history.", + name="doc_string", + ), + ) + def finish(answer: str, **kwargs) -> str: return answer - self._finish = finish - + self._finish = FunctionTool(fn=finish, component=finish) + processed_tools = tools.copy() if self.add_llm_as_fallback: - tools.append(llm_tool) - tools.append(finish) - self.tool_manager: ToolManager = ToolManager(tools=tools) - - def reset(self): - r"""Reset the agent to start a new query.""" - self.step_history = [] - - # TODO: add async execution - def _execute_action(self, action_step: StepOutput) -> Optional[StepOutput]: - """Parse the action string to a function call and execute it. 
Update the action_step with the result.""" - action = action_step.action - try: + processed_tools.append(llm_tool) + processed_tools.append(self._finish) + return processed_tools - fun: Function = self.tool_manager.parse_func_expr(action) - result: FunctionOutput = self.tool_manager.execute_func(fun) - # TODO: optimize the action_step - action_step.function = fun - action_step.observation = result.output - return action_step - except Exception as e: - log.error(f"Error executing {action}: {e}") - # pass the error as observation so that the agent can continue and correct the error in the next step - action_step.observation = f"Error executing {action}: {e}" - return action_step + def _execute_action( + self, + step_output: StepOutput, + response: Union[Parameter, GeneratorOutput], + id: Optional[str] = None, + ) -> Optional[StepOutput]: + """Parse the action string to a function call and execute it. Update the step_output with the result.""" - def _run_one_step(self, step: int, prompt_kwargs: Dict, model_kwargs: Dict) -> str: - """Run one step of the agent. 
Plan and execute the action for the step.""" - step_output: StepOutput = StepOutput(step=step) - prompt_kwargs["step_history"] = self.step_history + def handle_error(response: Parameter, e: str): - log.debug( - f"Running step {step} with prompt: {self.planner.prompt(**prompt_kwargs)}" - ) + @fun_to_grad_component() + def set_step_output_with_error( + step_output: StepOutput, error: str, response: Any + ): + """Set the step_output with error.""" + step_output.observation = f"error: {error} at {response.data}" + return step_output - response: GeneratorOutput = self.planner( - prompt_kwargs=prompt_kwargs, model_kwargs=model_kwargs - ) - if response.error: - error_msg = f"Error planning step {step}: {response.error}" + response.add_successor_map_fn( + successor=set_step_output_with_error, map_fn=lambda x: x.data + ) + return set_step_output_with_error.forward(step_output, e, response) + + step = step_output.step + + if isinstance(response, Parameter): + + try: + # TO FunctionExpression + + # func: Union[Function, Parameter] = self.tool_manager( + # expr_or_fun=response, step="parse", map_fn=lambda x: x.data.data + # ) + step_output.action = response.data.data + printc(f"Step test train: {step}: {step_output.action}", color="blue") + # # add action to the step_output + # step_output.action = response.data.data + # # parse failed + # if not isinstance(func, Parameter): + # raise ValueError( + # f"Expected Parameter, but got {type(func)}: {func}" + # ) + # if isinstance(func, str): + + # @fun_to_grad_component() + # def set_step_output_with_error( + # step_output: StepOutput, data: FunctionExpression, error: str + # ): + # """Set the step_output with error.""" + # step_output.observation = f"Error in parsing the FunctionExperession to Function: {error}" + # return step_output + + # response.add_successor_map_fn( + # successor=set_step_output_with_error, + # map_fn=lambda x: x.data.data, + # ) + # step_output = set_step_output_with_error.forward( + # step_output, response, 
error=func + # ) + # return step_output + + # except Exception as e: + # e = f"{e} at parsing error at functionexpression: {response.data}" + # return handle_error(response, e) + + # try: + # # printc(f"func: {func}", color="yellow") + # # replace the id + if isinstance(response.data.data, Function): + # response.data.data.kwargs["id"] = id + response.data.data.kwargs.update({"id": id}) + # printc( + # f"add id to the function: {response.data.data}", color="blue" + # ) + + # if self.debug: + # printc(f"func: {func.data}", color="yellow") + + result: Parameter = self.tool_manager( + expr_or_fun=response, step="execute", map_fn=lambda x: x.data.data + ) + # printc(f"Step test train result: {step}: {result.data}", color="blue") + # return + + if isinstance(result, str): + # create dummy step output + + @fun_to_grad_component() + def set_step_output_with_error(step_output: StepOutput, data: str): + """Set the step_output with error.""" + step_output.observation = f"Error {data} in executing action." 
+ + return step_output + + response.add_successor_map_fn( + successor=set_step_output_with_error, + map_fn=lambda x: x.data.data, + ) + step_output = set_step_output_with_error.forward( + step_output, response + ) + + return step_output + + except Exception as e: + e = f"{e} Error executing action: {response.data}" + return handle_error(response, e) + + try: + # printc(f"result: {result}", color="red") + # result.add_successor_map_fn( + # successor=self.function_output_to_step_output, + # map_fn=lambda x: x.data, + # ) + # response.add_successor_map_fn( + # successor=self.function_output_to_step_output, + # map_fn=lambda x: x.data.data, + # ) + # func.add_successor_map_fn( + # successor=self.function_output_to_step_output, + # map_fn=lambda x: x.data, + # ) + # step_output = self.function_output_to_step_output.forward( + # action_str=response, + # step=step, + # result=result, + # func=func, + # ) + + # generate the step_output + # step_output = StepOutput( + # step=step, + # action=response.data.data, + # function=None, + # # function=func.data, + # observation=result.data.output, + # ) + step_output.step = step + step_output.observation = result.data.output + # printc(f"Step test train: {step}: {step_output}", color="blue") + # replace the result.data with the step_output + result.data = step_output + result.role_desc = "The result of the action execution, observation is the final answer" + result.param_type = ParameterType.OUTPUT + + # print(f"Step test: {step}: {result}") + + return result + except Exception as e: + e = f"{e} Error converting function output to step output: {result.data}" + + return handle_error(response, e) + + else: + + return self._execute_action_eval_mode( + x=response, + step_output=step_output, + step=step, + id=id, + ) + + def _execute_action_eval_mode( + self, + x: GeneratorOutput, + step_output: StepOutput, + step: int, + id=None, + ) -> StepOutput: + """Execute the action and update the step_output.""" + if x.error or not x.data: + 
error_msg = f"Error planning step {step}: {x.error}" step_output.observation = error_msg + step_output.action = None log.error(error_msg) + return step_output else: try: - fun_expr: FunctionExpression = response.data + fun_expr: Function = x.data + # printc(f"Step test {step}: {fun_expr}", color="blue") + # printc(f"Step {step}: {fun_expr}", color="blue") step_output.action = fun_expr + # add id to the function + fun_expr.kwargs.update({"id": id}) log.debug(f"Step {step}: {fun_expr}") if step_output and step_output.action: - step_output = self._execute_action(step_output) - printc(f"Step {step}: \n{step_output}\n_______\n", color="blue") + + # fun: Function = self.tool_manager( + # expr_or_fun=fun_expr, step="parse" + # ) + + # step_output.function = fun + # printc(f"Step {step}: {fun}", color="blue") + result: FunctionOutput = self.tool_manager( + expr_or_fun=x.data, + step="execute", + ) + printc(f"Step result {step}: {result}", color="blue") + step_output.observation = result.output + if self.debug: + printc(f"Step {step}: \n{step_output}\n_______\n", color="blue") + return step_output else: + if self.debug: + printc(f"Failed to parse response for step {step}", color="red") log.error(f"Failed to parse response for step {step}") + return step_output except Exception as e: error_msg = f"Error parsing response for step {step}: {e}" step_output.observation = error_msg log.error(error_msg) + if self.debug: + printc(error_msg, color="red") + return step_output - self.step_history.append(step_output) + def _run_one_step( + self, + step: int, + prompt_kwargs: Dict, + model_kwargs: Dict, + id: Optional[str] = None, + # step_history: Union["Parameter", List[str]] = None, + step_history: List[StepOutput] = [], + # ) -> Union[List[StepOutput], Parameter]: + ) -> Union[Parameter, StepOutput]: + """Run one step of the agent. Plan and execute the action for the step. + Need to deal with both train and eval mode on the self.planner. 
+ """ + if self.debug: + printc(f"step: {step}", color="yellow") + + step_history_value = [] + for step_output in step_history: + if isinstance(step_output, Parameter): + step_history_value.append(step_output.data) + else: + step_history_value.append(step_output) + + prompt_kwargs["step_history"] = step_history_value + + for data in step_history_value: + if not data: + raise ValueError( + f"Expected StepOutput, but got {type(data)}, all steps: {step_history_value}" + ) + if not isinstance(data, StepOutput): + raise ValueError( + f"Expected StepOutput, but got {type(data)}, all steps: {step_history_value}" + ) - return response + log.debug( + f"Running step {step} with prompt: {self.planner.prompt(**prompt_kwargs)}" + ) + try: - def call( + response: Union[GeneratorOutput, Parameter] = self.planner( + prompt_kwargs=prompt_kwargs, model_kwargs=model_kwargs, id=id + ) + # prompt_str = self.planner.get_prompt(**prompt_kwargs) + # printc(f"Prompt: {prompt_str}", color="yellow") + + except Exception as e: + error_msg = f"Error happened in planner response at step {step}: {e}.\n" + error_msg += ( + f"Prompt kwargs: {prompt_kwargs}\nModel kwargs: {model_kwargs}\n" + ) + error_msg += f"Traceback:\n{traceback.format_exc()}" + raise RuntimeError(error_msg) + + step_output: StepOutput = StepOutput(step=step) + + try: + + if self.training and isinstance(response, Parameter): + + if not isinstance(response.data, GeneratorOutput): + raise ValueError( + f"Expected GeneratorOutput, but got {type(response.data)}, value: {response.data}" + ) + # Detect planner parsing errors to FunctionExpression so that the prompt can be trained to self-correct + if not isinstance(response.data.data, Function): + + @fun_to_grad_component() + def set_step_output_with_error( + step_output: StepOutput, data: GeneratorOutput + ): + """Set the step_output with error.""" + step_output.observation = f"Error {data.error} in parsing response: {data.raw_response}, data type: {type(data.data)}" + return 
step_output + + response.add_successor_map_fn( + successor=set_step_output_with_error, + map_fn=lambda x: x.data, + ) + step_output = set_step_output_with_error.forward( + step_output, response + ) + + else: + + step_output: Parameter = self._execute_action( + step_output, response, id + ) + if not isinstance(step_output, Parameter): + raise ValueError( + f"Expected Parameter, but got {type(step_output)}, value: {step_output}" + ) + if self.debug: + printc(f"step_output: {step_output.data}", color="red") + if not isinstance(step_output, Parameter): + raise ValueError( + f"Ensure step_output to be Parameter at training mode. Got {type(step_output)}.\n\ + Please check the observation for error details: {step_output}" + ) + # combine the current step_output with the step_history + # step_output.add_successor_map_fn( + # successor=self.append_step_history, map_fn=lambda x: x.data + # ) + # step_history.add_successor_map_fn( + # successor=self.append_step_history, map_fn=lambda x: x.data + # ) + + # step_history = self.append_step_history.forward( + # step_output, step_history + # ) + # connect step_history to the next planner + # step_history.add_successor_map_fn( + # successor=self.planner, map_fn=lambda x: x.data + # ) + # if self.debug: + # printc( + # f"step_history: {step_history.get_prompt_data()}", color="red" + # ) + return step_output + return step_history + + else: + + step_output: StepOutput = self._execute_action( + step_output=step_output, response=response, id=id + ) + if not step_output: + raise RuntimeError( + f"Error executing action at step {step}: {step_output}" + ) + + if self.debug: + printc(f"step_output: {step_output}", color="red") + # step_history.append(step_output) + return step_output + except Exception as e: + error_msg = f"Error during execution at step {step}: {e}.\n" + error_msg += f"Step output: {step_output}\nResponse: {response}\n" + error_msg += f"Traceback:\n{traceback.format_exc()}" + raise RuntimeError(error_msg) + + def 
_check_last_step( + self, step_history: List[Union[StepOutput, Parameter]] + ) -> bool: + """Check if the last step is the finish step.""" + if not step_history: + return True + + last_step: Union[StepOutput, Parameter] = step_history[-1] + + if isinstance(last_step, Parameter): + last_step = last_step.data + + if last_step and last_step.action and last_step.action.name == "finish": + return True + return False + + def _get_answer( + self, step_history: List[Union[StepOutput, Parameter]] + ) -> Union[str, "Parameter"]: + """Get the final answer from the step history. + + When in training mode, we pass the whole step_history to the backward engine to find the feedback + """ + if not step_history: + return None + + last_step: Union[StepOutput, Parameter] = step_history[-1] + if isinstance(last_step, Parameter): + # output = ReActOutput( + # step_history=last_step.data, + # answer=str(last_step.data[-1].observation), + # ) + answer = self.combine_step_history( + step_history=step_history, + id=last_step.data_id, + react_agent_task_desc=self.planner.prompt_kwargs[ + "react_agent_task_desc" + ], + ) + return answer + else: + return str(last_step.observation) + + # last_step: StepOutput = None + # if isinstance( + # step_history, Parameter + # ): # change the step history at the last step + # try: + # output = ReActOutput( + # step_history=step_history.data, + # answer=str(step_history.data[-1].observation), + # ) + # step_history.data = output + # step_history.data_in_prompt = map_step_history_list_to_prompt + # return step_history + + # except Exception as e: + # log.error(f"Error getting data from Parameter: {e}") + # return None + # else: + # last_step = step_history[-1] + # # printc(f"last_step: {last_step}", color="yellow") + + # return str(last_step.observation) + + def call(self, *args, **kwargs) -> ReActOutput: + output = self.bicall(*args, **kwargs) + if not isinstance(output, ReActOutput) or not output: + raise ValueError(f"Expected ReActOutput, but got 
{type(output)}") + return output + + def forward(self, *args, **kwargs) -> Parameter: + return self.bicall(*args, **kwargs) + + def _is_step_output_last_step(self, step_output: StepOutput) -> bool: + """Check if the step output is the last step.""" + step_output_data = ( + step_output.data if isinstance(step_output, Parameter) else step_output + ) + if ( + step_output_data + and step_output_data.function + and step_output_data.function.name == "finish" + ): + return True + return False + + def bicall( self, input: str, promt_kwargs: Optional[Dict] = {}, model_kwargs: Optional[Dict] = {}, - ) -> Any: + id: Optional[str] = None, + ) -> Union["Parameter", ReActOutput]: r"""prompt_kwargs: additional prompt kwargs to either replace or add to the preset prompt kwargs.""" - prompt_kwargs = {**promt_kwargs, "input_str": input} + # initialize step_history in both training and eval mode + # step_history = None + + # if self.training: + # step_history = Parameter( + # data=[], + # param_type=ParameterType.INPUT, + # name="step_history", + # requires_opt=True, + # data_in_prompt=map_step_history_to_prompt, + # ) + # else: + # step_history = [] + + # set up the prompts + prompt_kwargs = { + **promt_kwargs, + "input_str": input, + } + + step_history: List[Union[StepOutput, Parameter]] = [] + printc(f"input_query: {input}", color="red") for i in range(self.max_steps): step = i + 1 try: - self._run_one_step(step, prompt_kwargs, model_kwargs) - if ( - self.step_history[-1].function - and self.step_history[-1].function.name == "finish" - ): + step_output = self._run_one_step( + step, prompt_kwargs, model_kwargs, id, step_history + ) + if isinstance(step_output, Parameter): + step_output.data_id = id + step_history.append(step_output) + if self._check_last_step(step_history): break except Exception as e: log.error(f"Error running step {step}: {e}") + printc(f"Error running step {step}: {e}", color="red") + raise e # the only place to raise the error for debugging. 
In normal cases, the agent should not raise an error. - answer = self.step_history[-1].observation - printc(f"answer:\n {answer}", color="green") - log.info(f"step_history: {self.step_history}") - self.reset() - return answer + answer = self._get_answer(step_history) + if self.training: + return answer + # wrap the output + output = ReActOutput(step_history=step_history, id=id, answer=answer) + if self.debug: + printc(f"answer: {output}", color="yellow") + + return output def _extra_repr(self) -> str: s = f"max_steps={self.max_steps}, add_llm_as_fallback={self.add_llm_as_fallback}, " return s + + +if __name__ == "__main__": + from adalflow.components.model_client import OpenAIClient + from adalflow.utils import setup_env + from adalflow.core.func_tool import FunctionTool + + setup_env() + + class App(Component): + def __init__(self): + super().__init__() + self.llm_tool = Generator( + model_client=OpenAIClient(), + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + + def llm_as_tool(input: str, id: Optional[str] = None) -> str: + """Used as a calculator tool.""" + printc(f"llm_as_tool: {input}", color="yellow") + + return self.llm_tool(prompt_kwargs={"input_str": input}, id=id) + + self.react_agent = ReActAgent( + tools=[FunctionTool(llm_as_tool, component=self.llm_tool)], + max_steps=2, + add_llm_as_fallback=False, + model_client=OpenAIClient(), + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + + def call(self, input: str, id: Optional[str] = None) -> Union[str, "Parameter"]: + return self.react_agent(input, id=id) + + def forward( + self, input: str, id: Optional[str] = None + ) -> Union[str, "Parameter"]: + return self.react_agent(input, id=id) + + # print(OutputParameter.__mro__) + + app = App() + app.train() + output = app("I want to multiply 3 and 4.", id="123") + # print(output) + printc(output, color="yellow") + output.draw_graph() diff --git a/adalflow/adalflow/components/agent/react_v2.py b/adalflow/adalflow/components/agent/react_v2.py new file mode 100644 
index 00000000..00027f51 --- /dev/null +++ b/adalflow/adalflow/components/agent/react_v2.py @@ -0,0 +1,568 @@ +"""Implementation and optimization of React agent.""" + +from typing import List, Union, Callable, Optional, Any, Dict +from dataclasses import dataclass, field +from adalflow.core.base_data_class import DataClass +from copy import deepcopy +import logging +import warnings + + +from adalflow.core.generator import Generator +from adalflow.optim.grad_component import GradComponent +from adalflow.optim.parameter import Parameter, ParameterType +from adalflow.core.func_tool import FunctionTool, AsyncCallable +from adalflow.core.tool_manager import ToolManager +from adalflow.components.output_parsers import JsonOutputParser +from adalflow.core.types import ( + StepOutput, + GeneratorOutput, + Function, + FunctionOutput, + FunctionExpression, +) +from adalflow.core.model_client import ModelClient +from adalflow.utils.logger import printc + + +log = logging.getLogger(__name__) + +__all__ = ["DEFAULT_REACT_AGENT_SYSTEM_PROMPT", "ReActAgent"] + + +react_agent_task_desc = r"""{# role/task description #} +You are a helpful assistant. +Answer the user's query using the tools provided below with minimal steps and maximum accuracy. +{# REACT instructions #} +Each step you will read the previous Thought, Action, and Observation(execution result of the action) and then provide the next Thought and Action. + + +{# Task specification to teach the agent how to think using 'divide and conquer' strategy #} +- For simple queries: Directly call the ``finish`` action and provide the answer. +- For complex queries: + - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery. + - Call one available tool at a time to solve each subquery/subquestion. \ + - At step 'finish', join all subqueries answers and finish the task. +Remember: +- Action must call one of the above tools with name. It can not be empty. 
+- You will always end with 'finish' action to finish the task. The answer can be the final answer or failure message. + +""" + +DEFAULT_REACT_AGENT_SYSTEM_PROMPT = r""" +{{react_agent_task_desc}} +{# Tools #} +{% if tools %} + +You available tools are: +{% for tool in tools %} +{{ loop.index }}. +{{tool}} +------------------------ +{% endfor %} + +{% endif %} +{# Context Variables #} +{% if context_variables %} + +You have access to context_variables with the following keys: +{% for key, value in context_variables.items() %} +{{ key }} +------------------------ +{% endfor %} +You can either pass context_variables or context_variables['key'] to the tools depending on the tool's requirements. + +{% endif %} +{# output format and examples for output format #} + +{{output_format_str}} + + +----------------- + +User query: +{{ input_str }} +{# Step History #} +{% if step_history %} + +Your previous steps: +{% for history in step_history %} +Step {{ loop.index }}. +"Thought": "{{history.action.thought}}", +"Action": "{{history.action.action}}", +"Observation": "{{history.observation}}" +------------------------ +{% endfor %} + +{% endif %} + +""" + +# We have parameters react_agent_task_desc, tools, output_format_str, input_str, step_history +# react_agent_task_desc is trainable per use case +# step_history is a list to track the history, where each time it will be updated with the current step output + + +class AppendStepHistory(GradComponent): + def __init__(self): + super().__init__() + self.name = "AppendStepHistory" + self._component_desc = "Append the step_output to the step_history." 
+ + def call( + self, step_output: StepOutput, step_history: List[StepOutput] + ) -> List[StepOutput]: + """Append the step_output to the step_history.""" + if not step_history: + step_history = [] + # make a copy step_history for better tracking + step_history = deepcopy(step_history) + + step_history.append(step_output) + # printc(f"step_history: {step_history}", color="yellow") + return step_history + + +class ExecuteAction(GradComponent): + def __init__(self): + super().__init__() + self.name = "ExecuteAction" + self._component_desc = "Execute the action and output the new step_output." + + def call( + self, + response: GeneratorOutput, + step_output: StepOutput, + execute_action: Callable, + id: Optional[str] = None, + ) -> StepOutput: + """Parse the action string to a function call and execute it. Update the action_step with the result.""" + step = step_output.step + output = execute_action_fn(response, step_output, step, execute_action, id) + if isinstance(output, Parameter): + output = output.full_response + return output + + +class FunctionOutputToStepOutput(GradComponent): + def __init__(self): + super().__init__() + self.name = "FunctionOutputToStepOutput" + self._component_desc = "Convert the FunctionOutput to StepOutput." + + def call(self, output: FunctionOutput, step_output: StepOutput) -> StepOutput: + """Convert the FunctionOutput to StepOutput.""" + + temp_result = output.output + if isinstance(temp_result, Parameter): + step_output.observation = temp_result.data + else: + step_output.observation = temp_result + return step_output + # step_output = StepOutput(step=step) + # step_output.observation = output.output + # return step_output + + +# TODO: make execute_action_fn to a GradComponent to enable the training of the tools too. 
+def execute_action_fn( + x: GeneratorOutput, step_output: StepOutput, step: int, execute_action: Any, id=None +) -> StepOutput: + """Execute the action and update the step_output.""" + if x.error: + error_msg = f"Error planning step {step}: {x.error}" + step_output.observation = error_msg + log.error(error_msg) + else: + try: + fun_expr: FunctionExpression = x.data + step_output.action = fun_expr + log.debug(f"Step {step}: {fun_expr}") + + if step_output and step_output.action: + step_output = execute_action(step_output, id) + printc(f"Step {step}: \n{step_output}\n_______\n", color="blue") + return step_output + else: + printc(f"Failed to parse response for step {step}", color="red") + log.error(f"Failed to parse response for step {step}") + return step_output + except Exception as e: + error_msg = f"Error parsing response for step {step}: {e}" + step_output.observation = error_msg + log.error(error_msg) + printc(error_msg, color="red") + return step_output + + +@dataclass +class ReActOutput(DataClass): + r"""Similar to GeneratorOutput, but with additional step history and final answer.""" + + id: Optional[str] = field( + default=None, metadata={"desc": "The unique id of the output"} + ) + step_history: List[StepOutput] = field( + metadata={"desc": "The history of steps."}, default_factory=list + ) + + answer: Any = field(metadata={"desc": "The final answer."}, default=None) + + +class ReActAgent(GradComponent): + __doc__ = r"""ReActAgent uses generator as a planner that runs multiple and sequential functional call steps to generate the final response. + + Users need to set up: + - tools: a list of tools to use to complete the task. Each tool is a function or a function tool. + - max_steps: the maximum number of steps the agent can take to complete the task. + - use_llm_as_fallback: a boolean to decide whether to use an additional LLM model as a fallback tool to answer the query. + - model_client: the model client to use to generate the response. 
+ - model_kwargs: the model kwargs to use to generate the response. + - template: the template to use to generate the prompt. Default is DEFAULT_REACT_AGENT_SYSTEM_PROMPT. + + For the generator, the default arguments are: + (1) default prompt: DEFAULT_REACT_AGENT_SYSTEM_PROMPT + (2) default output_processors: JsonParser + + There are `examples` which is optional, a list of string examples in the prompt. + + Example: + + .. code-block:: python + + from core.openai_client import OpenAIClient + from components.agent.react import ReActAgent + from core.func_tool import FunctionTool + # define the tools + def multiply(a: int, b: int) -> int: + '''Multiply two numbers.''' + return a * b + def add(a: int, b: int) -> int: + '''Add two numbers.''' + return a + b + agent = ReActAgent( + tools=[multiply, add], + model_client=OpenAIClient(), + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + + # Using examples: + + call_multiply = FunctionExpression.from_function( + thought="I want to multiply 3 and 4.", + + + + Reference: + [1] https://arxiv.org/abs/2210.03629, published in Mar, 2023. + """ + + # TODO: allow users to pass in a few examples. Need to be a list of FunctionExpression instances. 
+ def __init__( + self, + # added arguments specifc to React + tools: List[Union[Callable, AsyncCallable, FunctionTool]] = [], + max_steps: int = 10, + add_llm_as_fallback: bool = True, + # TODO: the examples are just for specifying the output format, not end to end input-output examples, need further optimization + # examples: List[FunctionExpression] = [], + examples: Union[List[FunctionExpression], List[str]] = [], + *, + # the following arguments are mainly for the planner + model_client: ModelClient, + model_kwargs: Dict = {}, + # template for the planner + template: Optional[str] = None, # allow users to customize the template + context_variables: Optional[Dict] = None, # context variables + ): + super().__init__() + template = template or DEFAULT_REACT_AGENT_SYSTEM_PROMPT + + self.max_steps = max_steps + + self.add_llm_as_fallback = add_llm_as_fallback + self.context_variables = context_variables + + self._init_tools(tools, model_client, model_kwargs) + + ouput_data_class = FunctionExpression + example = FunctionExpression.from_function( + thought="I have finished the task.", + func=self._finish, + answer="final answer: 'answer'", + ) + self._examples = examples + [example] + + output_parser = JsonOutputParser( + data_class=ouput_data_class, examples=self._examples, return_data_class=True + ) + prompt_kwargs = { + "tools": self.tool_manager.yaml_definitions, + "output_format_str": output_parser.format_instructions(), + "react_agent_task_desc": Parameter( + name="react_agent_task_desc", + data=react_agent_task_desc, + role_desc="Task description for the ReAct agent which functions as a planner using a Large Language Model.", + param_type=ParameterType.PROMPT, + requires_opt=True, + ), + "context_variables": self.context_variables, + } + self.planner = Generator( + template=template, + prompt_kwargs=prompt_kwargs, + output_processors=output_parser, + model_client=model_client, + model_kwargs=model_kwargs, + ) + + # added this component to the computation graph 
+ self.append_step_history = AppendStepHistory() + self.execute_action = ExecuteAction() + self.function_output_to_step_output = FunctionOutputToStepOutput() + + def _init_tools( + self, + tools: List[Union[Callable, AsyncCallable, FunctionTool]], + model_client: ModelClient, + model_kwargs: Dict, + ): + r"""Initialize the tools.""" + tools = deepcopy(tools) + _additional_llm_tool = ( + Generator(model_client=model_client, model_kwargs=model_kwargs) + if self.add_llm_as_fallback + else None + ) + + def llm_tool(input: str, **kwargs) -> str: + """I answer any input query with llm's world knowledge. Use me as a fallback tool or when the query is simple.""" + try: + output: GeneratorOutput = _additional_llm_tool( + prompt_kwargs={"input_str": input} + ) + response = output.data if output else None + return response + except Exception as e: + log.error(f"Error using the generator: {e}") + print(f"Error using the generator: {e}") + + return None + + def finish(answer: str, **kwargs) -> str: + """Finish the task with answer.""" + return answer + + self._finish = finish + + if self.add_llm_as_fallback: + tools.append(llm_tool) + tools.append(finish) + self.tool_manager: ToolManager = ToolManager( + tools=tools, + additional_context={"context_variables": self.context_variables}, + ) + + # TODO: add async execution + def _execute_action( + self, action_step: StepOutput, id: Optional[str] = None + ) -> Optional[StepOutput]: + """Parse the action string to a function call and execute it. 
Update the action_step with the result.""" + action = action_step.action + try: + + fun: Function = self.tool_manager.parse_func_expr(action) + # replace the id + fun.kwargs["id"] = id + + result: Union[FunctionOutput, Parameter] = self.tool_manager(fun) + action_step.function = fun + if isinstance(result, Parameter): + result.add_successor_map_fn( + successor=self.function_output_to_step_output, + map_fn=lambda x: x.data, + ) + action_step: StepOutput = self.function_output_to_step_output( + output=result, step_output=action_step + ) + elif isinstance(result, FunctionOutput): + action_step.observation = result.output + else: + warnings.warn(f"Fails to parse the result: {result}") + action_step.observation = result + + return action_step + except Exception as e: + log.error(f"Error executing {action}: {e}") + # pass the error as observation so that the agent can continue and correct the error in the next step + action_step.observation = f"Error executing {action}: {e}" + return action_step + + def _run_one_step( + self, + step: int, + prompt_kwargs: Dict, + model_kwargs: Dict, + id: Optional[str] = None, + step_history: Union["Parameter", List[str]] = None, + ) -> Union[StepOutput, Parameter]: + """Run one step of the agent. Plan and execute the action for the step. + Need to deal with both train and eval mode on the self.planner. 
+ """ + + prompt_kwargs["step_history"] = step_history + + log.debug( + f"Running step {step} with prompt: {self.planner.prompt(**prompt_kwargs)}" + ) + + response: Union[GeneratorOutput, Parameter] = self.planner( + prompt_kwargs=prompt_kwargs, model_kwargs=model_kwargs, id=id + ) + + # create a new step output + step_output: StepOutput = StepOutput(step=step) + + # connecting two generators in the computation graph, it will set up self.step_history + if isinstance(response, Parameter): + # get the full response + def map_fn(x: Parameter) -> GeneratorOutput: + return x.full_response + + response.add_successor_map_fn(successor=self.execute_action, map_fn=map_fn) + + step_output: Parameter = self.execute_action.forward( + response, step_output, self._execute_action, id + ) + step_output.add_successor_map_fn( + successor=self.append_step_history, map_fn=lambda x: x.data + ) + + step_history = self.append_step_history.forward(step_output, step_history) + # connect step_history to the next planner + step_history.add_successor_map_fn( + successor=self.planner, map_fn=lambda x: x.data + ) + # convert step history back to data + printc(f"step_history: {step_history.data}", color="yellow") + return step_history + + else: + step_output = execute_action_fn( + response, step_output, step, self._execute_action, id + ) + step_history.append(step_output) + return step_history + + def _check_last_step( + self, step_history: Union["Parameter", List[str]] = None + ) -> bool: + """Check if the last step is the finish step.""" + if not step_history: + return True + + last_step: StepOutput = None + if isinstance(step_history, Parameter): + try: + step_history = step_history.data + last_step = step_history[-1] + + except Exception as e: + log.error(f"Error getting data from Parameter: {e}") + return False + else: + last_step = step_history[-1] + + if last_step and last_step.function and last_step.function.name == "finish": + return True + return False + + def _get_answer( + self, 
step_history: Union["Parameter", List[str]] = None + ) -> Union[str, "Parameter"]: + """Get the final answer from the step history.""" + if not step_history: + return None + + last_step: StepOutput = None + if isinstance(step_history, Parameter): + try: + return step_history + + except Exception as e: + log.error(f"Error getting data from Parameter: {e}") + return None + else: + last_step = step_history[-1] + + return last_step.observation + + def call(self, *args, **kwargs): + return self.bicall(*args, **kwargs) + + def forward(self, *args, **kwargs) -> Parameter: + return self.bicall(*args, **kwargs) + + def _is_step_output_last_step(self, step_output: StepOutput) -> bool: + """Check if the step output is the last step.""" + step_output_data = ( + step_output.data if isinstance(step_output, Parameter) else step_output + ) + if ( + step_output_data + and step_output_data.function + and step_output_data.function.name == "finish" + ): + return True + return False + + def bicall( + self, + input: str, + promt_kwargs: Optional[Dict] = {}, + model_kwargs: Optional[Dict] = {}, + id: Optional[str] = None, + ) -> Union["Parameter", ReActOutput]: + r"""prompt_kwargs: additional prompt kwargs to either replace or add to the preset prompt kwargs.""" + # initialize step_history + step_history = None + if self.training: + step_history = Parameter( + data=[], + param_type=ParameterType.INPUT, + name="step_history", + requires_opt=True, + ) + else: + step_history = [] + + # set up the prompts + prompt_kwargs = { + **promt_kwargs, + "input_str": input, + } + + printc(f"input_query: {input}", color="red") + for i in range(self.max_steps): + step = i + 1 + try: + step_history = self._run_one_step( + step, prompt_kwargs, model_kwargs, id, step_history + ) + + if self._check_last_step(step_history): + break + + except Exception as e: + log.error(f"Error running step {step}: {e}") + + answer = self._get_answer(step_history) + if self.training: + return answer + # wrap the output + 
output = ReActOutput(step_history=step_history, id=id, answer=answer) + return output + + def _extra_repr(self) -> str: + s = f"max_steps={self.max_steps}, add_llm_as_fallback={self.add_llm_as_fallback}, " + return s diff --git a/adalflow/adalflow/components/model_client/openai_client.py b/adalflow/adalflow/components/model_client/openai_client.py index c3750667..1e22d5ef 100644 --- a/adalflow/adalflow/components/model_client/openai_client.py +++ b/adalflow/adalflow/components/model_client/openai_client.py @@ -310,15 +310,19 @@ def convert_inputs_to_api_kwargs( raise ValueError("model must be specified for image generation") # Set defaults for DALL-E 3 if not specified final_model_kwargs["size"] = final_model_kwargs.get("size", "1024x1024") - final_model_kwargs["quality"] = final_model_kwargs.get("quality", "standard") + final_model_kwargs["quality"] = final_model_kwargs.get( + "quality", "standard" + ) final_model_kwargs["n"] = final_model_kwargs.get("n", 1) - final_model_kwargs["response_format"] = final_model_kwargs.get("response_format", "url") + final_model_kwargs["response_format"] = final_model_kwargs.get( + "response_format", "url" + ) # Handle image edits and variations image = final_model_kwargs.get("image") if isinstance(image, str) and os.path.isfile(image): final_model_kwargs["image"] = self._encode_image(image) - + mask = final_model_kwargs.get("mask") if isinstance(mask, str) and os.path.isfile(mask): final_model_kwargs["mask"] = self._encode_image(mask) @@ -340,11 +344,7 @@ def parse_image_generation_response(self, response: List[Image]) -> GeneratorOut ) except Exception as e: log.error(f"Error parsing image generation response: {e}") - return GeneratorOutput( - data=None, - error=str(e), - raw_response=str(response) - ) + return GeneratorOutput(data=None, error=str(e), raw_response=str(response)) @backoff.on_exception( backoff.expo, @@ -417,7 +417,9 @@ async def acall( response = await self.async_client.images.edit(**api_kwargs) else: # Image 
variation - response = await self.async_client.images.create_variation(**api_kwargs) + response = await self.async_client.images.create_variation( + **api_kwargs + ) else: # Image generation response = await self.async_client.images.generate(**api_kwargs) diff --git a/adalflow/adalflow/components/output_parsers/dataclass_parser.py b/adalflow/adalflow/components/output_parsers/dataclass_parser.py index 6d2e56dd..dc258686 100644 --- a/adalflow/adalflow/components/output_parsers/dataclass_parser.py +++ b/adalflow/adalflow/components/output_parsers/dataclass_parser.py @@ -4,12 +4,12 @@ from typing import Any, Literal, List, Optional import logging -from adalflow.core.component import Component from adalflow.core.prompt_builder import Prompt -from adalflow.core.string_parser import YamlParser, JsonParser +from adalflow.core.string_parser import YamlParser, JsonParser, Parser from adalflow.core.base_data_class import DataClass, DataClassFormatType from adalflow.core.base_data_class import ExcludeType, IncludeType + __all__ = ["DataClassParser"] log = logging.getLogger(__name__) @@ -42,7 +42,7 @@ """ -class DataClassParser(Component): +class DataClassParser(Parser): __doc__ = r"""Made the structured output even simpler compared with JsonOutputParser and YamlOutputParser. 1. Understands __input_fields__ and __output_fields__ from the DataClass (no need to use include/exclude to decide fields). 
@@ -166,6 +166,9 @@ def get_examples_str( examples_str = Prompt(template=EXAMPLES_FORMAT)(examples=str_examples) return examples_str + def __call__(self, *args, **kwargs): + return self.call(*args, **kwargs) + def call(self, input: str) -> Any: r"""Parse the output string to the desired format and return the parsed output.""" try: diff --git a/adalflow/adalflow/components/output_parsers/outputs.py b/adalflow/adalflow/components/output_parsers/outputs.py index 288cba67..82e5e2cc 100644 --- a/adalflow/adalflow/components/output_parsers/outputs.py +++ b/adalflow/adalflow/components/output_parsers/outputs.py @@ -11,9 +11,8 @@ from typing import Dict, Any, Optional, List import logging -from adalflow.core.component import Component from adalflow.core.prompt_builder import Prompt -from adalflow.core.string_parser import YamlParser, ListParser, JsonParser +from adalflow.core.string_parser import YamlParser, ListParser, JsonParser, Parser from adalflow.core.base_data_class import DataClass, DataClassFormatType from adalflow.core.base_data_class import ExcludeType, IncludeType @@ -69,15 +68,19 @@ YAML_OUTPUT_PARSER_OUTPUT_TYPE = Dict[str, Any] -class OutputParser(Component): +class OutputParser(Parser): __doc__ = r"""The abstract class for all output parsers. + On top of the basic string Parser, it handles structured data interaction: + 1. format_instructions: Return the formatted instructions to use in prompt for the output format. + 2. call: Parse the output string to the desired format and return the parsed output via yaml or json. + This interface helps users customize output parsers with consistent interfaces for the Generator. Even though you don't always need to subclass it. - AdalFlow uses two core components: + AdalFlow uses two core classes: 1. the Prompt to format output instruction - 2. A string parser component from core.string_parser for response parsing. + 2. A string parser from core.string_parser for response parsing. 
""" def __init__(self, *args, **kwargs) -> None: @@ -88,6 +91,9 @@ def format_instructions(self) -> str: r"""Return the formatted instructions to use in prompt for the output format.""" raise NotImplementedError("This is an abstract method.") + def __call__(self, *args: Any, **kwds: Any) -> Any: + return self.call(*args, **kwds) + def call(self, input: str) -> Any: r"""Parse the output string to the desired format and return the parsed output.""" raise NotImplementedError("This is an abstract method.") diff --git a/adalflow/adalflow/core/__init__.py b/adalflow/adalflow/core/__init__.py index a4a67c6a..928b3ce5 100644 --- a/adalflow/adalflow/core/__init__.py +++ b/adalflow/adalflow/core/__init__.py @@ -1,12 +1,23 @@ from .base_data_class import DataClass, required_field, DataClassFormatType -from .component import Component, FunComponent, fun_to_component +from .component import Component from .container import Sequential, ComponentList from .db import LocalDB from .default_prompt_template import DEFAULT_ADALFLOW_SYSTEM_PROMPT from .embedder import Embedder, BatchEmbedder from .generator import Generator, BackwardEngine from .model_client import ModelClient +from .string_parser import ( + Parser, + FuncParser, + func_to_parser, + YamlParser, + JsonParser, + IntParser, + FloatParser, + ListParser, + BooleanParser, +) # from .parameter import Parameter from .prompt_builder import Prompt @@ -51,8 +62,6 @@ "Component", "Sequential", "ComponentList", - "FunComponent", - "fun_to_component", "DataClass", "DataClassFormatType", "required_field", @@ -94,6 +103,16 @@ "DialogTurn", "Conversation", "Tokenizer", + # Parsers + "Parser", + "FuncParser", + "func_to_parser", + "YamlParser", + "JsonParser", + "IntParser", + "FloatParser", + "ListParser", + "BooleanParser", ] for name in __all__: diff --git a/adalflow/adalflow/core/base_data_class.py b/adalflow/adalflow/core/base_data_class.py index 1a379724..543a1090 100644 --- a/adalflow/adalflow/core/base_data_class.py +++ 
b/adalflow/adalflow/core/base_data_class.py @@ -292,7 +292,9 @@ class TrecDataList(DataClass): # {'data': [{'question': 'What is the capital of France?'}]} """ if not is_dataclass(self): - raise ValueError("to_dict() called on a class type, not an instance.") + raise ValueError( + f"to_dict() is not called on a dataclass instance: {self.__class__}. You might forget to use @dataclass decorator." + ) # convert all fields to its data if its parameter fields = self.__dataclass_fields__ from adalflow.optim.parameter import Parameter diff --git a/adalflow/adalflow/core/component.py b/adalflow/adalflow/core/component.py index d0dd6631..30399a11 100644 --- a/adalflow/adalflow/core/component.py +++ b/adalflow/adalflow/core/component.py @@ -3,7 +3,6 @@ from collections import OrderedDict, namedtuple from typing import ( - Callable, Dict, Any, Optional, @@ -394,8 +393,8 @@ def register_parameter( raise ValueError('parameter name can\'t contain "."') elif name == "": raise ValueError('parameter name can\'t be empty string ""') - elif hasattr(self, name) and name not in self._parameters: - raise KeyError("attribute '{}' already exists".format(name)) + # elif hasattr(self, name) and name not in self._parameters: + # raise KeyError("attribute '{}' already exists".format(name)) if param is None: self._parameters[name] = None @@ -519,17 +518,85 @@ def named_parameters( # ) # plt.show() - # TODO: do we need to disable this format of calling instead use call and acall extensively? - def __call__(self, *args, **kwargs): - r"""In default, we use sync call.""" - output = self.call(*args, **kwargs) - return output + def forward(self, *args, **kwargs): + """ + User must override this for the training scenario + if bicall is not defined. + """ + raise NotImplementedError("Subclasses must implement `forward` or `bicall`.") def call(self, *args, **kwargs): + """ + User must override this for the inference scenario + if bicall is not defined. 
+ """ + if self._has_bicall(): + output = self.bicall(*args, **kwargs) + return output + raise NotImplementedError("Subclasses must implement `call` or `bicall`.") + + def bicall(self, *args, **kwargs): + """ + If the user provides a `bicall` method, then `__call__` will automatically + dispatch here for both training and inference scenarios. This can internally + decide how to handle training vs. inference, or just produce a single unified + output type. + """ + # Default fallback if not overridden raise NotImplementedError( - f"Component {type(self).__name__} is missing the required 'call' method." + "Optional method. Implement to handle both scenarios in one place." ) + def __call__(self, *args, **kwargs): + # 1. If `bicall` is defined by the user, use it + # and let the `bicall` implementation handle + # the difference between training vs. inference. + from adalflow.optim.parameter import Parameter + + if self._has_bicall(): + output = self.bicall(*args, **kwargs) + + # Validation checks based on training or inference + if self.training: + # Ensure output is a Parameter in training + if not isinstance(output, Parameter): + raise ValueError( + f"Output should be of type Parameter in training mode, but got {type(output)}" + ) + else: + # Ensure output is not a Parameter in inference + if isinstance(output, Parameter): + raise ValueError( + f"Output should not be of type Parameter in inference mode, but got {type(output)}" + ) + return output + + # 2. 
Otherwise, if `bicall` is not defined, fall back to forward / call + if self.training: + output = self.forward(*args, **kwargs) + # Validation for training + if not isinstance(output, Parameter): + raise ValueError( + f"Output should be of type Parameter in training mode, but got {type(output)}" + ) + return output + else: + output = self.call(*args, **kwargs) + # Validation for inference + if isinstance(output, Parameter): + raise ValueError( + f"Output should not be of type Parameter in inference mode, but got {type(output)}" + ) + return output + + def _has_bicall(self): + """ + Helper method to check if this subclass has overridden bicall. + """ + # The default `bicall` in this class raises NotImplementedError, + # so we can check if the method is still the same one as in `MyModule`. + return self.bicall.__func__ is not Component.bicall + async def acall(self, *args, **kwargs): r"""API call, file io.""" pass @@ -890,6 +957,18 @@ def _get_name(self): def __repr__(self): # We treat the extra repr like the sub-module, one item per line extra_lines = [] + # add training mode + status = "" + if self.training: + status = "training: True" + else: + status = "training: False" + # add teacher mode + if self.teacher_mode: + status += ", teacher_mode: True" + else: + status += ", teacher_mode: False" + extra_lines.append(status) extra_repr = self._extra_repr() # empty string will be split into list [''] if extra_repr: @@ -928,76 +1007,6 @@ def _get_init_args(self, *args, **kwargs) -> Dict[str, Any]: return init_args -# TODO: support async call -class FunComponent(Component): - r"""Component that wraps a function. - - Args: - fun (Callable): The function to be wrapped. 
- - Examples: - - function = lambda x: x + 1 - fun_component = FunComponent(function) - print(fun_component(1)) # 2 - """ - - def __init__(self, fun: Optional[Callable] = None, afun: Optional[Callable] = None): - super().__init__() - self.fun_name = fun.__name__ - EntityMapping.register(self.fun_name, fun) - - def call(self, *args, **kwargs): - fun = EntityMapping.get(self.fun_name) - return fun(*args, **kwargs) - - def _extra_repr(self) -> str: - return super()._extra_repr() + f"fun_name={self.fun_name}" - - -def fun_to_component(fun) -> FunComponent: - r"""Helper function to convert a function into a Component with - its own class name. - - Can be used as both a decorator and a function. - - Args: - fun (Callable): The function to be wrapped. - Returns: - FunComponent: The component that wraps the function. - - Examples: - 1. As a decorator: - >>> @fun_to_component - >>> def my_function(x): - >>> return x + 1 - >>> # is equivalent to - >>> class MyFunctionComponent(FunComponent): - >>> def __init__(self): - >>> super().__init__(my_function) - - 2. 
As a function: - >>> my_function_component = fun_to_component(my_function) - """ - - # Split the function name by underscores, capitalize each part, and join them back together - class_name = ( - "".join(part.capitalize() for part in fun.__name__.split("_")) + "Component" - ) - # register the function - EntityMapping.register(fun.__name__, fun) - # Define a new component class dynamically - component_class = type( - class_name, - (FunComponent,), - {"__init__": lambda self: FunComponent.__init__(self, fun)}, - ) - # register the component - EntityMapping.register(class_name, component_class) - - return component_class() - - # TODO: not used yet, will further investigate dict mode # class ComponentDict(Component): # r""" diff --git a/adalflow/adalflow/core/func_tool.py b/adalflow/adalflow/core/func_tool.py index 62d4f3fe..06194ef7 100644 --- a/adalflow/adalflow/core/func_tool.py +++ b/adalflow/adalflow/core/func_tool.py @@ -4,7 +4,8 @@ """ from typing import Any, Optional, Callable, Awaitable, Union -from inspect import iscoroutinefunction +from inspect import iscoroutinefunction, ismethod, isfunction +import inspect import logging import asyncio import nest_asyncio @@ -16,9 +17,12 @@ Function, ) from adalflow.core import Component +from adalflow.optim.parameter import Parameter +from adalflow.optim.grad_component import FunGradComponent from adalflow.core.functional import ( get_fun_schema, ) +from adalflow.utils import printc from inspect import signature AsyncCallable = Callable[..., Awaitable[Any]] @@ -37,6 +41,21 @@ def is_running_in_event_loop() -> bool: return False +def find_instance_name_from_self(instance): + """ + Attempt to find the variable name of the instance in the calling context. + + :param instance: The instance to find the name for. + :return: The variable name of the instance, if found; otherwise, None. 
+ """ + # Inspect the calling stack frame + frame = inspect.stack()[2].frame + for var_name, var_obj in frame.f_locals.items(): + if var_obj is instance: + return var_name + return None + + FunctionType = Union[Callable[..., Any], Awaitable[Callable[..., Any]]] @@ -49,6 +68,32 @@ class FunctionTool(Component): Function be used by LLM as a tool to achieve a specific task. + What function can you pass as a tool? + 1. Any unbound function you wrote outside of a class. + 2. Any class method you wrote in your component. It can call `self` and other methods inside of your component. + 3. When the function is using a trainable component, and you can directly use the component's method as a tool or wrap it in a function. But you need to make sure to pass the component to the tool. + + Here are some examples: + + .. code-block:: python + + from adalflow.core.func_tool import FunctionTool + class AgenticRAG(GradComponent): + def __init__(self, ...): + super().__init__() + self.retriever = Retriever() + self.llm = Generator() + + def retriever_as_tool(input: str) -> str: + r"Used as a retriever tool." + return self.retriever(input) + + tools = [FunctionTool(retriever_as_tool, component=self.retriever), + FunctionTool(self.llm.__call__, component=self.llm)] + # if you have trainable component, this will ensure it can be trained together with your whole task pipeline + # if you dont want to train them and simply treating them as a tool, you can call like this + # tools = [FunctionTool(retriever_as_tool), FunctionTool(self.llm.__call__, component=self.llm)] + Features: - Supports both synchronous and asynchronous functions via ``call`` and ``acall``. - Creates a FunctionDefinition from the function using ``get_fun_schema``. @@ -63,21 +108,35 @@ class FunctionTool(Component): - via sandboxed execute directionly using ``sandbox_exec``. + A FunctionTool allows other GradComponent(as a tool) to pass through correctly. 
""" def __init__( self, fn: FunctionType, + component: Optional[Component] = None, definition: Optional[FunctionDefinition] = None, ): - super().__init__() + super().__init__( + name="FunctionTool", desc="A component calls and executes a function." + ) nest_asyncio.apply() assert fn is not None, "fn must be provided" - self.fn = fn - self._is_async = iscoroutinefunction(fn) + # self.fn = fn # it can be a function or component + self.component = component # pass it here to control the training mode - self.definition = definition or self._create_fn_definition() + if isinstance(fn, Component): + self.fn = fn.__call__ + else: + self.fn = fn + self._is_async = iscoroutinefunction(fn) + if isinstance(fn, Component): + self.definition = ( + definition or self._create_fn_definition_for_grad_component(fn) + ) + else: + self.definition = definition or self._create_fn_definition() if self._is_async: log.info(f"FunctionTool: {fn} is async: {self._is_async}") @@ -85,22 +144,103 @@ def __init__( def is_async(self) -> bool: return self._is_async + def _create_fn_definition_for_grad_component( + self, fn: FunGradComponent + ) -> FunctionDefinition: + name = fn.fun_name + docstring = fn.doc_string + signature_str = str(signature(fn.fun)) + instance = None + return FunctionDefinition( + func_name=name, + func_desc=( + f"{name}{signature_str}\nDocstring:{docstring}" + if isinstance(docstring, str) + else f"{name}{signature_str}\nDocstring:{docstring.data}" + ), + func_parameters=get_fun_schema(name, fn.fun), + class_instance=instance, + ) + def _create_fn_definition(self) -> FunctionDefinition: + name = self.fn.__name__ docstring = self.fn.__doc__ - description = f"{docstring}" - description = f"{name}{signature(self.fn)}\n{docstring}" - # description = f"{name}{signature(self.fn)}\n{docstring}" + signature_str = str(signature(self.fn)) + + # Get the class that owns the method, if applicable + cls_name = None + instance = None + if ismethod(self.fn): # Check if it’s a bound method + 
instance = self.fn.__self__ + instance = find_instance_name_from_self(instance) + if name == "__call__" and not instance: + raise ValueError( + "Please provide a name for the instance in the calling context" + ) + cls_name = self.fn.__self__.__class__.__name__ + elif isfunction(self.fn): # Unbound method + cls_name = self.fn.__qualname__.split(".")[0] + + # Build the description + description = f"{name}{signature_str}\n" + if cls_name: + description += f"Belongs to class: {cls_name}\n" + if docstring: + description += f"Docstring: {docstring}\n" + + # Get function parameters schema fn_parameters = get_fun_schema(name, self.fn) + return FunctionDefinition( - func_name=name, func_desc=description, func_parameters=fn_parameters + func_name=name, + func_desc=description, + func_parameters=fn_parameters, + class_instance=instance, ) + def forward(self, *args, **kwargs) -> Parameter: + r"""Forward the function tool.""" + return self.bicall(*args, **kwargs) + def call(self, *args: Any, **kwargs: Any) -> FunctionOutput: r"""Execute the function synchronously. Example: + .. code-block:: python + + import time + def sync_function_1(): + time.sleep(1) + return "Function 1 completed" + + tool_1 = FunctionTool(sync_function_1) + output = tool_1.call() + """ + return self.bicall(*args, **kwargs) + # if self._is_async: + # raise ValueError("FunctionTool is asynchronous, use acall instead") + # output, error = None, None + # try: + # output = self.fn(*args, **kwargs) + # except Exception as e: + # log.error(f"Error at calling {self.fn}: {e}") + # # raise ValueError(f"Error: {e}") + # error = str(e) + # return FunctionOutput( + # name=self.definition.func_name, + # # raw_input={"args": args, "kwargs": kwargs}, + # input=Function(name=self.definition.func_name, args=args, kwargs=kwargs), + # output=output, + # error=error, + # ) + + def bicall(self, *args: Any, **kwargs: Any) -> Union[FunctionOutput, Parameter]: + r"""Execute the function synchronously. + + Example: + .. 
code-block:: python import time @@ -114,12 +254,33 @@ def sync_function_1(): if self._is_async: raise ValueError("FunctionTool is asynchronous, use acall instead") output, error = None, None + + # NOTE: special case: + # self.fn can have both train and eval mode or untrainable as a function. try: + # printc(f"args: {args}, kwargs: {kwargs}, fn: {self.fn}", color="yellow") output = self.fn(*args, **kwargs) + # printc(f"output 1: {output}", color="yellow") except Exception as e: log.error(f"Error at calling {self.fn}: {e}") - # raise ValueError(f"Error: {e}") - error = str(e) + error = f"Error at calling {self.fn}: {e}" + + if isinstance(output, Parameter): + if not self.training: + raise ValueError( + f"FunctionTool {self.definition.func_name} is in eval mode, but the output is Parameter" + ) + output.data = FunctionOutput( + name=self.definition.func_name, + # raw_input={"args": args, "kwargs": kwargs}, + input=Function( + name=self.definition.func_name, args=args, kwargs=kwargs + ), + output=output.data, + error=error, + ) + return output + # printc(f"output: {output}", color="yellow") return FunctionOutput( name=self.definition.func_name, # raw_input={"args": args, "kwargs": kwargs}, @@ -249,9 +410,9 @@ async def run_sync_and_async_mix(): return result - def __call__(self, *args, **kwargs) -> FunctionOutput: - r"""Execute the function synchronously or asynchronously based on the function type.""" - return self.execute(*args, **kwargs) + # def __call__(self, *args, **kwargs) -> FunctionOutput: + # r"""Execute the function synchronously or asynchronously based on the function type.""" + # return self.execute(*args, **kwargs) def _extra_repr(self) -> str: s = f"fn: {self.fn}, async: {self._is_async}, definition: {self.definition}" @@ -260,61 +421,123 @@ def _extra_repr(self) -> str: if __name__ == "__main__": - import asyncio - import time - - async def async_function_1(): - await asyncio.sleep(1) - return "Function 1 completed" - - def sync_function_1(): - 
time.sleep(1) - return "Function 1 completed" - - async def async_function_2(): - await asyncio.sleep(2) - return "Function 2 completed" - - def sync_function_2(): - time.sleep(2) - return "Function 2 completed" - - async_tool_1 = FunctionTool(async_function_1) - sync_tool_1 = FunctionTool(sync_function_2) - async_tool_2 = FunctionTool(async_function_2) - sync_tool_2 = FunctionTool(sync_function_2) - - def run_sync_and_async_mix_without_wait(): - # both sync and async tool can use execute - # sync tool can also use call - # takes 5 seconds (1+1+2) + overhead - start_time = time.time() - results = [ - async_tool_1.execute(), - sync_tool_1.execute(), - sync_tool_2.call(), - ] - print(results) - end_time = time.time() - print(f"run_sync_and_async_mix_without_wait time: {end_time - start_time}") - return results - - async def run_sync_and_async_mix(): - # both sync and async tool can use execute&to_thread - # async tool can also use acall without to_thread - # takes a bit over 2 seconds max(2) - start_time = time.time() - results = await asyncio.gather( - async_tool_1.execute(), - sync_tool_1.execute(), - async_tool_2.acall(), - ) - print(results) - end_time = time.time() - print(f"run_sync_and_async_mix time: {end_time - start_time}") - return results - - print(async_tool_1.execute()) - - run_sync_and_async_mix_without_wait() - asyncio.run(run_sync_and_async_mix()) + # import asyncio + # import time + + # async def async_function_1(): + # await asyncio.sleep(1) + # return "Function 1 completed" + + # def sync_function_1(): + # time.sleep(1) + # return "Function 1 completed" + + # async def async_function_2(): + # await asyncio.sleep(2) + # return "Function 2 completed" + + # def sync_function_2(): + # time.sleep(2) + # return "Function 2 completed" + + # async_tool_1 = FunctionTool(async_function_1) + # sync_tool_1 = FunctionTool(sync_function_2) + # async_tool_2 = FunctionTool(async_function_2) + # sync_tool_2 = FunctionTool(sync_function_2) + + # def 
run_sync_and_async_mix_without_wait(): + # # both sync and async tool can use execute + # # sync tool can also use call + # # takes 5 seconds (1+1+2) + overhead + # start_time = time.time() + # results = [ + # async_tool_1.execute(), + # sync_tool_1.execute(), + # sync_tool_2.call(), + # ] + # print(results) + # end_time = time.time() + # print(f"run_sync_and_async_mix_without_wait time: {end_time - start_time}") + # return results + + # async def run_sync_and_async_mix(): + # # both sync and async tool can use execute&to_thread + # # async tool can also use acall without to_thread + # # takes a bit over 2 seconds max(2) + # start_time = time.time() + # results = await asyncio.gather( + # async_tool_1.execute(), + # sync_tool_1.execute(), + # async_tool_2.acall(), + # ) + # print(results) + # end_time = time.time() + # print(f"run_sync_and_async_mix time: {end_time - start_time}") + # return results + + # print(async_tool_1.execute()) + + # run_sync_and_async_mix_without_wait() + # asyncio.run(run_sync_and_async_mix()) + + from adalflow.components.model_client import OpenAIClient + from adalflow.core.generator import Generator + from adalflow.optim.parameter import Parameter + from adalflow.core.types import GeneratorOutput + from adalflow.utils import setup_env, printc + + setup_env() + + llm = Generator( + model_client=OpenAIClient(), + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + # llm.train() + + def llm_as_tool(input: str, id: Optional[str] = None) -> str: + """Used as a calculator tool.""" + printc(f"llm_as_tool: {input}", color="yellow") + + return llm(prompt_kwargs={"input_str": input}, id=id) + + llm_tool = FunctionTool(llm_as_tool, component=llm) + llm_tool.train() + output: Parameter = llm_tool("What is 2+2?") + output.draw_graph() + print(output) + llm_tool.eval() + output: FunctionTool = llm_tool("What is 2+2?") + print(output) + assert isinstance(output, FunctionOutput) + assert isinstance(output.output, GeneratorOutput) + + # grad component + + 
from adalflow.optim.grad_component import fun_to_grad_component + from adalflow.optim.parameter import ParameterType + + @fun_to_grad_component( + desc="Finish", + doc_string=Parameter( + data="Finish the task with verbatim short factoid responses from retrieved context.", + param_type=ParameterType.PROMPT, + requires_opt=True, + role_desc="Instruct how the agent creates the final answer from the step history.", + name="doc_string", + ), + ) + def finish(answer: str, **kwargs) -> str: + # """Finish the task with verbatim short factoid responses from retrieved context.""" + # printc(f"finish: {answer}", color="yellow") + return answer + + finish_tool = FunctionTool(fn=finish, component=finish) + + definition = finish_tool.definition + print(definition) + # call function + finish_tool.train() + output: Parameter = finish_tool( + "Finish the task with verbatim short factoid responses from retrieved context." + ) + print(output) diff --git a/adalflow/adalflow/core/generator.py b/adalflow/adalflow/core/generator.py index baedd8fb..f0a80047 100644 --- a/adalflow/adalflow/core/generator.py +++ b/adalflow/adalflow/core/generator.py @@ -9,6 +9,7 @@ from typing import Any, Dict, Optional, Union, Callable, Tuple, List import logging +from dataclasses import dataclass, field from adalflow.core.types import ( @@ -21,7 +22,12 @@ from adalflow.core.base_data_class import DataClass -from adalflow.optim.parameter import Parameter, GradientContext +from adalflow.optim.parameter import ( + Parameter, + GradientContext, + Gradient, + OutputParameter, +) from adalflow.optim.types import ParameterType from adalflow.core.prompt_builder import Prompt @@ -32,16 +38,20 @@ from adalflow.utils.cache import CachedEngine from adalflow.tracing.callback_manager import CallbackManager from adalflow.utils.global_config import get_adalflow_default_root_path +from adalflow.core.string_parser import JsonParser, Parser + from adalflow.optim.text_grad.backend_engine_prompt import ( 
FEEDBACK_ENGINE_TEMPLATE, LLM_CONVERSATION_TEMPLATE, + ALL_PRED_INFO, + OUTPUT_INSTRUCTION, VARIABLE_AND_PEERS_INFO, - # CONVERSATION_START_INSTRUCTION_BASE, CONVERSATION_START_INSTRUCTION_CHAIN, OBJECTIVE_INSTRUCTION_BASE, OBJECTIVE_INSTRUCTION_CHAIN, ) +from adalflow.utils.logger import printc __all__ = ["Generator", "BackwardEngine", "create_teacher_generator"] @@ -53,6 +63,20 @@ PromptArgType = Dict[str, Union[str, Parameter]] +@dataclass +class BackwardPassSetup(DataClass): + all_pred_at_once: bool = field( + default=False, metadata={"desc": "Backward all predecessors at once."} + ) + threshold_score_to_compute_grad_for_errors: float = field( + default=0.9, + metadata={"desc": "Threshold score to compute gradient for errors."}, + ) + compute_grad_for_errors_only: bool = field( + default=True, metadata={"desc": "Compute gradient for errors only."} + ) + + class Generator(GradComponent, CachedEngine, CallbackManager): __doc__ = """An user-facing orchestration component for LLM prediction. 
@@ -85,6 +109,10 @@ class Generator(GradComponent, CachedEngine, CallbackManager): {} ) # to create teacher generator from student TODO: might reaccess this + backward_pass_setup: BackwardPassSetup = ( + BackwardPassSetup() + ) # default setup for the backward pass + def __init__( self, *, @@ -95,7 +123,7 @@ def __init__( template: Optional[str] = None, prompt_kwargs: Optional[Dict] = {}, # args for the output processing - output_processors: Optional[Component] = None, + output_processors: Optional[Parser] = None, name: Optional[str] = None, # args for the cache cache_path: Optional[str] = None, @@ -142,6 +170,11 @@ def __init__( self.output_processors = output_processors + if output_processors and (not isinstance(output_processors, Parser)): + raise ValueError( + f"output_processors should be a Parser instance, got {type(output_processors)}" + ) + self.set_parameters(prompt_kwargs) # end of trainable parameters @@ -169,6 +202,9 @@ def __init__( {} ) # used by dynamic computation graph and backpropagation + def update_default_backward_pass_setup(self, setup: BackwardPassSetup): + self.backward_pass_setup = setup + def set_cache_path(self, cache_path: str, model_client: object, model: str): """Set the cache path for the generator.""" @@ -244,7 +280,9 @@ def set_parameters(self, prompt_kwargs: PromptArgType): peers = [ p for k, p in prompt_kwargs.items() - if isinstance(p, Parameter) and k != key + if isinstance(p, Parameter) + and k != key + and p.param_type == ParameterType.PROMPT ] p.set_peers(peers) setattr(self, key, p) @@ -301,7 +339,7 @@ def get_prompt(self, **kwargs) -> str: return self.prompt.call(**kwargs) def _extra_repr(self) -> str: - s = f"model_kwargs={self.model_kwargs}, model_type={self.model_type}" + s = f"model_kwargs={self.model_kwargs}, model_type={self.model_type}, prompt={self.prompt}" return s def _post_call(self, completion: Any) -> GeneratorOutput: @@ -338,6 +376,7 @@ def _pre_call(self, prompt_kwargs: Dict, model_kwargs: Dict) -> Dict[str, 
Any]: model_kwargs=composed_model_kwargs, model_type=self.model_type, ) + # printc(f"api_kwargs: {api_kwargs}", color="red") return api_kwargs def _model_client_call(self, api_kwargs: Dict, use_cache: bool = False) -> Any: @@ -454,7 +493,7 @@ def forward( prompt_kwargs[k] = Parameter( data=v, name=f"{self.name}_{k}", - requires_opt=True, + requires_opt=False, param_type=ParameterType.INPUT, data_id=id, ) @@ -506,7 +545,13 @@ def forward( self.model_kwargs, model_kwargs ), } + # printc(f"input_args: {input_args}", color="red") + output = self.call(**input_args, id=id) + if not isinstance(output, GeneratorOutput): + raise ValueError( + f"Output should be of type GeneratorOutput, got {type(output)}" + ) # 2. Generate a Parameter object from the output combined_prompt_kwargs = compose_model_kwargs(self.prompt_kwargs, prompt_kwargs) # if self.data_map_func is None: @@ -517,19 +562,32 @@ def forward( ] log.debug(f"Predecessors: {predecessors} for generator {self.name}") - param_data = ( - output.raw_response - if output and not output.error - else f"Error: {output.error}, raw_response: {output.raw_response}" - ) - response: Parameter = Parameter( + + def data_to_prompt_map_fn(data: Parameter) -> str: + data: GeneratorOutput = data.data + # if data.data is not None: + # return data.data + if data.error is not None: + return f"Response: {data.raw_response} parsed with error: {data.error}" + return f" {data.raw_response}" + + # TODO: all parameter should just wrap the whole output. + # this is for training. 
+ param_data = output + response: Parameter = OutputParameter( data=param_data, name=self.name + "_output", role_desc=f"Output from (llm) {self.name}", param_type=ParameterType.GENERATOR_OUTPUT, + data_id=id, + full_response=output, # the data structure + data_in_prompt=data_to_prompt_map_fn, ) response.set_predecessors(predecessors) - response.trace_forward_pass(input_args=input_args, full_response=output) + response.trace_forward_pass( + input_args=input_args, full_response=output, id=self.id, name=self.name + ) + # setattr(response, "full_response", output) # *** special to the generator *** response.trace_api_kwargs(api_kwargs=self._trace_api_kwargs) # attach the demo to the demo parameter @@ -560,15 +618,13 @@ def forward( log.debug(f"Backward engine: {self.backward_engine}") # attach a funtion to compute gradient for predecessors + response.set_grad_fn( BackwardContext( backward_fn=self.backward, backward_engine=self.backward_engine, response=response, - prompt_kwargs={ - k: v.data if isinstance(v, Parameter) else v - for k, v in prompt_kwargs.items() - }, + prompt_kwargs=prompt_kwargs, template=self.template, prompt_str=self.get_prompt(**combined_prompt_kwargs), id=id, @@ -576,7 +632,6 @@ def forward( ) return response - # == pytorch custom autograd function == def backward( self, response: Parameter, # the output of the forward pass @@ -589,6 +644,14 @@ def backward( log.info(f"Generator: Backward: {response.name}") + backward_pass_setup = ( + backward_engine.backward_pass_setup if backward_engine else None + ) + printc( + f"backward pass setup: {backward_pass_setup}, name: {self.name}", + color="red", + ) + children_params = response.predecessors is_intermediate_node = True if response.get_gradient_and_context_text().strip() == "": @@ -597,41 +660,232 @@ def backward( # backward score to the demo parameter for pred in children_params: # if pred.requires_opt: - pred.set_score(response._score) + if response.score is not None: + pred.set_score(response.score) 
log.debug( - f"backpropagate the score {response._score} to {pred.name}, is_teacher: {self.teacher_mode}" + f"backpropagate the score {response.score} to {pred.name}, is_teacher: {self.teacher_mode}" ) if pred.param_type == ParameterType.DEMOS: # Accumulate the score to the demo pred.add_score_to_trace( - trace_id=id, score=response._score, is_teacher=self.teacher_mode + trace_id=id, score=response.score, is_teacher=self.teacher_mode ) log.debug(f"Pred: {pred.name}, traces: {pred._traces}") # 1.backward for text-gradients if backward_engine: + log.debug( f"Generator: Backward engine is set for the generator. {backward_engine}" ) - for pred in children_params: - if not pred.requires_opt or pred.param_type == ParameterType.DEMOS: - log.debug( - f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." + if response.backward_engine_disabled: + for pred in children_params: + pred.backward_engine_disabled = True + return + + all_pred_at_once = backward_pass_setup.all_pred_at_once + + if not all_pred_at_once: + for pred in children_params: + if not pred.requires_opt or pred.param_type == ParameterType.DEMOS: + log.debug( + f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." + ) + continue + + self._backward_through_one_predecessor( + pred=pred, + response=response, + prompt_kwargs=prompt_kwargs, + template=template, + backward_engine=backward_engine, + prompt_str=prompt_str, + backward_pass_setup=backward_pass_setup, + is_intermediate_node=is_intermediate_node, + ) + else: + backward = False + for pred in children_params: + if pred.requires_opt and pred.param_type in [ + ParameterType.PROMPT, + ParameterType.GENERATOR_OUTPUT, + ParameterType.RETRIEVER_OUTPUT, + ParameterType.OUTPUT, + ]: + backward = True + break + if backward: + # 2nd approach, backward all that need opt at once. 
+ self._backward_through_all_predecessors( + children_params=children_params, + response=response, + prompt_kwargs=prompt_kwargs, + template=template, + backward_engine=backward_engine, + prompt_str=prompt_str, + backward_pass_setup=backward_pass_setup, + is_intermediate_node=is_intermediate_node, ) - continue - - self._backward_through_one_predecessor( - pred=pred, - response=response, - prompt_kwargs=prompt_kwargs, - template=template, - backward_engine=backward_engine, - prompt_str=prompt_str, - is_intermediate_node=is_intermediate_node, - ) else: log.debug("Backward engine is not set for the generator. No text gradient.") + @staticmethod + def _backward_through_all_predecessors( + children_params: List[Parameter], + response: Parameter, + prompt_kwargs: Dict[str, str], + template: str, + backward_engine: "BackwardEngine", + prompt_str: str, + backward_pass_setup: BackwardPassSetup, + is_intermediate_node: bool = False, + ): + parser = JsonParser() + # instruction and objective is the same for all the children + instruction_str, objective_str = None, None + + # 1. 
Generate the conversation input and output + input_prompt_kwargs = { + k: v.get_prompt_data() if isinstance(v, Parameter) else v + for k, v in prompt_kwargs.items() + } + + print(f"gt: {response.get_gt()}") + + conversation_prompt_kwargs = { + "input_value": input_prompt_kwargs, + "llm_output": response.get_prompt_data(), + # "gt": response.get_gt(), + } + + conversation_str = Prompt( + prompt_kwargs=conversation_prompt_kwargs, + template=LLM_CONVERSATION_TEMPLATE, + )() + + all_pred_info = Prompt( + prompt_kwargs={"variables": [p.get_param_info() for p in children_params]}, + template=ALL_PRED_INFO, + )() + + printc(f"all_pred_info: {all_pred_info}") + + conv_ins_template = None # CONVERSATION_START_INSTRUCTION_BASE + obj_ins_template = OBJECTIVE_INSTRUCTION_BASE + if is_intermediate_node: # TODO: this will always be true + conv_ins_template = CONVERSATION_START_INSTRUCTION_CHAIN + obj_ins_template = OBJECTIVE_INSTRUCTION_CHAIN + response_gradient = response.get_gradients_str() + # response_gradient = response.get_gradients_component_schema() + # response_gradient = response.get_gradients_component_schema( + # skip_correct_sample=False + # ) + if not response_gradient: + raise ValueError( + f"Generator: No gradient found for {response}. Please check the response." 
+ ) + + # replace variable and peers with all_pred_info + + instruction_str = Prompt( + template=conv_ins_template, + prompt_kwargs={ + "variable_and_peers_info": all_pred_info, + "conversation_str": conversation_str, + }, + )() + objective_str = Prompt( + template=obj_ins_template, + prompt_kwargs={ + "response_desc": response.role_desc, + "response_gradient": response_gradient, + "instruction_to_backward_engine": response.instruction_to_backward_engine, + }, + )() + + backward_engine_prompt_kwargs = { + "conversation_sec": instruction_str, + "objective_instruction_sec": objective_str, + "output_format_str": OUTPUT_INSTRUCTION, + } + + backward_engine_prompt_str = backward_engine.get_prompt( + **backward_engine_prompt_kwargs + ) + # print(f"Backward engine prompt: {backward_engine_prompt_str}") + + gradient_output: GeneratorOutput = None + response_gradient_list = [""] * len(children_params) + if ( + backward_pass_setup.compute_grad_for_errors_only + and response.score is not None + and float(response.score) + > backward_pass_setup.threshold_score_to_compute_grad_for_errors + ): + manual_response_1 = f"Eval score: {response.score}. No noticeable error." + response_gradient_list = [manual_response_1] * len(children_params) + raw_response = str(response_gradient_list) + gradient_output = GeneratorOutput( + data=response_gradient_list, raw_response=raw_response + ) + else: + + gradient_output: GeneratorOutput = backward_engine( + prompt_kwargs=backward_engine_prompt_kwargs + ) + if not isinstance(gradient_output, GeneratorOutput): + raise ValueError( + f"Generator: Backward Engine should return a GeneratorOutput. Got {gradient_output} instead." 
+ ) + + # parse the list of gradients + + try: + response_gradient_list = parser.call(gradient_output.data) + except Exception as e: + log.error(f"Error parsing the response_gradient_list: {e}") + failure_message = backward_engine.failure_message_to_optimizer( + gradient_output + ) + if failure_message: + response_gradient_list = [failure_message] * len(children_params) + printc(f"failure_message: {failure_message}", color="red") + + print(f"gradient list: {response_gradient_list}") + + # generate the gradient for each child + for i, pred in enumerate(children_params): + if not pred.requires_opt or pred.param_type == ParameterType.DEMOS: + log.debug( + f"Generator: Skipping {pred} as it does not require optimization." + ) + continue + + gradient_data = ( + response_gradient_list[i] + if response_gradient_list and len(response_gradient_list) > i + else "Failed to get the gradient." + ) + + var_gradient = Gradient( + data=gradient_data, + data_id=response.data_id, + score=response.score, # add score to gradient + from_response=response, + to_pred=pred, + ) + var_gradient.add_context( + GradientContext( + input_output=conversation_str, + response_desc=response.role_desc, + variable_desc=pred.role_desc, # parameter_desc + ) + ) + var_gradient.add_prompt(backward_engine_prompt_str) + pred.add_gradient(var_gradient) + if response.score is not None: + pred.set_score(response.score) + @staticmethod def _backward_through_one_predecessor( pred: Parameter, @@ -640,6 +894,7 @@ def _backward_through_one_predecessor( template: str, backward_engine: "BackwardEngine", prompt_str: str, + backward_pass_setup: BackwardPassSetup, is_intermediate_node: bool = False, ): """Creating gradient/textual feedback for prompt type parameters.""" @@ -648,9 +903,6 @@ def _backward_through_one_predecessor( f"Generator: Skipping {pred} as it does not require optimization." 
) return - log.debug( - f"Generator: Backward through {pred}, is_intermediate_node: {is_intermediate_node}" - ) if pred.check_if_already_computed_gradient_respect_to(response.id): log.debug( @@ -669,10 +921,9 @@ def _backward_through_one_predecessor( } conversation_prompt_kwargs = { - # "variable_name": pred.name, - # "variable_desc": pred.role_desc, "input_value": input_prompt_kwargs, - "llm_output": response.data, + "llm_output": response.get_prompt_data(), + "gt": response.get_gt(), } conversation_str = Prompt( @@ -682,8 +933,11 @@ def _backward_through_one_predecessor( variable_dict = pred.get_param_info() + peers = [p.get_param_info() for p in pred.peers] + # peers = [] + variable_and_peers_info = Prompt( - prompt_kwargs={"variable": variable_dict, "peers": pred.peers}, + prompt_kwargs={"variable": variable_dict, "peers": peers}, template=VARIABLE_AND_PEERS_INFO, )() @@ -692,12 +946,23 @@ def _backward_through_one_predecessor( if is_intermediate_node: # TODO: this will always be true conv_ins_template = CONVERSATION_START_INSTRUCTION_CHAIN obj_ins_template = OBJECTIVE_INSTRUCTION_CHAIN - + response_gradient = response.get_gradients_str() + # response_gradient = response.get_gradients_component_schema() + if not response_gradient: + raise ValueError( + f"Generator: No gradient found for {response}. Please check the response. 
pred: {pred}" + ) + predecessors = [ + pred.get_param_info() + for pred in response.predecessors + if pred not in pred.peers + ] instruction_str = Prompt( template=conv_ins_template, prompt_kwargs={ "variable_and_peers_info": variable_and_peers_info, "conversation_str": conversation_str, + "predecessors": predecessors, }, )() log.info(f"Conversation start instruction base str: {instruction_str}") @@ -705,9 +970,7 @@ def _backward_through_one_predecessor( template=obj_ins_template, prompt_kwargs={ "response_desc": response.role_desc, - "response_gradient": response.get_gradient_and_context_text( - skip_correct_sample=True - ), + "response_gradient": response_gradient, "instruction_to_backward_engine": pred.instruction_to_backward_engine, }, )() @@ -716,23 +979,34 @@ def _backward_through_one_predecessor( "conversation_sec": instruction_str, "objective_instruction_sec": objective_str, } + backward_engine_prompt_str = backward_engine.get_prompt( + **backward_engine_prompt_kwargs + ) + # print(f"Backward engine prompt: {backward_engine_prompt_str}") gradient_output: GeneratorOutput = None - if response._score is not None and float(response._score) > 0.9: + if ( + backward_pass_setup.compute_grad_for_errors_only + and response.score is not None + and float(response.score) + > backward_pass_setup.threshold_score_to_compute_grad_for_errors + ): log.debug(f"EvalFnToTextLoss: Skipping {pred} as the score is high enough.") # TODO: plus score descriptions - manual_response = f"You get score: {response._score}." + manual_response = f"Eval score: {response.score}. No noticeable error." gradient_output = GeneratorOutput( data=manual_response, raw_response=manual_response ) else: - # manual_response = f"You get score: {response._score}." 
- # gradient_output = GeneratorOutput( - # data=manual_response, raw_response=manual_response - # ) gradient_output: GeneratorOutput = backward_engine( prompt_kwargs=backward_engine_prompt_kwargs ) + if not isinstance(gradient_output, GeneratorOutput): + raise ValueError( + f"Generator: Backward Engine should return a GeneratorOutput. Got {gradient_output} instead." + ) + printc(f"Backward engine gradient: {gradient_output}") + # USE this to trace each node's input and output, all nodes can be visualized log.info( f"Generator Backward Engine Prompt: {backward_engine.get_prompt( **backward_engine_prompt_kwargs)}" @@ -741,29 +1015,25 @@ def _backward_through_one_predecessor( gradient_output.data or backward_engine.failure_message_to_optimizer(gradient_output) ) - log.info( - f"Generator Gradient value: {gradient_value}, raw response: {gradient_output.raw_response}" - ) # TODO: make it a debug feature - # prompt_str = backward_engine.get_prompt(**backward_engine_prompt_kwargs) - var_gradient = Parameter( - name=f"{response.name}_to_{pred.name}_grad", - # gradient_prompt=prompt_str, # trace the prompt + var_gradient = Gradient( data=gradient_value, - requires_opt=True, - role_desc=f"feedback for {pred.name}", - score=response._score, # add score to gradient - param_type=ParameterType.GRADIENT, - from_response_id=response.id, + data_id=response.data_id, + score=response.score, # add score to gradient + from_response=response, + to_pred=pred, ) - pred.add_gradient(var_gradient) - pred.set_score(response._score) - - pred.gradients_context[var_gradient] = GradientContext( - context=conversation_str, - response_desc=response.role_desc, - variable_desc=pred.role_desc, # parameter_desc + var_gradient.add_context( + GradientContext( + input_output=conversation_str, + response_desc=response.role_desc, + variable_desc=pred.role_desc, # parameter_desc + ) ) + var_gradient.add_prompt(backward_engine_prompt_str) + pred.add_gradient(var_gradient) + if response.score is not None: + 
pred.set_score(response.score) def _run_callbacks( self, @@ -798,7 +1068,7 @@ def _run_callbacks( def call( self, - prompt_kwargs: Optional[Dict] = {}, # the input need to be passed to the prompt + prompt_kwargs: Optional[Dict] = {}, # supports both str and parameter value model_kwargs: Optional[Dict] = {}, use_cache: Optional[bool] = None, id: Optional[str] = None, @@ -917,6 +1187,7 @@ def _extra_repr(self) -> str: ] s += f"trainable_prompt_kwargs={prompt_kwargs_repr}" + s += f", prompt={self.prompt}" return s def to_dict(self) -> Dict[str, Any]: @@ -942,7 +1213,11 @@ class BackwardEngine(Generator): # it is a generator with defaule template __doc__ = """The backward engine is a Generator with a default template for the backward pass. - If you want to customize the template, you can create your own backward engine""" + If you want to customize the template, you can create your own backward engine. + + Yet, we will forever keep the training mode to False for the backward engine. + This is achieved by making forward the same as call. 
+ """ def __init__(self, **kwargs): if kwargs is None: @@ -960,6 +1235,10 @@ def call(self, **kwargs) -> GeneratorOutputType: raise ValueError(f"Error in the backward engine: {output.error}") return output + def forward(self, **kwargs): + r"""Forward pass for the backward engine.""" + return self.call(**kwargs) + @staticmethod def failure_message_to_optimizer( gradient_response: GeneratorOutput, diff --git a/adalflow/adalflow/core/prompt_builder.py b/adalflow/adalflow/core/prompt_builder.py index 0d998b63..a0af7d3e 100644 --- a/adalflow/adalflow/core/prompt_builder.py +++ b/adalflow/adalflow/core/prompt_builder.py @@ -7,9 +7,10 @@ from jinja2 import Template, Environment, StrictUndefined, meta -from adalflow.core.component import Component from adalflow.core.default_prompt_template import DEFAULT_ADALFLOW_SYSTEM_PROMPT from adalflow.optim.parameter import Parameter +from dataclasses import dataclass +from adalflow.core.base_data_class import DataClass logger = logging.getLogger(__name__) @@ -17,7 +18,8 @@ T = TypeVar("T") -class Prompt(Component): +@dataclass +class Prompt(DataClass): __doc__ = r"""Renders a text string(prompt) from a Jinja2 template string. In default, we use the :ref:`DEFAULT_ADALFLOW_SYSTEM_PROMPT` as the template. @@ -125,6 +127,9 @@ def print_prompt(self, **kwargs) -> str: except Exception as e: raise ValueError(f"Error rendering Jinja2 template: {e}") + def __call__(self, *args: Any, **kwds: Any) -> Any: + return self.call(*args, **kwds) + def call(self, **kwargs) -> str: """ Renders the prompt template with keyword arguments. Allow None values. 
@@ -147,6 +152,15 @@ def _extra_repr(self) -> str: s += f", prompt_variables: {self.prompt_variables}" return s + def __repr__(self) -> str: + s = f"template: {self.template}" + prompt_kwargs_str = _convert_prompt_kwargs_to_str(self.prompt_kwargs) + if prompt_kwargs_str: + s += f", prompt_kwargs: {prompt_kwargs_str}" + if self.prompt_variables: + s += f", prompt_variables: {self.prompt_variables}" + return s + @classmethod def from_dict(cls: type[T], data: Dict[str, Any]) -> T: obj = super().from_dict(data) @@ -173,6 +187,16 @@ def _convert_prompt_kwargs_to_str(prompt_kwargs: Dict) -> Dict[str, str]: if isinstance(p, Parameter): prompt_kwargs_str[key] = p.data + elif isinstance(p, list): + prompt_kwargs_str[key] = [ + ( + p_elem.data_in_prompt(p_elem) + if isinstance(p_elem, Parameter) + else p_elem + ) + for p_elem in p + ] + else: prompt_kwargs_str[key] = p return prompt_kwargs_str diff --git a/adalflow/adalflow/core/retriever.py b/adalflow/adalflow/core/retriever.py index fb65a298..3778fdd8 100644 --- a/adalflow/adalflow/core/retriever.py +++ b/adalflow/adalflow/core/retriever.py @@ -13,8 +13,8 @@ from adalflow.optim.grad_component import GradComponent if TYPE_CHECKING: - from adalflow.core.generator import Generator -from adalflow.optim.parameter import Parameter + pass +from adalflow.optim.parameter import Parameter, OutputParameter from adalflow.optim.types import ParameterType log = logging.getLogger(__name__) @@ -123,41 +123,57 @@ def forward( top_k = Parameter( data=top_k or self.top_k, name="top_k", - requires_opt=True, + requires_opt=False, param_type=ParameterType.HYPERPARAM, ) if input is None: raise ValueError("Input cannot be empty") - response = super().forward(input, top_k=top_k, **kwargs) + response: OutputParameter = super().forward(input, top_k=top_k, id=id, **kwargs) + if not isinstance(response, OutputParameter): + raise ValueError( + f"Retriever forward: Expect OutputParameter, but got {type(response)}" + ) + response.trace_forward_pass( + 
input_args={"input": input, "top_k": top_k}, + full_response=response.data, + id=self.id, + name=self.name, + ) response.param_type = ( ParameterType.RETRIEVER_OUTPUT ) # be more specific about the type return response - def backward( - self, - response: Parameter, - id: Optional[str] = None, - backward_engine: Optional["Generator"] = None, - ): - r"""Backward the response to pass the score to predecessors. - Function as a relay component""" - log.info(f"Retriever backward: {response.name}") - children_params = response.predecessors - - # is_chain = True - if response.get_gradient_and_context_text().strip() == "": - log.info(f"Generator: Backward: No gradient found for {response}.") - - for pred in children_params: - pred.set_score(response._score) - from adalflow.utils.logger import printc - - printc( - f"Retriever: Backward: {pred.name} set_score: {response._score}, {response.name}", - "blue", - ) - if pred.param_type == ParameterType.DEMOS: - pred.add_score_to_trace( - trace_id=id, score=response._score, is_teacher=self.teacher_mode - ) + # def backward( + # self, + # response: Parameter, + # id: Optional[str] = None, + # backward_engine: Optional["Generator"] = None, + # ): + # r"""Backward the response to pass the score to predecessors. 
+ # Function as a relay component""" + # log.info(f"Retriever backward: {response.name}") + # children_params = response.predecessors + + # # is_chain = True + # if response.get_gradient_and_context_text().strip() == "": + # log.info(f"Generator: Backward: No gradient found for {response}.") + + # for pred in children_params: + # pred.set_score(response._score) + # from adalflow.utils.logger import printc + + # printc( + # f"Retriever: Backward: {pred.name} set_score: {response._score}, {response.name}", + # "blue", + # ) + # if pred.param_type == ParameterType.DEMOS: + # pred.add_score_to_trace( + # trace_id=id, score=response._score, is_teacher=self.teacher_mode + # ) + + # # pass the gradients + # for grad in response.gradients: + # # make a copy of the gradient + # grad = deepcopy(grad) + # pred.add_gradient(grad) diff --git a/adalflow/adalflow/core/string_parser.py b/adalflow/adalflow/core/string_parser.py index 3001b512..b18f619d 100644 --- a/adalflow/adalflow/core/string_parser.py +++ b/adalflow/adalflow/core/string_parser.py @@ -2,10 +2,11 @@ From simple data types like boolean, integer, and float to more complex data types like JSON, YAML, and list strings.""" -from typing import Dict, List, Union +from typing import Dict, List, Union, Optional, Callable import logging +from adalflow.utils.registry import EntityMapping + -from adalflow.core.component import Component import adalflow.core.functional as F log = logging.getLogger(__name__) @@ -13,12 +14,15 @@ BOOLEAN_PARSER_OUTPUT_TYPE = bool -class Parser(Component): +class Parser: __doc__ = r"""Base class for all string parsers.""" def __init__(self): super().__init__() + def __call__(self, input: str) -> object: + return self.call(input) + def call(self, input: str) -> object: raise NotImplementedError( "Parser subclasses must implement the __call__ method" @@ -246,3 +250,72 @@ def call(self, input: str) -> YAML_PARSER_OUTPUT_TYPE: return yaml_obj except Exception as e: raise ValueError(f"Error: {e}") + + 
+class FuncParser(Parser): + r"""Component that wraps a function. + + Args: + fun (Callable): The function to be wrapped. + + Examples: + + function = lambda x: x + 1 + fun_component = FunComponent(function) + print(fun_component(1)) # 2 + """ + + def __init__(self, fun: Optional[Callable] = None, afun: Optional[Callable] = None): + super().__init__() + self.fun_name = fun.__name__ + EntityMapping.register(self.fun_name, fun) + + def call(self, *args, **kwargs): + fun = EntityMapping.get(self.fun_name) + return fun(*args, **kwargs) + + def __repr__(self) -> str: + return super().__repr__() + f"fun_name={self.fun_name}" + + +def func_to_parser(fun) -> FuncParser: + r"""Helper function to convert a function into a Parser class. + its own class name. + + Can be used as both a decorator and a function. + + Args: + fun (Callable): The function to be wrapped. + Returns: + FuncParser: The component that wraps the function. + + Examples: + 1. As a decorator: + >>> @func_to_parser + >>> def my_function(x): + >>> return x + 1 + >>> # is equivalent to + >>> class MyFunctionParser(FuncParser): + >>> def __init__(self): + >>> super().__init__(my_function) + + 2. 
As a function: + >>> my_function_parser = func_to_parser(my_function) + """ + + # Split the function name by underscores, capitalize each part, and join them back together + class_name = ( + "".join(part.capitalize() for part in fun.__name__.split("_")) + "Component" + ) + # register the function + EntityMapping.register(fun.__name__, fun) + # Define a new component class dynamically + parser_class = type( + class_name, + (FuncParser,), + {"__init__": lambda self: FuncParser.__init__(self, fun)}, + ) + # register the component + EntityMapping.register(class_name, parser_class) + + return parser_class() diff --git a/adalflow/adalflow/core/tool_manager.py b/adalflow/adalflow/core/tool_manager.py index 3538762a..fcb536a4 100644 --- a/adalflow/adalflow/core/tool_manager.py +++ b/adalflow/adalflow/core/tool_manager.py @@ -2,13 +2,27 @@ The ToolManager manages a list of tools, context, and all ways to execute functions. """ -from typing import List, Dict, Optional, Any, Callable, Awaitable, Union +from typing import ( + List, + Dict, + Optional, + Any, + Callable, + Awaitable, + Union, + overload, + Literal, +) import logging from copy import deepcopy import asyncio +from adalflow.optim.parameter import Parameter, ParameterType import nest_asyncio +import warnings -from adalflow.core import Component +from adalflow.core.container import ComponentList +from adalflow.optim.grad_component import GradComponent2 +from adalflow.core.component import Component from adalflow.core.func_tool import FunctionTool from adalflow.core.types import ( FunctionDefinition, @@ -16,6 +30,8 @@ Function, FunctionExpression, ) +from adalflow.utils import printc + from adalflow.core.functional import ( parse_function_call_expr, @@ -42,15 +58,91 @@ def run_async_in_new_loop(coro): asyncio.set_event_loop(None) +class CallFunctionTool(Component): + __doc__ = """Contains other unit gradcomponent such as calling + a FunctionTool""" + + def __init__(self): + super().__init__() + + def forward(self, 
func: Parameter, context: Dict[str, object]): + return self.bicall(func, context=context) + + def call(self, func: Function, context: Dict[str, object]) -> FunctionOutput: + return self.bicall(func, context=context) + + def bicall( + self, + func: Union[Function, Parameter], + context: Dict[str, object] = {}, + ): + if isinstance(func, Parameter): + # printc(f"context: {context}", color="yellow") + func_data: Function = func.map_to_successor(self) + if not isinstance(func_data, Function): + raise ValueError(f"Error parsing function expression: {func}") + tool: FunctionTool = context[func_data.name] + # print(f"tool training: {tool.training}") + output = tool.forward(*func_data.args, **func_data.kwargs) + + from adalflow.optim.grad_component import fun_to_grad_component + + # this will automatically create the outputparam, and connect output, func to the outputParam + @fun_to_grad_component() + def pass_through_output(output, func): + return output + + # NOTE: special case: handle the function which is not a grad_component + # here we have to specifically converts it to a parameter and handles the predecessors + # there is no trainable parameters inside of the tool but the tool response itself can be optimized by response optimizer + if not isinstance(output, Parameter): + # printc(f"output 1: {output}, func: {func}", color="yellow") + return pass_through_output.forward(output, func) + else: + # reconnect the predecessor for tracing as it is not done in tool.forward + output.predecessors.add(func) + return output + else: + tool: FunctionTool = context[func.name] + output = tool.call(*func.args, **func.kwargs) + return output + + +class FunctionExperssionToFunction(GradComponent2): + def __init__(self): + super().__init__(desc="Convert FunctionExpression to Function") + + def call(self, expr: FunctionExpression, context: Dict[str, object]) -> Function: + + assert isinstance( + expr, FunctionExpression + ), f"Expected FunctionExpression, got {type(expr)}" + + expr_str 
= expr.action + func_name, args, kwargs = parse_function_call_expr(expr_str, context) + # printc( + # f"func_name: {func_name}, args: {args}, kwargs: {kwargs}", color="yellow" + # ) + output = Function( + name=func_name, + args=args, + kwargs=kwargs, + thought=expr.thought, + ) + # printc(f"output: {output}", color="yellow") + return output + + # TODO: good to track all the failed function calls +# Tool manager is a task component class ToolManager(Component): __doc__ = r""""Manage a list of tools, context, and all ways to execute functions. - yaml and json definitions are for quick access to the definitions of the tools. - If you need more specification, such as using exclude field, you can use the function_definitions. - Args: + ToolManager is a task component that does not need its own backward function. + yaml and json definitions are for quick access to the definitions of the tools. + If you need more specification, such as using exclude field, you can use the function_definitions. 
""" def __init__( @@ -62,56 +154,221 @@ def __init__( ): super().__init__() nest_asyncio.apply() # Apply nest_asyncio to handle nested loops - # super(LocalDB, self).__init__() - self.tools = [ + processed_tools = [ ( FunctionTool(fn=deepcopy(tool)) if not isinstance(tool, FunctionTool) - else deepcopy(tool) + else tool ) for tool in tools ] - self._context_map = {tool.definition.func_name: tool for tool in self.tools} + # printc( + # f"processed tools: {processed_tools}, len: {len(processed_tools)} and len tools: {len(tools)}", + # color="yellow", + # ) + self.tools = ComponentList(processed_tools) + self._context_map = self.create_context_map_from_tools(self.tools) self._additional_context = additional_context or {} self.context = {**self._context_map, **self._additional_context} log.info( f"Initialized ToolManager with {len(self.tools)} tools and additional context {self._additional_context}" ) + @staticmethod + def get_context_index(tool: FunctionTool) -> Dict[str, object]: + index = tool.definition.func_name + if tool.definition.class_instance: + index = f"{tool.definition.class_instance}.{index}" + output = {index: tool} + if tool.definition.func_name == "__call__": + # add another index of directly using the classinstance + output[f"{tool.definition.class_instance}"] = tool + return output + + @staticmethod + def create_context_map_from_tools(tools: List[FunctionTool]) -> Dict[str, object]: + output: Dict[str, object] = {} + for tool in tools: + tool_map = ToolManager.get_context_index(tool) + for k, v in tool_map.items(): + if k in output: + # raise ValueError(f"Duplicate key {k} in the context map.") + warnings.warn(f"Duplicate key {k} in the context map.") + continue + output[k] = v + return output + @property def yaml_definitions(self) -> List[str]: - return [tool.definition.to_yaml() for tool in self.tools] + output = [] + for tool in self.tools: + if not tool.definition.class_instance: + 
output.append(tool.definition.to_yaml(exclude=["class_instance"])) + else: + output.append(tool.definition.to_yaml()) + return output @property def json_definitions(self) -> List[str]: - return [tool.definition.to_json() for tool in self.tools] + output = [] + for tool in self.tools: + if not tool.definition.class_instance: + output.append(tool.definition.to_json(exclude=["class_instance"])) + else: + output.append(tool.definition.to_json()) + output.append(tool.definition.to_json(exclude=["class_instance"])) + return output @property def function_definitions(self) -> List[FunctionDefinition]: return [tool.definition for tool in self.tools] - def parse_func_expr(self, expr: FunctionExpression) -> Function: + def parse_func_expr( + self, + expr: Union[FunctionExpression, Parameter], + map_fn: Callable = lambda x: x.data, + ) -> Union[Function, Parameter]: r"""Parse the function call expression.""" - try: - expr_str = expr.action - func_name, args, kwargs = parse_function_call_expr(expr_str, self.context) - return Function(name=func_name, args=args, kwargs=kwargs) - except Exception as e: - log.error(f"Error {e} parsing function call expression: {expr_str}") - raise ValueError(f"Error {e} parsing function call expression: {expr_str}") - def execute_func(self, func: Function) -> FunctionOutput: - r"""Execute the function. 
If the function is async, use asyncio.run to execute it.""" - try: - tool: FunctionTool = self.context[func.name] - if tool.is_async: - log.debug("Running async function in new loop") - return run_async_in_new_loop(tool.acall(*func.args, **func.kwargs)) + if isinstance(expr, Parameter): + # try: + + func = FunctionExperssionToFunction() + expr.add_successor_map_fn(func, map_fn=map_fn) + # print("FunctionExperssionToFunction") + output = func.forward(expr, context=self.context) + # print(f"output data: {output.data}") + return output + + # except Exception as e: + # error_msg = ( + # f"Error {e} parsing function call expression: {map_fn(expr)}" + # ) + # return error_msg + else: + try: + expr_str = expr.action + func_name, args, kwargs = parse_function_call_expr( + expr_str, self.context + ) + return Function(name=func_name, args=args, kwargs=kwargs) + except Exception as e: + log.error(f"Error {e} parsing function call expression: {expr}") + raise ValueError(f"Error {e} parsing function call expression: {expr}") + + @overload + def call( + self, *, expr_or_fun: FunctionExpression, step: Literal["parse"] = "parse" + ) -> Function: ... + + @overload + def call( + self, *, expr_or_fun: FunctionExpression, step: Literal["execute"] = "execute" + ) -> FunctionOutput: ... + + @overload + def call( + self, *, expr_or_fun: Function, step: Literal["execute"] = "parse" + ) -> Function: ... + + @overload + def call( + self, *, expr_or_fun: Function, step: Literal["execute"] = "execute" + ) -> FunctionOutput: ... + + def call( + self, + *, + expr_or_fun: Union[FunctionExpression, Function], + step: Literal["execute"] = "execute", + ) -> Union[FunctionOutput, Function, Parameter]: + if not isinstance(expr_or_fun, (Function, FunctionExpression)): + raise ValueError( + f"expr_or_fun should be either a Function or FunctionExpression. 
Got {expr_or_fun}" + ) + if step == "parse": + if isinstance(expr_or_fun, Function): + return expr_or_fun + return self.parse_func_expr(expr_or_fun) + elif step == "execute": + if isinstance(expr_or_fun, Function): + return self.execute_func(expr_or_fun) + return self.execute_func_expr(expr_or_fun) + else: + raise ValueError(f"step should be either 'parse' or 'execute'. Got {step}") + + def forward( + self, + *, + expr_or_fun: Union[FunctionExpression, Function, Parameter], + step: Literal["parse", "execute"] = "execute", + map_fn: Callable = lambda x: x.data, # how to map the parameter to the needed data + ) -> Union[FunctionOutput, Function, Parameter]: + "Run a forward pass on the tool manager such as parsing function expression or executing function." + if isinstance(expr_or_fun, Parameter): + expr_or_fun_data = map_fn(expr_or_fun) + if step == "execute": + if isinstance(expr_or_fun_data, Function): + return self.execute_func(expr_or_fun, map_fn=map_fn) + else: + raise NotImplementedError( + "Only Function expressions are supported for now." + ) else: - return tool.call(*func.args, **func.kwargs) - except Exception as e: - log.error(f"Error {e} executing function: {func}") - raise ValueError(f"Error {e} executing function: {func}") + if isinstance(expr_or_fun_data, FunctionExpression): + output = self.parse_func_expr(expr_or_fun, map_fn=map_fn) + return output + else: + raise NotImplementedError( + f"Only function call expressions are supported for now. Got {expr_or_fun_data}" + ) + else: + raise ValueError(f"expr_or_fun should be a Parameter. Got {expr_or_fun}") + # return self.call(expr_or_fun=expr_or_fun, step=step) + + def execute_func( + self, func: Union[Function, Parameter], map_fn: Callable = lambda x: x.data + ) -> Union[FunctionOutput, Parameter]: + r"""Execute the function. 
If the function is async, use asyncio.run to execute it.""" + + if isinstance(func, Parameter): + try: + + call_func_tool = CallFunctionTool() + func.add_successor_map_fn(call_func_tool, map_fn=map_fn) + return call_func_tool.forward(func, context=self.context) + + except Exception as e: + log.error(f"Error {e} executing function: {func.data}") + error_msg = f"Error {e} executing function: {func.data}" + return error_msg + + else: + try: + tool: FunctionTool = self.context[func.name] + # printc(f"call tool: {tool}", color="yellow") + if tool.is_async: + return run_async_in_new_loop(tool.acall(*func.args, **func.kwargs)) + + else: + output = tool.call(*func.args, **func.kwargs) + # printc(f"output: {output}", color="yellow") + return output + except Exception as e: + log.error(f"Error {e} executing function: {func}") + raise ValueError(f"Error {e} executing function: {func}") + + # try: + # tool: FunctionTool = self.context[func.name] + # if tool.is_async: + # log.debug("Running async function in new loop") + # return run_async_in_new_loop(tool.acall(*func.args, **func.kwargs)) + # else: + # # TODO ensure it is set to traing mode + # return tool.forward(*func.args, **func.kwargs) + # except Exception as e: + # log.error(f"Error {e} executing function: {func}") + # raise ValueError(f"Error {e} executing function: {func}") async def execute_func_async(self, func: Function) -> FunctionOutput: r"""Execute the function. If the function is sync, use await to execute it.""" @@ -125,16 +382,40 @@ async def execute_func_async(self, func: Function) -> FunctionOutput: log.error(f"Error {e} executing function: {func}") raise ValueError(f"Error {e} executing function: {func}") - def execute_func_expr(self, expr: FunctionExpression) -> FunctionOutput: + def execute_func_expr( + self, + expr: Union[FunctionExpression, Parameter], + map_fn: Callable = lambda x: x.data, + ) -> Union[FunctionOutput, Parameter]: r"""Execute the function expression. 
Support both sync and async functions.""" - func: Function = self.parse_func_expr(expr) - try: - return self.execute_func(func) - except Exception as e: - # NOTE: if the function expression is not a function call, try to execute it as a function expression - log.error(f"Error {e} executing function expression: {expr}") - raise ValueError(f"Error {e} executing function expression: {expr}") + if isinstance(expr, Parameter): + + func: Parameter = self.parse_func_expr(expr, map_fn=map_fn) + if not isinstance(func, Parameter): + raise ValueError(f"Error parsing function expression: {expr}") + + # execute the function + output: Parameter = self.execute_func(func) + if not isinstance(output, Parameter): + raise ValueError(f"Error executing function expression: {expr}") + output.predecessors.add(expr) + return output + else: + + try: + func: Function = self.parse_func_expr(expr) + if not isinstance(func, Function): + raise ValueError(f"Error parsing function expression: {expr}") + + return self.execute_func(func) + except Exception as e: + # NOTE: if the function expression is not a function call, try to execute it as a function expression + log.error(f"Error {e} executing function expression: {expr}") + # raise ValueError(f"Error {e} executing function expression: {expr}") + return FunctionOutput( + name=expr.action, input=expr, output=None, error=None + ) async def execute_func_expr_async(self, expr: FunctionExpression) -> FunctionOutput: r"""Execute the function expression. 
Support both sync and async functions.""" @@ -186,3 +467,54 @@ def execute_func_expr_via_eval(self, expr: FunctionExpression) -> FunctionOutput def _extra_repr(self) -> str: s = f"Tools: {self.tools}, Additional Context: {self._additional_context}" return s + + +if __name__ == "__main__": + # test tool manager + from adalflow.core.func_tool import FunctionTool + from adalflow.components.model_client import OpenAIClient + from adalflow.core.generator import Generator + from adalflow.optim.parameter import Parameter + from adalflow.utils import setup_env, printc + + setup_env() + + llm = Generator( + model_client=OpenAIClient(), + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + # llm.train() + + def llm_as_tool(input: str, id: Optional[str] = None) -> str: + """Used as a calculator tool.""" + printc(f"llm_as_tool: {input}", color="yellow") + + return llm(prompt_kwargs={"input_str": input}, id=id) + + llm_tool = FunctionTool(llm_as_tool, component=llm) + # llm_tool.train() + # output: Parameter = llm_tool("What is 2+2?") + # output.draw_graph() + # print(output) + + tool_manager = ToolManager(tools=[llm_tool]) + tool_manager.train() + expr_or_fun = Parameter( + name="expr_or_fun", + data=FunctionExpression(action="llm_as_tool('What is 2+2?')"), + eval_input="What is 2+2?", + param_type=ParameterType.INPUT, + ) + output: Parameter = tool_manager(expr_or_fun=expr_or_fun, step="parse") + print(output) + print(output.predecessors) + assert len(output.predecessors) == 1 + # output = tool_manager(output, step="execute") + # print(output) + # output.draw_graph() + + # expr_or_fun = FunctionExpression(action="llm_as_tool('What is 2+2?')") + + # tool_manager.eval() + # output = tool_manager(expr_or_fun=expr_or_fun, step="execute") + # print(output) diff --git a/adalflow/adalflow/core/types.py b/adalflow/adalflow/core/types.py index 251635ca..1cea4200 100644 --- a/adalflow/adalflow/core/types.py +++ b/adalflow/adalflow/core/types.py @@ -13,7 +13,6 @@ Literal, Callable, 
Awaitable, - Type, ) from collections import OrderedDict from dataclasses import ( @@ -25,6 +24,7 @@ from datetime import datetime import uuid import logging +import json from adalflow.core.base_data_class import DataClass, required_field from adalflow.core.tokenizer import Tokenizer @@ -282,19 +282,25 @@ class RetrieverOutput(DataClass): It is up to the subclass of Retriever to specify the type of query and document. """ - doc_indices: List[int] = field(metadata={"desc": "List of document indices"}) - doc_scores: Optional[List[float]] = field( + id: str = field(default=None, metadata={"desc": "The unique id of the output"}) + + doc_indices: List[int] = field( + default=required_field, metadata={"desc": "List of document indices"} + ) + doc_scores: List[float] = field( default=None, metadata={"desc": "List of document scores"} ) - query: Optional[RetrieverQueryType] = field( + query: RetrieverQueryType = field( default=None, metadata={"desc": "The query used to retrieve the documents"} ) - documents: Optional[List[RetrieverDocumentType]] = field( + documents: List[RetrieverDocumentType] = field( default=None, metadata={"desc": "List of retrieved documents"} ) -RetrieverOutputType = List[RetrieverOutput] # so to support multiple queries at once +RetrieverOutputType = Union[ + List[RetrieverOutput], RetrieverOutput +] # so to support multiple queries at once ####################################################################################### @@ -306,8 +312,13 @@ class RetrieverOutput(DataClass): @dataclass class FunctionDefinition(DataClass): __doc__ = r"""The data modeling of a function definition, including the name, description, and parameters.""" - - func_name: str = field(metadata={"desc": "The name of the tool"}) + class_instance: Optional[Any] = field( + default=None, + metadata={"desc": "The instance of the class this function belongs to"}, + ) + func_name: str = field( + metadata={"desc": "The name of the tool"}, default=required_field + ) func_desc: 
Optional[str] = field( default=None, metadata={"desc": "The description of the tool"} ) @@ -365,6 +376,7 @@ def add(a, b): default_factory=dict, metadata={"desc": "The keyword arguments of the function"}, ) + __output_fields__ = ["thought", "name", "kwargs"] _action_desc = """FuncName() \ @@ -407,9 +419,10 @@ def add(a, b): The benefits are less failed function calls. """ - thought: Optional[str] = field( - default=None, metadata={"desc": "Why the function is called"} - ) + # question: str = field( + # default=None, metadata={"desc": "The question to ask the LLM"} + # ) + thought: str = field(default=None, metadata={"desc": "Why the function is called"}) action: str = field( default_factory=required_field, metadata={"desc": _action_desc}, @@ -514,36 +527,19 @@ class StepOutput(DataClass, Generic[T]): default=None, metadata={"desc": "The execution result shown for this action"} ) - @classmethod - def with_action_type(cls, action_type: Type[T]) -> Type["StepOutput[T]"]: - """ - Create a new StepOutput class with the specified action type. - - Use this if you want to create schema for StepOutput with a specific action type. - - Args: - action_type (Type[T]): The type to set for the action attribute. - - Returns: - Type[StepOutput[T]]: A new subclass of StepOutput with the specified action type. - - Example: - - .. 
code-block:: python - - from adalflow.core.types import StepOutput, FunctionExpression - - StepOutputWithFunctionExpression = StepOutput.with_action_type(FunctionExpression) - """ - # Create a new type variable map - type_var_map = {T: action_type} - - # Create a new subclass with the updated type - new_cls = type(cls.__name__, (cls,), {"__type_var_map__": type_var_map}) - - # Update the __annotations__ to reflect the new type of action - new_cls.__annotations__["action"] = action_type - return new_cls + def to_prompt_str(self) -> str: + output: Dict[str, Any] = {} + if self.action and isinstance(self.action, FunctionExpression): + if self.action.thought: + output["thought"] = self.action.thought + output["action"] = self.action.action if self.action else None + if self.observation: + output["observation"] = ( + self.observation.to_dict() + if hasattr(self.observation, "to_dict") + else str(self.observation) + ) + return json.dumps(output) ####################################################################################### diff --git a/adalflow/adalflow/datasets/big_bench_hard.py b/adalflow/adalflow/datasets/big_bench_hard.py index f98f2517..3e628f74 100644 --- a/adalflow/adalflow/datasets/big_bench_hard.py +++ b/adalflow/adalflow/datasets/big_bench_hard.py @@ -24,7 +24,7 @@ class BigBenchHard(Dataset): Size for each split: - train: 50 examples - - val: 50 examples + - val: 100 examples - test: 100 examples Args: @@ -120,11 +120,11 @@ def _check_or_download_dataset(self, data_path: str = None, split: str = "train" ] val_examples = [ {"x": ex["input"], "y": ex["target"], "id": str(uuid.uuid4())} - for ex in examples[50:100] + for ex in examples[50:150] ] test_examples = [ {"x": ex["input"], "y": ex["target"], "id": str(uuid.uuid4())} - for ex in examples[150:250] + for ex in examples[150:] ] # ensure the @@ -150,7 +150,7 @@ def get_default_task_instruction(): if __name__ == "__main__": from adalflow.datasets.big_bench_hard import BigBenchHard - dataset = 
BigBenchHard(task_name="word_sorting", split="train") + dataset = BigBenchHard(task_name="object_counting", split="test") print(dataset[0:10]) print(len(dataset)) print(dataset.get_default_task_instruction()) diff --git a/adalflow/adalflow/datasets/hotpot_qa.py b/adalflow/adalflow/datasets/hotpot_qa.py index 528ae388..22919e77 100644 --- a/adalflow/adalflow/datasets/hotpot_qa.py +++ b/adalflow/adalflow/datasets/hotpot_qa.py @@ -1,13 +1,12 @@ import random import os -import csv -from typing import Literal +from typing import Literal, List from adalflow.utils.lazy_import import safe_import, OptionalPackages from adalflow.utils.data import Dataset -from adalflow.utils.file_io import save_csv +from adalflow.utils.file_io import save_csv, save_json, load_json from adalflow.datasets.utils import prepare_dataset_path from adalflow.core.base_data_class import DataClass from adalflow.datasets.types import HotPotQAData @@ -23,6 +22,16 @@ def __init__( size: int = None, **kwargs, ) -> None: + r""" + official_train: 15661 + sampled_trainset: 11745 + sampled_valset: 3916 + test: 7405 + + All answers are a phrase in the supporting context where we can choose supporting facts from the context. + + You can specify the size of the dataset to load by setting the size parameter. 
+ """ if split not in ["train", "val", "test"]: raise ValueError("Split must be one of 'train', 'val', 'test'") @@ -36,7 +45,8 @@ def __init__( self.task_name = f"hotpot_qa_{keep_details}" data_path = prepare_dataset_path(self.root, self.task_name) # download and save - split_csv_path = os.path.join(data_path, f"{split}.csv") + split_csv_path = os.path.join(data_path, f"{split}.json") + print(f"split_csv_path: {split_csv_path}") self._check_or_download_dataset( split_csv_path, split, only_hard_examples, keep_details ) @@ -46,12 +56,20 @@ def __init__( # created_data_class = DynamicDataClassFactory.from_dict( # "HotPotQAData", {"id": "str", "question": "str", "answer": "str"} - with open(split_csv_path, newline="") as csvfile: - reader = csv.DictReader(csvfile) - for i, row in enumerate(reader): - if size is not None and i >= size: - break - self.data.append(HotPotQAData.from_dict(row)) + # with open(split_csv_path, newline="") as csvfile: + # reader = csv.DictReader(csvfile) + # for i, row in enumerate(reader): + # if size is not None and i >= size: + # break + # self.data.append(HotPotQAData.from_dict(row)) + + self.data = load_json(split_csv_path) + if size is not None: + # use random seed to make sure the same data is loaded + # random.Random(0).shuffle(self.data) + self.data = self.data[:size] + # convert to dataclass + self.data = [HotPotQAData.from_dict(d) for d in self.data] def _check_or_download_dataset( self, @@ -90,6 +108,24 @@ def _check_or_download_dataset( hf_official_dev = load_dataset( "hotpot_qa", "fullwiki", split="validation", trust_remote_code=True ) + data_path_dir = os.path.dirname(data_path) + # save all the original data + all_original_keys = hf_official_train[0].keys() + for split, examples in zip( + ["hf_official_train", "hf_official_dev"], + [hf_official_train, hf_official_dev], + ): + target_path = os.path.join(data_path_dir, f"{split}.csv") + save_csv(examples, f=target_path, fieldnames=all_original_keys) + # for example in examples: + 
# # is answer in the context + # print(f"example: {example}") + # context = str(json.dumps(example["context"])) + # if example["answer"] in context: + # print(f"answer in context") + # else: + # print(f"answer not in context") + print(f"saved {split} to {target_path}") keys = ["question", "answer"] if keep_details == "all": keys = [ @@ -101,33 +137,39 @@ def _check_or_download_dataset( "context", ] elif keep_details == "dev_titles": - keys = ["id", "question", "answer", "supporting_facts"] + keys = ["id", "question", "answer", "supporting_facts", "context"] - official_train = [] + official_train = [] # 15661 for raw_example in hf_official_train: if raw_example["level"] == "hard": example = {k: raw_example[k] for k in keys} if "supporting_facts" in example: example["gold_titles"] = set(example["supporting_facts"]["title"]) - del example["supporting_facts"] + # del example["supporting_facts"] official_train.append(example) + print(f"official_train: {len(official_train)}") rng = random.Random(0) rng.shuffle(official_train) - sampled_trainset = official_train[: len(official_train) * 75 // 100] + sampled_trainset = official_train[: len(official_train) * 70 // 100] # 11745 + print(f"sampled_trainset: {len(sampled_trainset)}") - sampled_valset = official_train[ - len(official_train) * 75 // 100 : + sampled_valset = official_train[ # 3916 + len(official_train) * 70 // 100 : ] # this is not the official dev set + print(f"sampled_valset: {len(sampled_valset)}") + # for example in self._train: # if keep_details == "dev_titles": # del example["gold_titles"] - test = [] + test = [] # 7405 + + print(f"raw_example: {hf_official_dev[0]}") for raw_example in hf_official_dev: assert raw_example["level"] == "hard" example = { @@ -136,24 +178,44 @@ def _check_or_download_dataset( } if "supporting_facts" in example: example["gold_titles"] = set(example["supporting_facts"]["title"]) - del example["supporting_facts"] + + # del example["supporting_facts"] test.append(example) - keys = 
["id", "question", "answer", "gold_titles"] + keys = ["id", "question", "answer", "gold_titles", "context"] + + # split test into val and test + # random shuff the test + rng.shuffle(test) + test_split = test[: len(test) * 50 // 100] # 3702 + val_split = test[len(test) * 50 // 100 :] # 3703 + # save to csv for split, examples in zip( ["train", "val", "test"], - [sampled_trainset, sampled_valset, test], + [sampled_trainset, val_split, test_split], ): # target_path = prepare_dataset_path(self.root, task_name, split) - save_csv(examples, f=data_path, fieldnames=keys) + target_path = os.path.join(data_path_dir, f"{split}.json") + # filter the examples with only the keys + save_examples: List[HotPotQAData] = [] + for example in examples: + save_example = {k: example[k] for k in keys if k in example} + save_example = HotPotQAData.from_dict(save_example) + save_examples.append(save_example.to_dict()) + save_json(save_examples, f=target_path) + if split == "train": + print(f"train example: {examples[0]}") + print(f"saved {split} to {target_path}") if split == "train": return sampled_trainset elif split == "val": return sampled_valset - else: + elif split == "test": return test + else: + raise ValueError("Split must be one of 'train', 'val', 'test'") def __getitem__(self, index) -> DataClass: return self.data[index] @@ -172,3 +234,107 @@ def __len__(self): print(len(testdataset)) print(f"valdataset[0]: {valdataset[0]}") print(f"testdataset[0]: {testdataset[0]}") + # example = { + # "id": "5a8b57f25542995d1e6f1371", + # "question": "Were Scott Derrickson and Ed Wood of the same nationality?", + # "answer": "yes", + # "type": "comparison", + # "level": "hard", + # "supporting_facts": { + # "title": ["Scott Derrickson", "Ed Wood"], + # "sent_id": [0, 0], + # }, + # "context": { + # "title": [ + # "Adam Collis", + # "Ed Wood (film)", + # "Tyler Bates", + # "Doctor Strange (2016 film)", + # "Hellraiser: Inferno", + # "Sinister (film)", + # "Deliver Us from Evil (2014 film)", + # 
"Woodson, Arkansas", + # "Conrad Brooks", + # "The Exorcism of Emily Rose", + # ], + # "sentences": [ + # [ + # "Adam Collis is an American filmmaker and actor.", + # " He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010.", + # " He also studied cinema at the University of Southern California from 1991 to 1997.", + # ' Collis first work was the assistant director for the Scott Derrickson\'s short "Love in the Ruins" (1995).', + # ' In 1998, he played "Crankshaft" in Eric Koyanagi\'s "Hundred Percent".', + # ], + # [ + # "Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.", + # " The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau.", + # " Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.", + # ], + # [ + # "Tyler Bates (born June 5, 1965) is an American musician, music producer, and composer for films, television, and video games.", + # ' Much of his work is in the action and horror film genres, with films like "Dawn of the Dead, 300, Sucker Punch," and "John Wick."', + # " He has collaborated with directors like Zack Snyder, Rob Zombie, Neil Marshall, William Friedkin, Scott Derrickson, and James Gunn.", + # ' With Gunn, he has scored every one of the director\'s films; including "Guardians of the Galaxy", which became one of the highest grossing domestic movies of 2014, and its 2017 sequel.', + # ' In addition, he is also the lead guitarist of the American rock band Marilyn Manson, and produced its albums "The Pale Emperor" and "Heaven Upside Down".', + # ], + # [ + # "Doctor Strange is a 2016 American superhero film based on the Marvel Comics character of the same name, produced by Marvel Studios and distributed by Walt Disney Studios 
Motion Pictures.", + # " It is the fourteenth film of the Marvel Cinematic Universe (MCU).", + # " The film was directed by Scott Derrickson, who wrote it with Jon Spaihts and C. Robert Cargill, and stars Benedict Cumberbatch as Stephen Strange, along with Chiwetel Ejiofor, Rachel McAdams, Benedict Wong, Michael Stuhlbarg, Benjamin Bratt, Scott Adkins, Mads Mikkelsen, and Tilda Swinton.", + # ' In "Doctor Strange", surgeon Strange learns the mystic arts after a career-ending car accident.', + # ], + # [ + # "Hellraiser: Inferno (also known as Hellraiser V: Inferno) is a 2000 American horror film.", + # ' It is the fifth installment in the "Hellraiser" series and the first "Hellraiser" film to go straight-to-DVD.', + # " It was directed by Scott Derrickson and released on October 3, 2000.", + # " The film concerns a corrupt detective who discovers Lemarchand's box at a crime scene.", + # " The film's reviews were mixed.", + # ], + # [ + # "Sinister is a 2012 supernatural horror film directed by Scott Derrickson and written by Derrickson and C. 
Robert Cargill.", + # " It stars Ethan Hawke as fictional true-crime writer Ellison Oswalt who discovers a box of home movies in his attic that puts his family in danger.", + # ], + # [ + # "Deliver Us from Evil is a 2014 American supernatural horror film directed by Scott Derrickson and produced by Jerry Bruckheimer.", + # ' The film is officially based on a 2001 non-fiction book entitled "Beware the Night" by Ralph Sarchie and Lisa Collier Cool, and its marketing campaign highlighted that it was "inspired by actual accounts".', + # " The film stars Eric Bana, Édgar Ramírez, Sean Harris, Olivia Munn, and Joel McHale in the main roles and was released on July 2, 2014.", + # ], + # [ + # "Woodson is a census-designated place (CDP) in Pulaski County, Arkansas, in the United States.", + # " Its population was 403 at the 2010 census.", + # " It is part of the Little Rock–North Little Rock–Conway Metropolitan Statistical Area.", + # " Woodson and its accompanying Woodson Lake and Wood Hollow are the namesake for Ed Wood Sr., a prominent plantation owner, trader, and businessman at the turn of the 20th century.", + # " Woodson is adjacent to the Wood Plantation, the largest of the plantations own by Ed Wood Sr.", + # ], + # [ + # "Conrad Brooks (born Conrad Biedrzycki on January 3, 1931 in Baltimore, Maryland) is an American actor.", + # " He moved to Hollywood, California in 1948 to pursue a career in acting.", + # ' He got his start in movies appearing in Ed Wood films such as "Plan 9 from Outer Space", "Glen or Glenda", and "Jail Bait."', + # " He took a break from acting during the 1960s and 1970s but due to the ongoing interest in the films of Ed Wood, he reemerged in the 1980s and has become a prolific actor.", + # " He also has since gone on to write, produce and direct several films.", + # ], + # [ + # "The Exorcism of Emily Rose is a 2005 American legal drama horror film directed by Scott Derrickson and starring Laura Linney and Tom Wilkinson.", + # " The film 
is loosely based on the story of Anneliese Michel and follows a self-proclaimed agnostic who acts as defense counsel (Linney) representing a parish priest (Wilkinson), accused by the state of negligent homicide after he performed an exorcism.", + # ], + # ], + # }, + # } + + # # save to csv + # keys = ["id", "question", "answer", "gold_titles", "context"] + # example["gold_titles"] = set(example["supporting_facts"]["title"]) + + # # test, save to hotpotQA + + # data = HotPotQAData.from_dict({k: example[k] for k in keys}) + # print(f"data: {data}") + + # # save to json + # save_json([data.to_dict()], f="test.json") + + # # load from json + # loaded_data = load_json("test.json") + # # convert to dataclass + # data = HotPotQAData.from_dict(loaded_data[0]) + # print(f"data: {data}") diff --git a/adalflow/adalflow/datasets/trec.py b/adalflow/adalflow/datasets/trec.py index 75267609..da20ad2d 100644 --- a/adalflow/adalflow/datasets/trec.py +++ b/adalflow/adalflow/datasets/trec.py @@ -43,8 +43,6 @@ def sample_subset_dataset(dataset, num_samples: int, sample_weights): def prepare_datasets(): from datasets import load_dataset - from datasets import Dataset as HFDataset - from adalflow.optim.sampler import ClassSampler dataset = load_dataset("trec") print(f"train: {len(dataset['train'])}, test: {len(dataset['test'])}") # 5452, 500 @@ -59,19 +57,26 @@ def prepare_datasets(): len_train_dataset = len(org_train_dataset) org_test_dataset = dataset["test"] - eval_size = 6 * num_classes - - class_sampler = ClassSampler( - org_train_dataset.select( - range(0, len_train_dataset // 3) - ), # created huggingface dataset type - num_classes=num_classes, - get_data_key_fun=lambda x: x["coarse_label"], - ) - - eval_dataset_split = [sample.data for sample in class_sampler(eval_size)] - # convert this back to huggingface dataset - eval_dataset_split = HFDataset.from_list(eval_dataset_split) + # eval_size = 18 * num_classes + + # class_sampler = ClassSampler( + # org_train_dataset.select( + # 
range(0, len_train_dataset // 3) + # ), # created huggingface dataset type + # num_classes=num_classes, + # get_data_key_fun=lambda x: x["coarse_label"], + # ) + + # eval_dataset_split = [sample.data for sample in class_sampler(eval_size)] + # # convert this back to huggingface dataset + # eval_dataset_split = HFDataset.from_list(eval_dataset_split) + + # sample eval from the first 1/3 of the train dataset + # eval_dataset_split = org_train_dataset.select(range(len_train_dataset // 3)) + # # sample a subset of the eval dataset, just randomly sampling + # eval_dataset_split = sample_subset_dataset( + # eval_dataset_split, eval_size, torch.ones(len(eval_dataset_split)) + # ) # (2) create train dataset from the last 2/3 of the train dataset, 100 samples per class train_dataset_split = org_train_dataset.select( @@ -85,7 +90,7 @@ def prepare_datasets(): train_dataset_split, train_size, class_weights ) print(f"train example: {train_dataset_split[0]}") - print(f"train: {len(train_dataset_split)}, eval: {len(eval_dataset_split)}") + # print(f"train: {len(train_dataset_split)}, eval: {len(eval_dataset_split)}") # get the count for each class count_by_class: Dict[str, int] = {} @@ -98,14 +103,23 @@ def prepare_datasets(): # create the test dataset from the test dataset # weights for the test dataset labels = torch.tensor(org_test_dataset["coarse_label"]) - class_weights = calculate_class_weights(labels) + # class_weights = calculate_class_weights(labels) - test_size = eval_size * 4 - # weighted sampling on the test dataset - test_dataset_split = sample_subset_dataset( - org_test_dataset, test_size, class_weights + print(f"total test dataset: {len(org_test_dataset)}") + + # shuff, and get the first 1/3 as validation, 2/3 as test + test_dataset_split = org_test_dataset.shuffle(seed=42) + eval_dataset_split = test_dataset_split.select(range(len(test_dataset_split) // 3)) + test_dataset_split = test_dataset_split.select( + range(len(test_dataset_split) // 3, 
len(test_dataset_split)) ) + # test_size = eval_size * 2 + # # weighted sampling on the test dataset + # test_dataset_split = sample_subset_dataset( + # org_test_dataset, test_size, torch.ones(len(org_test_dataset)) + # ) + print( f"train example: {train_dataset_split[0]}, type: {type(train_dataset_split[0])}" ) diff --git a/adalflow/adalflow/datasets/types.py b/adalflow/adalflow/datasets/types.py index 3315d2d8..84950a48 100644 --- a/adalflow/adalflow/datasets/types.py +++ b/adalflow/adalflow/datasets/types.py @@ -1,5 +1,6 @@ import uuid from dataclasses import dataclass, field +from typing import Dict from adalflow.core.base_data_class import DataClass @@ -32,6 +33,34 @@ class HotPotQAData(Example): metadata={"desc": "The set of titles that support the answer"}, default=None, ) + context: Dict[str, object] = field( + metadata={"desc": "The context of the question"}, + default=None, + ) + + __input_fields__ = ["question"] + __output_fields__ = ["answer"] + + # @staticmethod + # def from_dict(d: Dict[str, Any]) -> "HotPotQAData": + # # Preprocess gold_titles + # if "gold_titles" in d and isinstance(d["gold_titles"], str): + # try: + # d["gold_titles"] = json.loads(d["gold_titles"]) + # except json.JSONDecodeError: + # # Replace single quotes with double quotes + # fixed_str = d["gold_titles"].replace("'", '"') + # d["gold_titles"] = set(json.loads(fixed_str)) + + # # Preprocess context + # if "context" in d and isinstance(d["context"], str): + # try: + # d["context"] = json.loads(d["context"]) + # except json.JSONDecodeError: + # fixed_str = d["context"].replace("'", '"') + # d["context"] = json.loads(fixed_str) + + # return HotPotQAData(**d) @dataclass @@ -52,3 +81,31 @@ class TrecData(BaseData): __input_fields__ = ["question"] # follow this order too. 
__output_fields__ = ["class_name", "class_index"] + + +if __name__ == "__main__": + # test the hotpotqa data + data = HotPotQAData( + question="What is the capital of France?", + answer="Paris", + gold_titles=set(["Paris", "France"]), + context={"Paris": "The capital of France"}, + ) + + data_dict = data.to_dict() + print("data_dict", data_dict) + data = HotPotQAData.from_dict(data_dict) + print("data", data) + + from adalflow.utils.file_io import save_json, load_json + + # save json + save_json(data_dict, f="task.json") + # load json + data_dict_loaded = load_json(f="task.json") + + print("data_dict_loaded", data_dict_loaded) + + # restore the data + data_restored = HotPotQAData.from_dict(data_dict_loaded) + print("data_restored", data_restored) diff --git a/adalflow/adalflow/eval/__init__.py b/adalflow/adalflow/eval/__init__.py index 67de685c..1d9ecd08 100644 --- a/adalflow/adalflow/eval/__init__.py +++ b/adalflow/adalflow/eval/__init__.py @@ -1,5 +1,5 @@ from .answer_match_acc import AnswerMatchAcc -from .retriever_recall import RetrieverRecall +from .retriever_recall import RetrieverEvaluator from .llm_as_judge import LLMasJudge, DEFAULT_LLM_EVALUATOR_PROMPT from .g_eval import ( GEvalJudgeEvaluator, @@ -10,7 +10,7 @@ __all__ = [ "AnswerMatchAcc", - "RetrieverRecall", + "RetrieverEvaluator", "LLMasJudge", "DEFAULT_LLM_EVALUATOR_PROMPT", "GEvalJudgeEvaluator", diff --git a/adalflow/adalflow/eval/answer_match_acc.py b/adalflow/adalflow/eval/answer_match_acc.py index b45e61c1..03da6cfa 100644 --- a/adalflow/adalflow/eval/answer_match_acc.py +++ b/adalflow/adalflow/eval/answer_match_acc.py @@ -3,6 +3,7 @@ from typing import List, Literal from adalflow.eval.base import BaseEvaluator, EvaluationResult from adalflow.optim.parameter import Parameter +from adalflow.eval.utils import normalize_answer, f1_score class AnswerMatchAcc(BaseEvaluator): @@ -27,12 +28,20 @@ class AnswerMatchAcc(BaseEvaluator): 1.0 >>> acc_list [1.0, 1.0, 1.0] + + References: + 1. 
HotpotQA: https://github.com/hotpotqa/hotpot/blob/master/hotpot_evaluate_v1.py """ def __init__( self, type: Literal[ - "exact_match", "fuzzy_match", "rouge_score", "bleu_score", "bert_score" + "exact_match", + "fuzzy_match", + "rouge_score", + "bleu_score", + "bert_score", + "f1_score", ] = "exact_match", ): self.type = type @@ -81,11 +90,13 @@ def compute_single_item( f"Error converting pred_answer and gt_answer to string: {e}" ) if self.type == "exact_match": - return 1.0 if y == y_gt else 0.0 + return 1.0 if normalize_answer(y) == normalize_answer(y_gt) else 0.0 elif self.type == "fuzzy_match": - y = y.lower() - y_gt = y_gt.lower() + y = normalize_answer(y) + y_gt = normalize_answer(y_gt) return 1.0 if y_gt in y else 0.0 + elif self.type == "f1_score": + return f1_score(y, y_gt) elif self.type == "bert_score": from torchmetrics.text.bert import BERTScore diff --git a/adalflow/adalflow/eval/retriever_recall.py b/adalflow/adalflow/eval/retriever_recall.py index 9abe6d52..c433dc65 100644 --- a/adalflow/adalflow/eval/retriever_recall.py +++ b/adalflow/adalflow/eval/retriever_recall.py @@ -1,16 +1,23 @@ """Retriever Recall @k metric.""" -from typing import List, Union +from typing import List, Dict from adalflow.eval.base import BaseEvaluator, EvaluationResult +from adalflow.eval.utils import normalize_answer -class RetrieverRecall(BaseEvaluator): - __doc__ = r"""Recall@k measures the ratio of the number of relevant context strings in the top-k retrieved context to the total number of ground truth relevant context strings. +class RetrieverEvaluator(BaseEvaluator): + __doc__ = r"""Return Recall@k and Precision@k. + + Recall@k = Number of relevant retrieved documents/ Total number of relevant documents (len(gt_contexts)) + Precision@k = Number of relevant retrieved documents/ Total number of retrieved documents (len(retrieved_contexts)) + In our implementation, we use exact string matching between each gt context and the joined retrieved context string. 
You can use the longest common subsequence (LCS) or other similarity metrics(or embedding based) to decide if it is a match or not. + You can also pass ids of retrieved and the reference. + If you do not even have the ground truth context, but only grounth truth answers, you can consider using RAGAS framework for now. It computes the recall as: @@ -43,36 +50,55 @@ class RetrieverRecall(BaseEvaluator): def __init__(self): super().__init__() - def _compute_single_item( - self, retrieved_context: str, gt_context: Union[str, List[str]] - ) -> float: + def compute_single_item( + self, retrieved_context: List[str], gt_context: List[str] + ) -> Dict[str, float]: r""" Compute the recall of the retrieved context for a single query. Args: - retrieved_context (str): Retrieved context string. - gt_context (Union[str, List[str]]): Context string or list of context strings to compare against. + retrieved_context (List[str]): List of retrieved context strings. + gt_context (List[str]): List of ground truth context strings. Returns: float: Recall value. 
""" - if isinstance(gt_context, str): - gt_context = [gt_context] - recalled = 0 - for gt_context_sentence in gt_context: - if gt_context_sentence in retrieved_context: - recalled += 1 - return recalled / len(gt_context) + # 1 normalize the text + normalized_retrieved_context = [ + normalize_answer(doc) for doc in retrieved_context + ] + + normalized_gt_context = [normalize_answer(doc) for doc in gt_context] + + set_retrieved = set(normalized_retrieved_context) + set_gt = set(normalized_gt_context) + + # 2 calculate the recall with intersection + + recall = len(set_gt.intersection(set_retrieved)) / len(set_gt) + precision = len(set_gt.intersection(set_retrieved)) / len(set_retrieved) + + return {"recall": recall, "precision": precision} + + # if isinstance(gt_context, str): + # gt_context = [gt_context] + # recalled = 0 + # for gt_context_sentence in gt_context: + # normalized_gt_context = normalize_answer(gt_context_sentence) + # normalized_retrieved_context = normalize_answer(retrieved_context) + # if normalized_gt_context in normalized_retrieved_context: + # recalled += 1 + # return recalled / len(gt_context) def compute( self, - retrieved_contexts: Union[List[str], List[List[str]]], + retrieved_contexts: List[List[str]], gt_contexts: List[List[str]], ) -> EvaluationResult: r""" Compute the recall of the retrieved context for a list of queries. Args: - retrieved_contexts (Union[List[str], List[List[str]]): List of retrieved context strings. Using List[str] we assume you have joined all the context sentences into one string. + retrieved_context: List of retrieved context strings. gt_contexts ( List[List[str]]): List of ground truth context strings. Returns: @@ -84,15 +110,53 @@ def compute( raise ValueError( "The number of retrieved context lists and ground truth context lists should be the same." 
) - k = len(retrieved_contexts) - recall_list = [] + k = len(retrieved_contexts[0]) + metric_list = [] for retrieved_context, gt_context in zip(retrieved_contexts, gt_contexts): - if isinstance(retrieved_context, list): - retrieved_context = " ".join(retrieved_context) - recall = self._compute_single_item(retrieved_context, gt_context) - recall_list.append(recall) - - avg_score = sum(recall_list) / len(recall_list) - return EvaluationResult( - avg_score, recall_list, additional_info={"type": f"RetrieverRecall@{k}"} + # if isinstance(retrieved_context, list): + # retrieved_context = " ".join(retrieved_context) + metric = self.compute_single_item(retrieved_context, gt_context) + metric_list.append(metric) + + # average through each key value + + avg_recall = sum([metric["recall"] for metric in metric_list]) / len( + metric_list ) + avg_precision = sum([metric["precision"] for metric in metric_list]) / len( + metric_list + ) + + return { + "avg_recall": avg_recall, + "avg_precision": avg_precision, + "recall_list": [metric["recall"] for metric in metric_list], + "precision_list": [metric["precision"] for metric in metric_list], + "top_k": k, + } + + # return EvaluationResult( + # avg_score, recall_list, additional_info={"type": f"RetrieverRecall@{k}"} + # ) + + +if __name__ == "__main__": + from adalflow.datasets import HotPotQA, HotPotQAData + + train_dataset = HotPotQA(split="train", size=10) + data: HotPotQAData = train_dataset[0] + gold_titles = data.gold_titles + context_titles = data.context["title"] + print(f"gold_titles: {gold_titles}, context_titles: {context_titles}") + print(f"train: {len(train_dataset)}, example: {train_dataset[0]}") + + # compute the recall and precision for 10 items + retriever_eval = RetrieverEvaluator() + + gt_contexts = [list(data.gold_titles) for data in train_dataset[:10]] + + retrieved_contexts = [list(data.context["title"]) for data in train_dataset[:10]] + + result = retriever_eval.compute(retrieved_contexts, gt_contexts) + + 
print(f"result: {result}") diff --git a/adalflow/adalflow/eval/utils.py b/adalflow/adalflow/eval/utils.py new file mode 100644 index 00000000..babf5b78 --- /dev/null +++ b/adalflow/adalflow/eval/utils.py @@ -0,0 +1,48 @@ +# from hotpotqa github +import re + +import string +from collections import Counter + + +def normalize_answer(s): + + def remove_articles(text): + return re.sub(r"\b(a|an|the)\b", " ", text) + + def white_space_fix(text): + return " ".join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return "".join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def f1_score(y: str, y_gt: str) -> float: + if not isinstance(y, str) or not isinstance(y_gt, str): + raise ValueError(f"y: {y},{type(y)}, y_gt: {y_gt},{type(y_gt)} must be string.") + prediction_tokens = normalize_answer(y).split() + ground_truth_tokens = normalize_answer(y_gt).split() + + common = Counter(prediction_tokens) & Counter(ground_truth_tokens) + num_same = sum(common.values()) + + if len(prediction_tokens) == len(ground_truth_tokens) == 0: + # Unlike most tasks, QReCC and SQuAD-2.0 assign 1.0 in this edge case. We don't for uniformity. 
+ print( + "\n#> F1 Metric: Rare edge case of len(prediction_tokens) == len(ground_truth_tokens) == 0.\n" + ) + + if num_same == 0: + return 0 + + precision = 1.0 * num_same / len(prediction_tokens) + recall = 1.0 * num_same / len(ground_truth_tokens) + f1 = (2 * precision * recall) / (precision + recall) + + return f1 diff --git a/adalflow/adalflow/optim/few_shot/bootstrap_optimizer.py b/adalflow/adalflow/optim/few_shot/bootstrap_optimizer.py index a78c1ec6..9c509598 100644 --- a/adalflow/adalflow/optim/few_shot/bootstrap_optimizer.py +++ b/adalflow/adalflow/optim/few_shot/bootstrap_optimizer.py @@ -14,6 +14,7 @@ from adalflow.core.functional import random_sample from adalflow.optim.optimizer import DemoOptimizer from adalflow.optim.types import ParameterType +from adalflow.utils import printc log = logging.getLogger(__name__) @@ -219,7 +220,10 @@ def samples_to_str( yaml_str = sample.to_yaml(exclude=exclude_fields) else: - yaml_str = sample.to_yaml(exclude=["id", "score"]) + yaml_str = sample.to_yaml( + include=sample.get_input_fields() + sample.get_output_fields() + ) + printc(f"yaml_str: {yaml_str}") sample_strs.append(yaml_str + "\n") except Exception as e: print(f"Error: {e} to yaml for {sample}") diff --git a/adalflow/adalflow/optim/grad_component.py b/adalflow/adalflow/optim/grad_component.py index b73e536e..067fec9d 100644 --- a/adalflow/adalflow/optim/grad_component.py +++ b/adalflow/adalflow/optim/grad_component.py @@ -1,19 +1,39 @@ """Base class for Autograd Components that can be called and backpropagated through.""" -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Callable, Optional, Dict from collections import OrderedDict +import uuid import logging +from copy import deepcopy if TYPE_CHECKING: from adalflow.core.generator import BackwardEngine - from adalflow.optim.parameter import Parameter + from adalflow.core import ModelClient +from adalflow.optim.parameter import ( + Parameter, + OutputParameter, + Gradient, + 
GradientContext, +) from adalflow.optim.types import ParameterType +from adalflow.core.types import GeneratorOutput +from adalflow.utils import printc + +import json from adalflow.core.component import Component from adalflow.optim.function import BackwardContext +from adalflow.core.prompt_builder import Prompt +from adalflow.optim.text_grad.backend_engine_prompt import ( + GRAD_COMPONENT_CONVERSATION_TEMPLATE_STRING, + LOSS_CONVERSATION_START_INSTRUCTION_STRING_FN, + OBJECTIVE_INSTRUCTION_BASE, + OBJECTIVE_INSTRUCTION_CHAIN, +) -__all__ = ["GradComponent"] + +__all__ = ["GradComponent", "FunGradComponent", "fun_to_grad_component"] log = logging.getLogger(__name__) @@ -28,22 +48,43 @@ class GradComponent(Component): The __call__ method will check if the component is in training mode, and call the `forward` method to return a `Parameter` object if it is in training mode, otherwise, it will call the `call` method to return the output such as "GeneratorOutput", "RetrieverOutput", etc. + + Note: Avoid using the attributes and methods that are defined here and in the `Component` class unless you are overriding them. 
""" backward_engine: "BackwardEngine" _component_type = "grad" + id = None + _component_desc = "GradComponent" def __init__(self, *args, **kwargs): super().__init__() super().__setattr__("backward_engine", None) - - def __call__(self, *args, **kwargs): - if self.training: - return self.forward(*args, **kwargs) + super().__setattr__("id", str(uuid.uuid4())) + + # def set_backward_engine(self, backward_engine: "BackwardEngine", *args, **kwargs): + # raise NotImplementedError("set_backward_engine method is not implemented") + def set_backward_engine( + self, + backward_engine: "BackwardEngine" = None, + model_client: "ModelClient" = None, + model_kwargs: Dict[str, object] = None, + ): + from adalflow.core.generator import BackwardEngine + + self.backward_engine = backward_engine + if not backward_engine: + log.info( + "EvalFnToTextLoss: No backward engine provided. Creating one using model_client and model_kwargs." + ) + self.backward_engine = BackwardEngine(model_client, model_kwargs) else: - return self.call(*args, **kwargs) + if type(backward_engine) is not BackwardEngine: + raise TypeError( + f"EvalFnToTextLoss: backward_engine must be an instance of BackwardEngine. Got {type(backward_engine)}." + ) - def set_backward_engine(self, backward_engine: "BackwardEngine", *args, **kwargs): - raise NotImplementedError("set_backward_engine method is not implemented") + def disable_backward_engine(self): + self.backward_engine = None def call(self, *args, **kwargs): raise NotImplementedError("call method is not implemented") @@ -59,17 +100,12 @@ def forward(self, *args, **kwargs) -> "Parameter": 3. Return the parameter object. """ - from adalflow.optim.parameter import Parameter + from adalflow.optim.parameter import Parameter, OutputParameter log.debug( f"Forwarding through {self.name} with args: {args} and kwargs: {kwargs}" ) - # if "id" not in kwargs: - # raise ValueError( - # "id must be provided in the kwargs of a GradComponent for tracing." - # ) - # 1. 
get all predecessors from all args and kwargs input_args = OrderedDict() @@ -77,6 +113,9 @@ def forward(self, *args, **kwargs) -> "Parameter": for idx, arg in enumerate(args): input_args[f"arg_{idx}"] = arg + # Get data id from the kwargs + data_id = kwargs.get("id", None) + # Add keyword args to the ordered dict, preserving order predecessors = [] for v in input_args.values(): @@ -84,11 +123,30 @@ def forward(self, *args, **kwargs) -> "Parameter": predecessors.append(v) if v.param_type == ParameterType.INPUT: v.data_id = kwargs.get("id", None) - for v in kwargs.values(): + if data_id is None: + data_id = v.data_id + # printc(f"kwargs: {kwargs}") + # discard_keys = [] + for k, v in kwargs.items(): if isinstance(v, Parameter): predecessors.append(v) if v.param_type == ParameterType.INPUT: v.data_id = kwargs.get("id", None) + if data_id is None: + data_id = v.data_id + # support list of Parameters by flattening them + elif isinstance(v, list): + for i, p in enumerate(v): + if isinstance(p, Parameter): + predecessors.append(p) + # flat_key = f"{k}_{i}" + # input_args[flat_key] = p + # pop the original list + # discard_keys.append(k) + # for k in discard_keys: + # kwargs.pop(k) + + # setattr(self, f"{k}_{i}", p) # 2. 
unwrap the parameter object to take only the data, successor_map_fn: lambda x: x.data in default # unwrap args @@ -104,6 +162,14 @@ def forward(self, *args, **kwargs) -> "Parameter": for k, v in kwargs.items(): if isinstance(v, Parameter): unwrapped_kwargs[k] = v.map_to_successor(self) + elif isinstance(v, list): + values = [] + for p in v: + if isinstance(p, Parameter): + values.append(p.map_to_successor(self)) + else: + values.append(p) + unwrapped_kwargs[k] = values else: unwrapped_kwargs[k] = v @@ -115,21 +181,39 @@ def forward(self, *args, **kwargs) -> "Parameter": call_response = self.call(*unwrapped_args, **unwrapped_kwargs) + if isinstance(call_response, Parameter): + raise ValueError( + f"A GradComponent call should not return Parameter, got {call_response.name}" + ) + predecessors.append(call_response) + return call_response + # 4. Create a Parameter object to trace the forward pass - input_args.update(kwargs) - response = Parameter( + # input_args.update(kwargs) + # use unwrapped args and unwrapped kwargs to trace the forward pass + tracing_args = {i: v for i, v in enumerate(unwrapped_args)} + tracing_args.update(**unwrapped_kwargs) + + response = OutputParameter( data=call_response, name=self.name + "_output", role_desc=self.name + " response", param_type=ParameterType.OUTPUT, + data_id=data_id, ) response.set_predecessors(predecessors) - response.trace_forward_pass(input_args=input_args, full_response=call_response) + response.trace_forward_pass( + input_args=tracing_args, + full_response=call_response, + id=self.id, # this is component id + name=self.name, + ) response.set_grad_fn( BackwardContext( backward_fn=self.backward, response=response, - id=kwargs.get("id", None), + id=data_id, + input_kwargs=kwargs, ) ) return response @@ -141,21 +225,531 @@ def backward(self, *, response: "Parameter", id: str = None, **kwargs): Subclass should implement this method if you need additional backward logic. 
""" + log.info(f"GradComponent backward: {response.name}") children_params = response.predecessors if response.get_gradient_and_context_text().strip() == "": log.info(f"Generator: Backward: No gradient found for {response}.") - for pred in children_params: - pred.set_score(response._score) - from adalflow.utils.logger import printc + # backward the backward engine disable signal + if response.backward_engine_disabled: + for pred in children_params: + pred.backward_engine_disabled = True + + for _, pred in enumerate(children_params): + if response.score is not None: + pred.set_score(response.score) - printc( - f"Retriever: Backward: {pred.name} set_score: {response._score}, {response.name}", - "blue", - ) if pred.param_type == ParameterType.DEMOS: pred.add_score_to_trace( - trace_id=id, score=response._score, is_teacher=self.teacher_mode + trace_id=id, score=response.score, is_teacher=self.teacher_mode + ) + + # pass the current gradient to pred + + # TODO: each gradcomponent will have its own context, but + # passing the successor's gradient.data to the current. 
+ + for grad in response.gradients: + # NOTE: make a copy of the gradient, we should not modify the original gradient + grad = deepcopy(grad) + # update the gradient context and from and to + # grad.update_from_to(response, pred) + grad.is_default_copy = ( + True # response and pred will keep the original gradient + ) + # NOTE: test of keep the initial gradient context + # grad.add_context( + # GradientContext( + # variable_desc=pred.role_desc, + # response_desc=response.name, + # input_output=f"""{response.component_trace.to_context_str()}""", + # ) + # ) + + pred.add_gradient(grad) + + +class GradComponent2(GradComponent): + "Graduable functional component" + + def __init__( + self, + desc: str, + name: Optional[str] = None, + backward_engine: Optional["BackwardEngine"] = None, + model_client: "ModelClient" = None, + model_kwargs: Dict[str, object] = None, + ): + super().__init__() + self.desc = desc + self.backward_engine = backward_engine + self.model_client = model_client + self.name = name or f"{self.__class__.__name__}" + + self.backward_engine = None + if backward_engine is None: + log.info( + "EvalFnToTextLoss: No backward engine provided. Creating one using model_client and model_kwargs." + ) + if model_client and model_kwargs: + + self.set_backward_engine(backward_engine, model_client, model_kwargs) + else: + if not isinstance(backward_engine, BackwardEngine): + raise TypeError( + "EvalFnToTextLoss: backward_engine must be an instance of BackwardEngine." + ) + self.backward_engine = backward_engine + + # def set_backward_engine( + # self, + # backward_engine: "BackwardEngine" = None, + # model_client: "ModelClient" = None, + # model_kwargs: Dict[str, object] = None, + # ): + # from adalflow.core.generator import BackwardEngine + + # self.backward_engine = backward_engine + # if not backward_engine: + # log.info( + # "EvalFnToTextLoss: No backward engine provided. Creating one using model_client and model_kwargs." 
+ # ) + # self.backward_engine = BackwardEngine(model_client, model_kwargs) + # else: + # if type(backward_engine) is not BackwardEngine: + # raise TypeError( + # f"EvalFnToTextLoss: backward_engine must be an instance of BackwardEngine. Got {type(backward_engine)}." + # ) + + @staticmethod + def _backward_through_one_predecessor( + pred: Parameter, + kwargs: Dict[str, Parameter], + response: Parameter, + desc: str, + backward_engine: "BackwardEngine", + ground_truth: object = None, + is_intermediate_node: bool = False, # if the node is an intermediate node in the backpropagation chain + metadata: Dict[str, str] = None, + ): + if not pred.requires_opt: + if response.score is not None: + pred.set_score(response.score) + log.debug( + f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." + ) + return + log.debug( + f"EvalFnToTextLoss: Backward through {pred}, is_intermediate_node: {is_intermediate_node}" + ) + + if pred.check_if_already_computed_gradient_respect_to(response.id): + log.info( + f"EvalFnToTextLoss: Gradient already computed for {pred.role_desc} with respect to {response.role_desc}" + ) + + return + + if backward_engine is None: + log.error( + "EvalFnToTextLoss: backward_engine is required for text prompt optimization." + ) + raise ValueError( + "EvalFnToTextLoss: backward_engine is required for text prompt optimization." 
+ ) + + instruction_str, objective_str = None, None + + # convert kwargs to key, (value, type(eval_input)) + + inputs = {} + + for k, v in kwargs.items(): + if isinstance(v, Parameter): + inputs[k] = (v.get_param_info(), str(type(v.eval_input))) + elif isinstance(v, list): + # flat the list to multiple parameters + + for i, p in enumerate(v): + if isinstance(p, Parameter): + flat_key = f"{k}_{i}" + inputs[flat_key] = (p.get_param_info(), str(type(p.eval_input))) + + # response information + conversation_str = Prompt( + GRAD_COMPONENT_CONVERSATION_TEMPLATE_STRING, + prompt_kwargs={ + "inputs": inputs, + "component_desc": desc, + "response_value": response.get_prompt_data(), + "metadata": json.dumps(metadata) if metadata else None, + }, + )() + + conv_ins_template = LOSS_CONVERSATION_START_INSTRUCTION_STRING_FN + obj_ins_template = OBJECTIVE_INSTRUCTION_BASE + + if is_intermediate_node: + printc(f"is_intermediate_node: {is_intermediate_node}") + # conv_ins_template = CONVERSATION_START_INSTRUCTION_STRING_FN_CHAIN + obj_ins_template = OBJECTIVE_INSTRUCTION_CHAIN + + instruction_str = Prompt( + conv_ins_template, + prompt_kwargs={ + "variable": pred.get_param_info(), + "conversation_str": conversation_str, + }, + )() + response_gradient = response.get_gradients_str() + # response_gradient = response.get_gradients_component_schema() + if not response_gradient: + raise ValueError( + f"Generator: No gradient found for {response}. Please check the response. 
pred: {pred}" + ) + objective_str = Prompt( + obj_ins_template, + prompt_kwargs={ + "response_name": response.name, + "response_desc": response.role_desc, + "response_gradient": response_gradient, + }, + )() + + log.info(f"EvalFnToTextLoss: Instruction: {instruction_str}") + log.info(f"EvalFnToTextLoss: Objective: {objective_str}") + log.info(f"EvalFnToTextLoss: Conversation: {conversation_str}") + + # Compute the gradient + backward_engine_prompt_kwargs = { + "conversation_sec": instruction_str, + "objective_instruction_sec": objective_str, + # "evaluate_variable_instruction_sec": eval_str, + } + gradient_value: GeneratorOutput = backward_engine( + prompt_kwargs=backward_engine_prompt_kwargs + ) + gradient_prompt = backward_engine.get_prompt(**backward_engine_prompt_kwargs) + # print(f"Backward engine prompt: {gradient_prompt}") + gradient_value_data = ( + gradient_value.data + or backward_engine.failure_message_to_optimizer( + gradient_response=gradient_value + ) + ) + + gradient_value_data = ( + f"expected answer: {ground_truth},\n Feedback: {gradient_value_data}" + ) + # print(f"gradient_value_data: {gradient_value_data}") + + log.debug(f"EvalFnToTextLoss: Gradient for {pred}: {gradient_value_data}") + + # score should be passed to grad + gradient_param = Gradient( + data=gradient_value_data, + data_id=response.data_id, + score=response.score, + from_response=response, + to_pred=pred, + ) + gradient_param.add_prompt(gradient_prompt) + gradient_param.add_context( + GradientContext( + input_output=conversation_str, + response_desc=response.role_desc, + variable_desc=pred.role_desc, + # ground_truth=ground_truth, + ) + ) + pred.add_gradient(gradient_param) + + # backward the end to end score + # TODO: not really useful + if response.score is not None: + pred.set_score(response.score) + pred.set_gt(ground_truth) + + # TODO: reduce meta + + def backward(self, *, response: "OutputParameter", id: str = None, **kwargs): + """Backward pass of the function. 
In default, it will pass all the scores to the predecessors. + + Note: backward is mainly used internally and better to only allow kwargs as the input. + + Subclass should implement this method if you need additional backward logic. + """ + + log.info(f"GradComponent backward: {response.name}") + children_params = response.predecessors + + input_kwargs = kwargs.get("input_kwargs", {}) + + is_intermediate_node = False + response_gradient_context = response.get_gradient_and_context_text().strip() + if response_gradient_context != "": + log.info("EvalFnToTextLoss is an intermediate node.") + is_intermediate_node = True + + if response.get_gradient_and_context_text().strip() == "": + log.info(f"Generator: Backward: No gradient found for {response}.") + + # backward the backward engine disable signal + if response.backward_engine_disabled: + for pred in children_params: + pred.backward_engine_disabled = True + + # use pass through gradient when there is one predecessor + if not self.backward_engine or len(children_params) < 2: + super().backward(response=response, id=id) + + else: + + for _, pred in enumerate(children_params): + if response.score is not None: + pred.set_score(response.score) + printc(f"score {response.score} for pred name: {pred.name}") + if not pred.requires_opt: + continue + + if pred.param_type == ParameterType.DEMOS: + pred.add_score_to_trace( + trace_id=id, score=response.score, is_teacher=self.teacher_mode + ) + + printc(f"pred: {pred.name}, response: {response.name}") + + self._backward_through_one_predecessor( + pred=pred, + kwargs=input_kwargs, + response=response, + backward_engine=self.backward_engine, + desc=self.desc, + is_intermediate_node=is_intermediate_node, + ) + + +class FunGradComponent(GradComponent2): + # r"""Wraps a function as a GradComponent. + + # Args: + # fun (Callable): The function to be wrapped. 
+ + # Examples: + + # function = lambda x: x + 1 + # fun_component = FunComponent(function) + # print(fun_component(1)) # 2 + # """ + + def __init__( + self, + fun: Optional[Callable] = None, + afun: Optional[Callable] = None, + desc: str = "", + doc_string=None, + ): + desc = desc or fun.__doc__ or f"Function: {fun.__name__}" + + super().__init__(desc=desc, name=fun.__name__) + self.fun_name = fun.__name__ + self.fun = fun + # set the docstring + self.doc_string = doc_string + print(f"fun doc: {self.doc_string}, desc: {desc}") + setattr( + self.fun, + "__doc__", + doc_string or fun.__doc__ or f"Function: {fun.__name__}", + ) + + setattr(self.fun, "__name__", fun.__name__) + + # def __call__(self, *args, **kwargs): + # output = super().__call__(*args, **kwargs) + # # set __doc__ and __name__ for this function + + # set normal function attributes + # self.__name__ = fun.__name__ + # self.__doc__ = doc_string or fun.__doc__ or f"Function: {fun.__name__}" + # setattr(self, "__name__", fun.__name__) + # setattr( + # self, "__doc__", doc_string or fun.__doc__ or f"Function: {fun.__name__}" + # ) + # EntityMapping.register(self.fun_name, (fun, desc)) + + def call(self, *args, **kwargs): + # fun, desc = EntityMapping.get(self.fun_name) + + # if isinstance(fun_doc_string, Parameter): + # remove the __doc__ keyword argument + kwargs.pop("doc_string", None) + + return self.fun(*args, **kwargs) + + def forward(self, *args, **kwargs) -> Parameter: + """add func_doc_string to the kwargs before calling the super().forward""" + kwargs["doc_string"] = self.doc_string + output = super().forward(*args, **kwargs) + return output + + def _extra_repr(self) -> str: + return ( + super()._extra_repr() + + f"fun_name={self.fun_name}, fun={self.fun.__name__}, fun_doc={self.fun.__doc__}" + ) + + +# def fun_to_grad_component(fun, desc: str = "") -> FunGradComponent: +# r"""Helper function to convert a function into a Component with +# its own class name. 
+ +# Can be used as both a decorator and a function. + +# Args: +# fun (Callable): The function to be wrapped. +# Returns: +# FunComponent: The component that wraps the function. + +# Examples: +# 1. As a decorator: +# >>> @fun_to_component +# >>> def my_function(x): +# >>> return x + 1 +# >>> # is equivalent to +# >>> class MyFunctionComponent(FunComponent): +# >>> def __init__(self): +# >>> super().__init__(my_function) + +# 2. As a function: +# >>> my_function_component = fun_to_component(my_function) +# """ + +# # Split the function name by underscores, capitalize each part, and join them back together +# class_name = ( +# "".join(part.capitalize() for part in fun.__name__.split("_")) + "GradComponent" +# ) +# # register the function +# EntityMapping.register(fun.__name__, fun) +# # Define a new component class dynamically +# component_class = type( +# class_name, +# (FunGradComponent,), +# {"__init__": lambda self: FunGradComponent.__init__(self, fun, desc)}, +# ) +# # register the component +# EntityMapping.register(class_name, component_class) + +# return component_class() + + +def fun_to_grad_component(desc: str = "", doc_string=None) -> Callable: + """ + Return a decorator that, when applied to a function `fun`, + wraps it in a GradComponent with the given `desc`. + + Examples: + + 1. As a decorator: + + + ::code-block :: python + + @fun_to_grad_component(desc="This is a test function", doc_string=Parameter( + data="Finish the task with verbatim short factoid responses from retrieved context.", + param_type=ParameterType.PROMPT, + requires_opt=True, + role_desc="Instruct how the agent creates the final answer from the step history.", + )) + def my_function(x): + return x + 1 + + print(my_function(1)) + + 2. 
As a function: + + ::code-block :: python + + def my_function(x): + return x + 1 + + my_function_component = fun_to_grad_component(desc="This is a test function")(my_function) + """ + + def decorator(fun): + # 1) build the class name + class_name = ( + "".join(part.capitalize() for part in fun.__name__.split("_")) + + "GradComponent" + ) + # 2) register the function + # EntityMapping.register(fun.__name__, fun) + # 3) define the new class + printc(f"fun doc 1: {doc_string}, desc: {desc}") + component_class = type( + class_name, + (FunGradComponent,), + { + "__init__": lambda self: FunGradComponent.__init__( + self, fun=fun, desc=desc, doc_string=doc_string ) + }, + ) + # 4) register the new component class + # EntityMapping.register(class_name, component_class) + # 5) return instance of that class + return component_class() + + return decorator + + +if __name__ == "__main__": + # Test FunGradComponent + from adalflow.optim.parameter import Parameter + + def my_function(x): + __doc__ = Parameter( # noqa F841 + data="Finish the task with verbatim short factoid responses from retrieved context.", + param_type=ParameterType.PROMPT, + requires_opt=True, + role_desc="Instruct how the agent creates the final answer from the step history.", + ) + return x + 1 + + my_function_component = fun_to_grad_component()(my_function) + print(my_function_component) # 2 + # eval mode + output = my_function_component(1) + print(output) + # training mode + my_function_component.train() + output = my_function_component(Parameter(data=1, name="input")) + print(output) + + # now test the decorator + @fun_to_grad_component( + desc="This is a test function", + doc_string=Parameter( + data="Finish the task with verbatim short factoid responses from retrieved context.", + param_type=ParameterType.PROMPT, + requires_opt=True, + role_desc="Instruct how the agent creates the final answer from the step history.", + ), + ) + def my_function(x): + + return x + 1 + + print(my_function(1)) + # eval 
mode + output = my_function(1) + print(output) + assert output == 2 + + # training mode + my_function.train() + print(my_function) + output = my_function(Parameter(data=1, name="input")) + print(output) diff --git a/adalflow/adalflow/optim/loss_component.py b/adalflow/adalflow/optim/loss_component.py index e53ac609..bfe6d875 100644 --- a/adalflow/adalflow/optim/loss_component.py +++ b/adalflow/adalflow/optim/loss_component.py @@ -1,6 +1,7 @@ """Base class for Autograd Components that can be called and backpropagated through.""" from typing import TYPE_CHECKING +import uuid if TYPE_CHECKING: from adalflow.core.generator import BackwardEngine @@ -27,10 +28,12 @@ class LossComponent(Component): """ backward_engine: "BackwardEngine" _component_type = "loss" + id = None def __init__(self, *args, **kwargs): super().__init__() super().__setattr__("backward_engine", None) + super().__setattr__("id", str(uuid.uuid4())) def __call__(self, *args, **kwargs): return self.forward(*args, **kwargs) @@ -38,6 +41,9 @@ def __call__(self, *args, **kwargs): def set_backward_engine(self, backward_engine: "BackwardEngine", *args, **kwargs): raise NotImplementedError("set_backward_engine method is not implemented") + def disable_backward_engine(self): + self.backward_engine = None + def forward(self, *args, **kwargs) -> "Parameter": r"""Default just wraps the call method.""" raise NotImplementedError("forward method is not implemented") diff --git a/adalflow/adalflow/optim/parameter.py b/adalflow/adalflow/optim/parameter.py index 5b60995c..5a510083 100644 --- a/adalflow/adalflow/optim/parameter.py +++ b/adalflow/adalflow/optim/parameter.py @@ -13,14 +13,17 @@ Callable, TYPE_CHECKING, ) -from pyvis.network import Network from collections import defaultdict +from pyvis.network import Network import logging import os from dataclasses import dataclass, field import uuid from adalflow.optim.types import ParameterType from adalflow.core.base_data_class import DataClass +from 
adalflow.utils.logger import printc +import html + if TYPE_CHECKING: from adalflow.optim.text_grad.tgd_optimizer import TGDData, TGDOptimizerTrace @@ -31,23 +34,40 @@ @dataclass -class GradientContext: +class GradientContext(DataClass): + """GradientContext is used to describe the component's function and trace its input and output. + + To get the component's function desc, use GradientContext.to_yaml_signature() + To get the data: use instance.to_yaml() + """ + variable_desc: str = field( metadata={"desc": "The description of the target parameter"} ) - response_desc: str = field( - metadata={"desc": "The description of the response parameter"} - ) - context: str = field( + # from template LOSS_CONVERSATION_TEMPLATE_STRING + # LLM_CONVERSATION_TEMPLATE from backward_engine_prompt + input_output: str = field( metadata={ "desc": "The context of the gradient in form of a conversation indicating \ - the relation of the current parameter to the response parameter (gradient)" + the relation of the current parameter to the response parameter" } ) + response_desc: str = field( + metadata={"desc": "The description of the response parameter"} + ) + # input: Dict[str, Any] = field( + # metadata={"desc": "The input to the whole system"}, default=None + # ) + + # ground_truth: Any = field( + # metadata={"desc": "The ground truth of the response parameter"}, default=None + # ) @dataclass -class ComponentTrace: +class ComponentTrace(DataClass): + name: str = field(metadata={"desc": "The name of the component"}, default=None) + id: str = field(metadata={"desc": "The unique id of the component"}, default=None) input_args: Dict[str, Any] = field( metadata={"desc": "The input arguments of the GradComponent forward"}, default=None, @@ -55,6 +75,9 @@ class ComponentTrace: full_response: object = field( metadata={"desc": "The full response of the GradComponent output"}, default=None ) + raw_response: str = field( + metadata={"desc": "The raw response of the generator"}, default=None + ) 
api_kwargs: Dict[str, Any] = field( metadata={ "desc": "The api_kwargs for components like Generator and Retriever that pass to the model client" @@ -62,6 +85,10 @@ class ComponentTrace: default=None, ) + def to_context_str(self): + output = f""": {self.input_args}. : {self.full_response}""" + return output + # TODO: use this to better trace the score @dataclass @@ -75,28 +102,47 @@ class ScoreTrace: ) +@dataclass(frozen=True) +class ComponentNode(DataClass): + """Used to represent a node in the component graph.""" + + id: str = field(metadata={"desc": "The unique id of the component"}) + name: str = field(metadata={"desc": "The name of the component"}) + type: Literal["INPUT", "COMPONENT"] = field( + metadata={"desc": "The type of the node"}, default="COMPONENT" + ) + + COMBINED_GRADIENTS_TEMPLATE = r""" -{% if combined_gradients %} -Batch size: {{ combined_gradients|length }} +{% if component_schema %} + +Gradients are from {{ component_schema | length }} components. +{% for component_id, schema in component_schema.items() %} +{{ schema }} +{% endfor %} + {% endif %} -{% for g in combined_gradients %} -{% set gradient = g[0] %} -{% set gradient_context = g[1] %} -{% if gradient_context %} -{{loop.index}}. -{{gradient_context.context}} +{% if combined_gradients %} +{% for group in combined_gradients %} + +{{ group.average_score|round(2) }} +{% for gradient in group.gradients %} +{{ loop.index }}. +INPUT_OUTPUT: {{ gradient.context }} +{% if gradient.score is not none %} +{{ gradient.score | round(3) }} +{{ gradient.gradient }} {% endif %} +{% endfor %} + -{% if gradient.data %} - {% if gradient_context %} -{#The output is used as <{{gradient_context.response_desc}}>#} -{{gradient.data}} -{% else %} -{{gradient.data}} -{% endif %} + +{% endfor %} {% endif %} -{% endfor %}""" +""" + +# id: {{ component_id }}, remove using component id class Parameter(Generic[T]): @@ -127,6 +173,14 @@ class Parameter(Generic[T]): 1. 
https://github.com/karpathy/micrograd/blob/master/micrograd/engine.py """ + allowed_types = { + ParameterType.NONE, + ParameterType.PROMPT, + ParameterType.DEMOS, + ParameterType.HYPERPARAM, + ParameterType.INPUT, + } + id: str = None # Unique id of the parameter name: str = None # Name of the parameter, easier to read for humans role_desc: str = "" # Description of the role of the parameter @@ -139,41 +193,38 @@ class Parameter(Generic[T]): proposing: bool = False # State of the parameter predecessors: Set["Parameter"] = set() # Predecessors of the parameter peers: Set["Parameter"] = set() # Peers of the parameter - # TODO: input_args should be OrderedDict to keep the order of args - input_args: Dict[str, Any] = None # Input arguments of the GradComponent forward - full_response: object = None # Full response of the GradComponent output eval_input: object = None # Eval input passing to the eval_fn or evaluator you use successor_map_fn: Dict[str, Callable] = ( None # Map function to get the data from the output ) - from_response_id: str = ( - None # for parameterType GRADIENT, the id of the response parameter - ) + backward_engine_disabled: bool = ( False # Disable the backward engine for the parameter ) - component_trace: ComponentTrace = None # Trace of the component tgd_optimizer_trace: "TGDOptimizerTrace" = None # Trace of the TGD optimizer + data_in_prompt: Callable = ( + None # Callable to get the str of the data to be used in the prompt + ) + gt: object = None # Ground truth of the parameter + def __init__( self, *, - id: Optional[str] = None, + id: Optional[str] = None, # unique id of the parameter data: T = None, # for generator output, the data will be set up as raw_response data_id: str = None, # for tracing the data item in the training/val/test set requires_opt: bool = True, role_desc: str = "", param_type: ParameterType = ParameterType.NONE, name: str = None, # name is used to refer to the parameter in the prompt, easier to read for humans - 
gradient_prompt: str = None, - raw_response: str = None, # use this to track the raw response of generator instead of the data (can be parsed) instruction_to_optimizer: str = None, instruction_to_backward_engine: str = None, score: Optional[float] = None, eval_input: object = None, - from_response_id: Optional[str] = None, successor_map_fn: Optional[Dict[str, Callable]] = None, + data_in_prompt: Callable = None, ): self.id = id or str(uuid.uuid4()) self.data_id = data_id @@ -188,24 +239,24 @@ def __init__( else f"param_{self.id}" ) self.param_type = param_type + # allow subclasses to override allowed_types dynamically + allowed_types = getattr(self.__class__, "allowed_types", set()) + if param_type not in allowed_types: + raise ValueError( + f"{param_type.name} is not allowed for {self.__class__.__name__}" + ) + self.data = data # often string and will be used in the prompts self.requires_opt = requires_opt self.data_type = type(data) self.set_eval_fn_input(eval_input=data) - self.gradients: List[Parameter] = [] # gradient.data - self.gradient_prompt: str = ( - gradient_prompt # the whole llm prompt to compute the gradient - ) - self.gradients_context: Dict[Parameter, GradientContext] = defaultdict( - lambda: None - ) # input and output from an operator, each operator should have a template - # ... 
+ self.gradients: Set[Gradient] = set() + self.grad_fn = None self.previous_data = None # used to store the previous data # context of the forward pass - self.raw_response = raw_response self.instruction_to_optimizer: str = instruction_to_optimizer self.instruction_to_backward_engine: str = instruction_to_backward_engine @@ -214,7 +265,7 @@ def __init__( self._traces: Dict[str, DataClass] = {} # id to data items (DynamicDataClass) self._student_traces: Dict[str, DataClass] = {} # id - self._score: float = ( + self.score: float = ( score # end to end evaluation score, TODO: might have multiple scores if using multiple eval fns # score is set in the gradients in the backward pass ) @@ -224,9 +275,15 @@ def __init__( self._previous_demos: List[DataClass] = [] self.eval_input = eval_input - self.from_response_id = from_response_id # for gradient parameter self.successor_map_fn = successor_map_fn or {} - self.component_trace = ComponentTrace() + + def default_prompt_map_fn(param: Parameter): + # if isinstance(param.data, GeneratorOutput): + # return param.data.raw_response + return param.data + + self.data_in_prompt = data_in_prompt or default_prompt_map_fn + self.gt = None def map_to_successor(self, successor: object) -> T: """Apply the map function to the successor based on the successor's id.""" @@ -245,14 +302,262 @@ def check_if_already_computed_gradient_respect_to(self, response_id: str) -> boo from_response_ids = [g.from_response_id for g in self.gradients] return response_id in from_response_ids - def add_gradient(self, gradient: "Parameter"): - if gradient.param_type != ParameterType.GRADIENT: - raise ValueError("Cannot add non-gradient parameter to gradients list.") + ############################################################################################################ + # Handle gt + ############################################################################################################ + def set_gt(self, gt: object): + + self.gt = gt + + def 
get_gt(self) -> object: + return self.gt + + # ############################################################################################################ + # Handle gradients and context + # ############################################################################################################ + def add_gradient(self, gradient: "Gradient"): + # if gradient.param_type != ParameterType.GRADIENT: + # raise ValueError("Cannot add non-gradient parameter to gradients list.") if gradient.from_response_id is None: raise ValueError("Gradient must have a from_response_id.") - self.gradients.append(gradient) + start_order = len(self.gradients) + gradient.order = start_order + + self.gradients.add(gradient) + # sort the gradients by the data_id, response_component_id, and score + self.sort_gradients() + + def reset_gradients(self): + self.gradients = set() + + def get_gradients_names(self) -> str: + names = [g.name for g in self.gradients] + names = ", ".join(names) + return names + + def get_prompt_data(self) -> str: + return self.data_in_prompt(self) + + def get_gradients_str(self) -> str: + if not self.gradients: + return "" + + gradients_str = "" + for i, g in enumerate(self.gradients): + gradients_str += f"{i}. {g.data}\n" + + return gradients_str + + def get_gradient_and_context_text(self, skip_correct_sample: bool = False) -> str: + """Aggregates and returns: + 1. the gradients + 2. the context text for which the gradients are computed + + Sort the gradients from the lowest score to the highest score. + Highlight the gradients with the lowest score to the optimizer. 
+ """ + from adalflow.core.prompt_builder import Prompt + + if not self.gradients: + return "" + + # sore gradients by the score from low to high + # self.gradients = sorted( + # self.gradients, key=lambda x: x.score if x.score is not None else 1 + # ) + # print the score for the sorted gradients + lowest_score_gradients = [] + for i, g in enumerate(self.gradients): + if skip_correct_sample: + if g.score > 0.5: + continue + lowest_score_gradients.append(g) + + gradient_context_combined_str = "" + if lowest_score_gradients and len(lowest_score_gradients) > 0: + + # parse the gradients and context. + # gradients_and_context: List[Dict[str, Any]] = ( + # [] + # ) # {gradient: data, context: GradientContext.input_output} + # for g in lowest_score_gradients: + # gradients_and_context.append( + # { + # "data_id": g.data_id, + # "gradient": g.data, + # "context": g.context.input_output, + # "score": g.score, + # } + # ) + + # group gradients by data_id and calculate average scores + grouped_gradients = defaultdict( + lambda: {"gradients": [], "score_sum": 0, "count": 0} + ) + for g in lowest_score_gradients: + group = grouped_gradients[g.data_id] + group["gradients"].append( + { + "gradient": g.data, + "context": g.context.input_output, + "score": g.score, + } + ) + group["score_sum"] += g.score if g.score is not None else 0 + group["count"] += 1 + + # Calculate average scores and sort groups + grouped_list = [] + for data_id, group in grouped_gradients.items(): + average_score = ( + group["score_sum"] / group["count"] if group["count"] > 0 else 0 + ) + grouped_list.append( + { + "data_id": data_id, + "average_score": average_score, + "gradients": group["gradients"], + } + ) + sorted_groups = sorted(grouped_list, key=lambda x: x["average_score"]) + + gradient_context_combined_str = Prompt( + template=COMBINED_GRADIENTS_TEMPLATE, + prompt_kwargs={"combined_gradients": sorted_groups}, + )().strip() + + # get component id: gradient + component_id_to_gradient: Dict[str, 
Gradient] = {} + for g in lowest_score_gradients: + component_id_to_gradient[g.from_response_component_id] = g + + componend_id_to_schema: Dict[str, str] = {} + for id, g in component_id_to_gradient.items(): + componend_id_to_schema[id] = g.context.to_yaml(exclude={"input_output"}) + + # if there are multiple successors, there will be multiple component schemas + + return gradient_context_combined_str + + def get_gradients_component_schema(self, skip_correct_sample: bool = False) -> str: + """Aggregates and returns: + 1. the gradients + 2. the context text for which the gradients are computed + + Sort the gradients from the lowest score to the highest score. + Highlight the gradients with the lowest score to the optimizer. + """ + from adalflow.core.prompt_builder import Prompt + + # print( + # f"len of gradients: {len(self.gradients)}, scores: {[g._score for g in self.gradients]} for {self.name}" + # ) + + if not self.gradients: + return "" + + # sore gradients by the _score from low to high + # self.gradients = sorted( + # self.gradients, key=lambda x: x.score if x.score is not None else 1 + # ) + # print the score for the sorted gradients + lowest_score_gradients = [] + for i, g in enumerate(self.gradients): + if skip_correct_sample: + if g.score > 0.5: + continue + lowest_score_gradients.append(g) + + # Group gradients by `data_id` and calculate average scores + grouped_gradients = defaultdict( + lambda: {"gradients": [], "score_sum": 0, "count": 0} + ) + for g in lowest_score_gradients: + group = grouped_gradients[g.data_id] + group["gradients"].append( + { + "gradient": g.data, + "context": g.context.input_output, + "score": g.score, + } + ) + group["score_sum"] += g.score if g.score is not None else 0 + group["count"] += 1 + + # Calculate average scores and sort groups + grouped_list = [] + for data_id, group in grouped_gradients.items(): + average_score = ( + group["score_sum"] / group["count"] if group["count"] > 0 else 0 + ) + grouped_list.append( + { + 
"data_id": data_id, + "average_score": average_score, + "gradients": group["gradients"], + } + ) + sorted_groups = sorted(grouped_list, key=lambda x: x["average_score"]) + + # get component id: gradient + component_id_to_gradient: Dict[str, Gradient] = {} + for g in lowest_score_gradients: + component_id_to_gradient[g.from_response_component_id] = g + + componend_id_to_schema: Dict[str, str] = {} + for id, g in component_id_to_gradient.items(): + componend_id_to_schema[id] = g.context.to_yaml(exclude=["input_output"]) + + # parse the gradients and context. + gradients_and_context: List[Dict[str, Any]] = ( + [] + ) # {gradient: data, context: GradientContext.input_output} + for g in lowest_score_gradients: + gradients_and_context.append( + { + "data_id": g.data_id, + "gradient": g.data, + "context": g.context.input_output, + "score": g.score, + } + ) + + gradient_context_combined_str = Prompt( + template=COMBINED_GRADIENTS_TEMPLATE, + prompt_kwargs={ + "combined_gradients": sorted_groups, + "component_schema": componend_id_to_schema, + }, + )().strip() + + # if there are multiple successors, there will be multiple component schemas + + return gradient_context_combined_str + + def merge_gradients_for_cycle_components(self): + """Merge data_id, from_response_component_id into the same gradient""" + + def sort_gradients(self): + """With rules mentioned in Graient class, we will track the gradients by data_id, then response_component_id, then score""" + + self.gradients = sorted( + self.gradients, + key=lambda x: ( + x.data_id, + x.from_response_component_id, + -x.order if x.order is not None else 0, + x.from_response_id, + x.score, + ), + ) + # make it a set again + self.gradients = set(self.gradients) + + ############################################################################################################ + # Setters and getters + ############################################################################################################ def 
set_predecessors(self, predecessors: List["Parameter"] = None): if predecessors is None: @@ -269,11 +574,14 @@ def set_grad_fn(self, grad_fn): self.grad_fn = grad_fn def get_param_info(self): + """Used to represent the parameter in the prompt.""" return { "name": self.name, "role_desc": self.role_desc, - "data": self.data, + "prompt_data": self.data_in_prompt(self), # default to use all data "param_type": self.param_type, + "requires_opt": self.requires_opt, + "eval_input": self.eval_input, # for output passing to the eval_fn } def set_peers(self, peers: List["Parameter"] = None): @@ -291,27 +599,13 @@ def set_peers(self, peers: List["Parameter"] = None): # Trace the tgd optimizer data ############################################################################################################ def trace_optimizer(self, api_kwargs: Dict[str, Any], response: "TGDData"): + r"""Trace the inputs and output of a TGD optimizer.""" from adalflow.optim.text_grad.tgd_optimizer import TGDOptimizerTrace self.tgd_optimizer_trace = TGDOptimizerTrace( api_kwargs=api_kwargs, output=response ) - ############################################################################################################ - # Trace component, include trace_forward_pass & trace_api_kwargs for now - ############################################################################################################ - def trace_forward_pass(self, input_args: Dict[str, Any], full_response: object): - r"""Trace the forward pass of the parameter.""" - self.input_args = input_args - self.full_response = full_response - # TODO: remove the input_args and full_response to use component_trace - self.component_trace.input_args = input_args - self.component_trace.full_response = full_response - - def trace_api_kwargs(self, api_kwargs: Dict[str, Any]): - r"""Trace the api_kwargs for components like Generator and Retriever that pass to the model client.""" - self.component_trace.api_kwargs = api_kwargs - def 
set_eval_fn_input(self, eval_input: object): r"""Set the input for the eval_fn.""" self.eval_input = eval_input @@ -326,7 +620,12 @@ def set_score(self, score: float): But this score is only used to relay the score to the demo parametr. """ - self._score = score + score = float(score) + if not isinstance(score, float): + raise ValueError( + f"score is not float, but {type(score)}, parameter name: {self.name}" + ) + self.score = score def add_dataclass_to_trace(self, trace: DataClass, is_teacher: bool = True): r"""Called by the generator.forward to add a trace to the parameter. @@ -381,10 +680,6 @@ def revert_data(self, include_demos: bool = False): self.previous_data = None self.proposing = False - # reset the gradients and context - # self.reset_gradients() - # self.reset_gradients_context() - # cant reset gradients yet for the loss if include_demos: self._demos = self._previous_demos @@ -398,9 +693,6 @@ def step_data(self, include_demos: bool = False): self.previous_data = None self.proposing = False - # reset the gradients and context - # self.reset_gradients() - # self.reset_gradients_context() if include_demos: self._previous_demos = [] @@ -421,61 +713,6 @@ def update_value(self, data: T): self.data_type = type(data) self.data = data - def reset_gradients(self): - self.gradients = [] - - def reset_gradients_context(self): - self.gradients_context = defaultdict(lambda: None) - - def get_gradients_names(self) -> str: - names = [g.name for g in self.gradients] - names = ", ".join(names) - return names - - def get_gradient_and_context_text(self, skip_correct_sample: bool = False) -> str: - """Aggregates and returns: - 1. the gradients - 2. the context text for which the gradients are computed - - Sort the gradients from the lowest score to the highest score. - Highlight the gradients with the lowest score to the optimizer. 
- """ - from adalflow.core.prompt_builder import Prompt - - # print( - # f"len of gradients: {len(self.gradients)}, scores: {[g._score for g in self.gradients]} for {self.name}" - # ) - - # sore gradients by the _score from low to high - self.gradients = sorted( - self.gradients, key=lambda x: x._score if x._score is not None else 1 - ) - # print the score for the sorted gradients - lowest_score_gradients = [] - for i, g in enumerate(self.gradients): - if skip_correct_sample: - if g._score > 0.5: - continue - lowest_score_gradients.append(g) - print(f"{i} Score: {g._score} for {g.name}, {type(g._score)}") - - gradient_context_combined = list( - zip( - lowest_score_gradients, - [self.gradients_context[g] for g in lowest_score_gradients], - ) - ) - # set all gradients value to None - # for g in self.gradients: - # g.data = None - - gradient_context_combined_str = Prompt( - template=COMBINED_GRADIENTS_TEMPLATE, - prompt_kwargs={"combined_gradients": gradient_context_combined}, - )().strip() - - return gradient_context_combined_str - # TODO: dont use short value def get_short_value(self, n_words_offset: int = 10) -> str: """ @@ -485,7 +722,8 @@ def get_short_value(self, n_words_offset: int = 10) -> str: :type n_words_offset: int """ # 1. 
ensure the data is a string - data = self.data + # data = self.data + data = self.get_prompt_data() if not isinstance(self.data, str): data = str(self.data) words = data.split(" ") @@ -498,6 +736,12 @@ def get_short_value(self, n_words_offset: int = 10) -> str: ) return short_value + def reset_all_gradients(self): + """Traverse the graph and reset the gradients for all nodes.""" + nodes, _ = Parameter.trace_graph(self) + for node in nodes: + node.reset_gradients() + @staticmethod def trace_graph( root: "Parameter", @@ -507,6 +751,8 @@ def trace_graph( def build_graph(node: "Parameter"): if node in nodes: return + if node is None: + raise ValueError("Node is None") nodes.add(node) for pred in node.predecessors: edges.add((pred, node)) @@ -515,21 +761,6 @@ def build_graph(node: "Parameter"): build_graph(root) return nodes, edges - def report_cycle(cycle_nodes: List["Parameter"]): - """ - Report the detected cycle and provide guidance to the user on how to avoid it. - """ - cycle_names = [node.name for node in cycle_nodes] - log.warning(f"Cycle detected: {' -> '.join(cycle_names)}") - print(f"Cycle detected in the graph: {' -> '.join(cycle_names)}") - - # Provide guidance on how to avoid the cycle - print("To avoid the cycle, consider the following strategies:") - print("- Modify the graph structure to remove cyclic dependencies.") - print( - "- Check the relationships between these nodes to ensure no feedback loops." - ) - def backward( self, ): @@ -559,63 +790,94 @@ def build_topo(node: Parameter): if not node.requires_opt: log.debug(f"Skipping {node.name} as it does not require optimization") continue - log.debug(f"v: {node.data}, grad_fn: {node.grad_fn}, {node.get_grad_fn()}") + component_name = None + if hasattr(node, "component_trace"): + component_name = node.component_trace.name + printc( + f"node: {node.name}, component: {component_name}, grad_fn: {node.grad_fn}." 
+ ) if node.get_grad_fn() is not None: # gradient function takes in the engine log.debug(f"Calling gradient function for {node.name}") node.grad_fn() - # def backward( - # self, - # ): # engine should be the llm or customized backwards function to pass feedback - - # # topological sort of all the predecessors of the current parameter in the graph - # log.debug(f"Backward pass for {self.data}, backward function: {self.grad_fn}") - # topo: List[Parameter] = [] - # visited = set() - # in_stack = set() # Nodes currently being visited to detect cycles - # cycle_detected = False # Flag to check if any cycle was detected - - # def build_topo(node: Parameter, stack: Set[Parameter] = set()): - # nonlocal cycle_detected - - # if stack is None: - # stack = [] - - # # If the node is already in the stack, we have detected a cycle - # if node in in_stack: - # cycle_detected = True - # cycle_nodes = stack + [node] # The cycle includes the current path - # self.report_cycle(cycle_nodes) - # return False # Stop further processing due to cycle - # if node in visited: - # return - # visited.add(node) - # in_stack.add(node) - # stack.append(node) - # for pred in node.predecessors: - # build_topo(pred) - # topo.append(node) - # stack.pop() # Backtrack, remove the node from the current path - - # in_stack.remove(node) # Remove from the stack after processing - # return True - - # # build_topo(self) - # if not build_topo(self): - # log.error("Cycle detected, stopping backward pass.") - # return # Stop the backward pass due to cycle detection - # # backpropagation - - # self.gradients = set() - # for node in reversed(topo): - # if not node.requires_opt: - # log.debug(f"Skipping {node.name} as it does not require optimization") - # continue - # node.gradients = _check_and_reduce_gradients(node) - # log.debug(f"v: {node.data}, grad_fn: {node.grad_fn}, {node.get_grad_fn()}") - # if node.get_grad_fn() is not None: # gradient function takes in the engine - # log.debug(f"Calling gradient 
function for {node.name}") - # node.grad_fn() + @staticmethod + def generate_node_html(node: "Parameter", output_dir="node_pages"): + """Generate an HTML page for a specific node.""" + import json + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + filename = f"{output_dir}/{node.name}.html" + + # Gather gradients as JSON objects + gradients = [] + for i, g in enumerate(node.gradients): + gradient = g.to_json_obj() + for k, v in gradient.items(): + if isinstance(v, str): + gradient[k] = v.replace("<", "<").replace(">", ">") + gradients.append(gradient) + + data_json = None + node_data_type = str(type(node.data)).replace("<", "<").replace(">", ">") + printc(f"Node data type: {node_data_type}") + if isinstance(node.data, dict): + data_json = data_json + elif isinstance(node.data, DataClass): + try: + data_json = node.data.to_json_obj() + except Exception: + + data_json = str(node.data) + + else: + data_json = str(node.data) + data_json = {"data": data_json} + + gradients_json = json.dumps(gradients, indent=4, ensure_ascii=False) + + optimizer_trace = None + if node.tgd_optimizer_trace: + optimizer_trace = node.tgd_optimizer_trace.to_json_obj() + optimizer_trace = json.dumps(optimizer_trace, indent=4, ensure_ascii=False) + + with open(filename, "w") as file: + file.write( + f""" + + + + + + {node.name} + + + +

Details for Node: {node.name}

+

ID: {node.id}

+

Role: {node.role_desc}

+

DataType: {node_data_type}

+
Data: \n{json.dumps(data_json, indent=4)}
+

Data ID: {node.data_id}

+

Previous Value: {node.previous_data}

+

Requires Optimization: {node.requires_opt}

+

Type: {node.param_type.value} ({node.param_type.description})

+
Gradients:\n{gradients_json}
+
TGD Optimizer Trace:\n{optimizer_trace}
+ + + + """ + ) + print(f"Generated HTML for node: {node.name} at {filename}") def draw_interactive_html_graph( self, @@ -636,18 +898,34 @@ def draw_interactive_html_graph( """ from jinja2 import Template - # Define the output file path output_file = "interactive_graph.html" - final_file = filepath + "_" + output_file if filepath else output_file + filepath = filepath or "output" + os.makedirs(filepath, exist_ok=True) + final_file = os.path.join(filepath, output_file) - # Create a pyvis Network instance net = Network(height="750px", width="100%", directed=True) + node_colors = { + ParameterType.PROMPT: "lightblue", + ParameterType.DEMOS: "orange", + ParameterType.INPUT: "gray", + ParameterType.OUTPUT: "green", + ParameterType.GENERATOR_OUTPUT: "purple", + ParameterType.RETRIEVER_OUTPUT: "red", + ParameterType.LOSS_OUTPUT: "pink", + ParameterType.SUM_OUTPUT: "blue", + } + # Add nodes to the graph node_ids = set() for node in nodes: + self.generate_node_html(node, output_dir=filepath) + + node_id = node.id + node_show_name = node.name.replace(f"_{node_id}", "") label = ( - f"Name: {node.name}
" + f"""
""" + f"Name: {node_show_name}
" f"Role: {node.role_desc.capitalize()}
" f"Value: {node.data}
" f"Data ID: {node.data_id}
" @@ -655,18 +933,16 @@ def draw_interactive_html_graph( if node.proposing: label += "Proposing: Yes
" label += f"Previous Value: {node.previous_data}
" - if node.requires_opt: - label += "Requires Optimization: Yes
" + label += f"Requires Optimization: {node.requires_opt}
" if node.param_type: - label += f"Type: {node.param_type}
" - if node.gradients: - label += f"Gradients: {node.get_gradients_names()}
" + label += f"Type: {node.param_type.value}
" net.add_node( - node.id, - label=node.name, + n_id=node.id, + label=node_show_name, title=label, - color="lightblue" if node.proposing else "orange", + color=node_colors.get(node.param_type, "gray"), + url=f"./{node.name}.html", # Relative path ) node_ids.add(node.id) @@ -679,38 +955,145 @@ def draw_interactive_html_graph( f"Skipping edge from {source.name} to {target.name} as one of the nodes does not exist." ) - # Enable physics for better layout net.toggle_physics(True) net.template = Template( """ - - - - - - - -
- - - - """ + + + + + + + + +
+
+ +
+ +
+ Legend: +
+ PROMPT +
+
+ DEMOS +
+
+ INPUT +
+
+ OUTPUT +
+
+ GENERATOR_OUTPUT +
+
+ RETRIEVER_OUTPUT +
+
+ LOSS_OUTPUT +
+
+ SUM_OUTPUT +
+
+ +
+ + + + """ ) - # Save the graph as an HTML file - net.show(final_file) print(f"Interactive graph saved to {final_file}") return {"graph_path": final_file} + @staticmethod + def wrap_and_escape(text, width=40): + r"""Wrap text to the specified width, considering HTML breaks, and escape special characters.""" + try: + import textwrap + except ImportError as e: + raise ImportError( + "Please install textwrap using 'pip install textwrap' to use this feature" + ) from e + + def wrap_text(text, width): + """Wrap text to the specified width, considering HTML breaks.""" + lines = textwrap.wrap( + text, width, break_long_words=False, replace_whitespace=False + ) + return "
".join(lines) + + if not isinstance(text, str): + text = str(text) + text = ( + text.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + .replace('"', """) + .replace("'", "'") + .replace( + "\n", "
" + ) # Convert newlines to HTML line breaks if using HTML labels + ) + return wrap_text(text, width) + def draw_graph( self, add_grads: bool = True, @@ -739,26 +1122,9 @@ def draw_graph( "Please install graphviz using 'pip install graphviz' to use this feature" ) from e - # try: - # from tensorboardX import SummaryWriter - # except ImportError as e: - # raise ImportError( - # "Please install tensorboardX using 'pip install tensorboardX' to use this feature" - # ) from e assert rankdir in ["LR", "TB"] - try: - import textwrap - except ImportError as e: - raise ImportError( - "Please install textwrap using 'pip install textwrap' to use this feature" - ) from e root_path = get_adalflow_default_root_path() - # # prepare the log directory - # log_dir = os.path.join(root_path, "logs") - - # # Set up TensorBoard logging - # writer = SummaryWriter(log_dir) filename = f"trace_graph_{self.name}_id_{self.id}" filepath = ( @@ -766,77 +1132,69 @@ def draw_graph( if filepath else os.path.join(root_path, "graphs", filename) ) + # final_path = f"{filepath}.{format}" print(f"Saving graph to {filepath}.{format}") - def wrap_text(text, width): - """Wrap text to the specified width, considering HTML breaks.""" - lines = textwrap.wrap( - text, width, break_long_words=False, replace_whitespace=False - ) - return "
".join(lines) - - def wrap_and_escape(text, width=40): - r"""Wrap text to the specified width, considering HTML breaks, and escape special characters.""" - if not isinstance(text, str): - text = str(text) - text = ( - text.replace("&", "&") - .replace("<", "<") - .replace(">", ">") - .replace('"', """) - .replace("'", "'") - .replace( - "\n", "
" - ) # Convert newlines to HTML line breaks if using HTML labels - ) - return wrap_text(text, width) - nodes, edges = self.trace_graph(self) - dot = Digraph(format=format, graph_attr={"rankdir": rankdir}) + dot = Digraph(format=format, graph_attr={"rankdir": rankdir, "dpi": "300"}) node_names = set() for n in nodes: label_color = "darkblue" node_label = ( f"" - f"" - f"" - f"" + f"" + f"" + f"" + f"" ) if n.data_id is not None: - node_label += f"" + node_label += f"" if n.proposing: node_label += f"" - node_label += f"" + node_label += f"" if n.requires_opt: node_label += f"" if n.param_type: - node_label += f"" - if full_trace and n.component_trace.api_kwargs is not None: - node_label += f"" + node_label += f"" + if ( + full_trace + and hasattr(n, "component_trace") + and n.component_trace.api_kwargs is not None + ): + node_label += f"" # show the score for intermediate nodes - if n._score is not None and len(n.predecessors) > 0: - node_label += f"" + if n.score is not None and len(n.predecessors) > 0: + node_label += f"" if add_grads: - node_label += f"" + node_label += f"" # add a list of each gradient with short value # combine the gradients and context - combined_gradients_contexts = zip( - n.gradients, [n.gradients_context[g] for g in n.gradients] - ) - for g, context in combined_gradients_contexts: - gradient_context = context + # combined_gradients_contexts = zip( + # n.gradients, [n.gradients_context[g] for g in n.gradients] + # ) + # if "output" in n.name: + for g in n.gradients: + gradient_context = g.context log.info(f"Gradient context display: {gradient_context}") log.info(f"data: {g.data}") - node_label += f"" - if gradient_context != "": - node_label += f"" + node_label += f"" + # if gradient_context != "": + # node_label += f"" + # if g.prompt: + # node_label += f"" if len(n._traces.values()) > 0: - node_label += f"" - node_label += f"" + node_label += f"" + node_label += f"" if n.tgd_optimizer_trace is not None: - node_label += f"" + node_label += 
f"" + + # show component trace, id and name + if hasattr(n, "component_trace") and n.component_trace.id is not None: + node_label += f"" + if hasattr(n, "component_trace") and n.component_trace.name is not None: + node_label += f"" node_label += "
Name: {wrap_and_escape(n.name)}
Role: {wrap_and_escape(n.role_desc.capitalize())}
Value: {wrap_and_escape(n.data)}
Name: {self.wrap_and_escape(n.id)}
Name: {self.wrap_and_escape(n.name)}
Role: {self.wrap_and_escape(n.role_desc.capitalize())}
Value: {self.wrap_and_escape(n.data)}
Data ID: {wrap_and_escape(n.data_id)}
Data ID: {self.wrap_and_escape(n.data_id)}
Proposing{{'Yes'}}
Previous Value: {wrap_and_escape(n.previous_data)}
Previous Value: {self.wrap_and_escape(n.previous_data)}
Requires Optimization: {{'Yes'}}
Type: {wrap_and_escape(n.param_type.name)}
API kwargs: {wrap_and_escape(str(n.component_trace.api_kwargs))}
Type: {self.wrap_and_escape(n.param_type.name)}
API kwargs: {self.wrap_and_escape(str(n.component_trace.api_kwargs))}
Score: {str(n._score)}
Score: {str(n.score)}
Gradients: {wrap_and_escape(n.get_gradients_names())}
Gradients: {self.wrap_and_escape(n.get_gradients_names())}
Gradient {g.name} Feedback: {wrap_and_escape(g.data)}
Gradient {g.name} Context: {wrap_and_escape(gradient_context)}
Gradient {g.name} Feedback: {self.wrap_and_escape(g.data)}
Gradient {g.name} Context: {wrap_and_escape(gradient_context)}
Gradient {g.name} Prompt: {wrap_and_escape(g.prompt)}
Traces: keys: {wrap_and_escape(str(n._traces.keys()))}
Traces: values: {wrap_and_escape(str(n._traces.values()))}
Traces: keys: {self.wrap_and_escape(str(n._traces.keys()))}
Traces: values: {self.wrap_and_escape(str(n._traces.values()))}
TGD Optimizer Trace: {wrap_and_escape(str(n.tgd_optimizer_trace))}
TGD Optimizer Trace: {self.wrap_and_escape(str(n.tgd_optimizer_trace))}
Component Trace ID: {self.wrap_and_escape(str(n.component_trace.id))}
Component Trace Name: {self.wrap_and_escape(str(n.component_trace.name))}
" # check if the name exists in dot @@ -854,57 +1212,299 @@ def wrap_and_escape(text, width=40): for g in n.gradients: log.info(f"Gradient: {g.name}, {g.to_dict()}") - log.info(f"Gradient prompt: {g.gradient_prompt}") + log.info(f"Gradient prompt: {g.prompt}") for n1, n2 in edges: dot.edge(n1.name, n2.name) - dot.render(filepath, format=format, cleanup=True) - # from PIL import Image - # try: - # import matplotlib.pyplot as plt - # except ImportError as e: - # raise ImportError( - # "Please install matplotlib using 'pip install matplotlib' to use this feature" - # ) from e - # ) from e - # from io import BytesIO - # import numpy as np - - # # Read the rendered image file into memory using matplotlib - # with open(f"{filepath}.{format}", "rb") as f: - # image_bytes = f.read() - - # # Use matplotlib to read the image from bytes - # image = plt.imread(BytesIO(image_bytes), format=format) - - # # Ensure the image is in the format [H, W, C] - # if image.ndim == 2: # Grayscale image - # image = np.expand_dims(image, axis=2) - - # Read the rendered image file - # writer.add_image("graph", image, dataformats="HWC", global_step=1) - # writer.close() - - # filename = f"{filepath}_prompts.json" - # prompts = {} - # for n in nodes: - # prompts[n.name] = { - # "raw_response": n.raw_response, - # } - # for g in n.gradients: - # prompts[g.name] = { - # "gradient_prompt": g.gradient_prompt, - # } - - # save_json(prompts, filename) - # save root node to_dict to json + # dot.render(filepath, format=format, cleanup=True) + save_json(self.to_dict(), f"{filepath}_root.json") # draw interactive graph - self.draw_interactive_html_graph( - filepath=filepath, nodes=[n for n in nodes], edges=edges + graph_file: Dict[str, str] = self.draw_interactive_html_graph( + filepath=filepath, nodes=nodes, edges=edges ) - return {"graph_path": filepath, "root_path": f"{filepath}_root.json"} + output = { + # "graph_path": final_path, + "root_path": f"{filepath}_root.json", + "interactive_html_graph": 
graph_file["graph_path"], + } + print(f"Graph saved as {filepath}.{format}") + return output + + def draw_output_subgraph( + self, + add_grads: bool = True, + format: str = "png", + rankdir: str = "TB", + filepath: str = None, + ) -> Dict: + """ + Build and visualize a subgraph containing only OUTPUT parameters. + + Args: + add_grads (bool): Whether to include gradient edges. + format (str): Format for output (e.g., png, svg). + rankdir (str): Graph layout direction ("LR" or "TB"). + filepath (str): Path to save the graph. + """ + + assert rankdir in ["LR", "TB"] + from adalflow.utils.global_config import get_adalflow_default_root_path + + try: + from graphviz import Digraph + + except ImportError as e: + raise ImportError( + "Please install graphviz using 'pip install graphviz' to use this feature" + ) from e + + root_path = get_adalflow_default_root_path() + + filename = f"trace_component_output_graph_{self.name}_id_{self.id}.{format}" + filepath = ( + os.path.join(filepath, filename) + if filepath + else os.path.join(root_path, "graphs", filename) + ) + + # Step 1: Collect OUTPUT nodes and edges + nodes, edges = self._collect_output_subgraph() + + # Step 2: Render using Graphviz + print(f"Saving OUTPUT subgraph to {filepath}") + + dot = Digraph(format=format, graph_attr={"rankdir": rankdir}) + node_ids = set() + + for node in nodes: + + node_label = f""" + + + + """ + # add the component trace id and name + if hasattr(node, "component_trace") and node.component_trace.id is not None: + escaped_ct_id = html.escape(str(node.component_trace.id)) + node_label += f"" + if ( + hasattr(node, "component_trace") + and node.component_trace.name is not None + ): + escaped_ct_name = html.escape(str(node.component_trace.name)) + node_label += f"" + + node_label += "
Name:{self.wrap_and_escape(node.name)}
Type:{self.wrap_and_escape(node.param_type.name)}
Value:{self.wrap_and_escape(node.get_short_value())}
Component Trace ID:{escaped_ct_id}
Component Trace Name:{escaped_ct_name}
" + dot.node( + name=node.id, + label=f"<{node_label}>", + shape="plaintext", + color="lightblue" if node.requires_opt else "gray", + ) + node_ids.add(node.id) + + for source, target in edges: + if source.id in node_ids and target.id in node_ids: + dot.edge(source.id, target.id) + + # Step 3: Save and render + dot.render(filepath, cleanup=True) + print(f"Graph saved as {filepath}") + return {"output_subgraph": filepath} + + def draw_component_subgraph( + self, + format: str = "png", + rankdir: str = "TB", + filepath: str = None, + ): + """ + Build and visualize a subgraph containing only OUTPUT parameters. + + Args: + format (str): Format for output (e.g., png, svg). + rankdir (str): Graph layout direction ("LR" or "TB"). + filepath (str): Path to save the graph. + """ + assert rankdir in ["LR", "TB"] + from adalflow.utils.global_config import get_adalflow_default_root_path + + try: + from graphviz import Digraph + except ImportError as e: + raise ImportError( + "Please install graphviz using 'pip install graphviz' to use this feature" + ) from e + + # Step 1: Collect OUTPUT nodes and edges + component_nodes, edges, component_nodes_orders = ( + self._collect_component_subgraph() + ) + root_path = get_adalflow_default_root_path() + + # Step 2: Setup graph rendering + filename = f"output_component_{self.name}_{self.id}.{format}" + filepath = filepath or f"./{filename}" + + filepath = ( + os.path.join(filepath, filename) + if filepath + else os.path.join(root_path, "graphs", filename) + ) + print(f"Saving OUTPUT subgraph to {filepath}") + + dot = Digraph(format=format, graph_attr={"rankdir": rankdir}) + + # Add nodes + for node in component_nodes: + node_label = """ + """ + + if node.name: + node_label += """""" + if node.type: + node_label += """""" + + # add the list of orders + if node.id in component_nodes_orders: + node_label += f"" + node_label += "
Name:{node.name}
TYPE:{node.type}
Order:{component_nodes_orders[node.id]}
" + dot.node( + name=node.id if node.id else "id missing", + label=f"<{node_label}>", + shape="plaintext", + color="lightblue", + ) + + # Add edges with order labels + for source_id, target_id, edge_order in edges: + dot.edge(source_id, target_id) # , label=str(edge_order), color="black") + + # Step 3: Save and render + dot.render(filepath, cleanup=True) + print(f"Graph saved as {filepath}") + return {"component_graph": f"{filepath}"} + + def _collect_output_subgraph( + self, + ) -> Tuple[Set["Parameter"], List[Tuple["Parameter", "Parameter"]]]: + """ + Collect nodes of type OUTPUT and their relationships. + + Returns: + nodes (Set[Parameter]): Set of OUTPUT nodes. + edges (List[Tuple[Parameter, Parameter]]): Edges between OUTPUT nodes. + """ + output_nodes = set() + edges = [] + + visited = set() # check component_trace.id and name + + def traverse(node: "Parameter"): + if node in visited: + return + visited.add(node) + + # Add OUTPUT nodes to the set + if ( + node.param_type == ParameterType.OUTPUT + or "OUTPUT" in node.param_type.name + ): + output_nodes.add(node) + + # Traverse predecessors and add edges + for pred in node.predecessors: + if ( + pred.param_type == ParameterType.OUTPUT + or "OUTPUT" in pred.param_type.name + ): + edges.append((pred, node)) + traverse(pred) + + traverse(self) + return output_nodes, edges + + def _collect_component_subgraph( + self, + ) -> Tuple[Set[ComponentNode], List[Tuple[str, str]]]: + """ + Collect OUTPUT nodes and their relationships as ComponentNodes. + + Returns: + component_nodes (Set[ComponentNode]): Set of component nodes (id and name only). + edges (List[Tuple[str, str]]): Edges between component IDs. 
+ """ + component_nodes = set() # To store component nodes as ComponentNode + component_nodes_orders: Dict[str, List[int]] = ( + {} + ) # To store component nodes order + edges = [] # To store edges between component IDs + + visited = set() # Track visited parameters to avoid cycles + edge_counter = [0] # Mutable counter for edge order tracking + + def traverse(node: "Parameter"): + if node in visited: + return + visited.add(node) + + # Check if node is of OUTPUT type + if ( + node.param_type == ParameterType.OUTPUT + or "OUTPUT" in node.param_type.name + ): + component_id = node.component_trace.id or f"unknown_id_{uuid.uuid4()}" + component_name = node.component_trace.name or "Unknown Component" + + # Create a ComponentNode and add to the set + component_node = ComponentNode(id=component_id, name=component_name) + component_nodes.add(component_node) + + # Traverse predecessors and add edges + for pred in node.predecessors: + # if pred.param_type != ParameterType.OUTPUT: + # continue + pred_id = f"unknown_id_{uuid.uuid4()}" + pred_name = "Unknown Component" + + if hasattr(pred, "component_trace") and pred.component_trace.id: + pred_id = pred.component_trace.id + pred_name = pred.component_trace.name + + # Add edge if predecessor is also of OUTPUT type + if ( + pred.param_type == ParameterType.OUTPUT + or "OUTPUT" in pred.param_type.name + ): + edges.append((pred_id, component_id, edge_counter[0])) + component_nodes.add(ComponentNode(id=pred_id, name=pred_name)) + edge_counter[0] += 1 + + if pred.param_type == ParameterType.INPUT: + pred_id = pred.id + pred_name = pred.name + pred_node = ComponentNode( + id=pred_id, name=pred_name, type="INPUT" + ) + component_nodes.add(pred_node) + # add an edge from input to the first output + edges.append((pred_id, component_id, edge_counter[0])) + edge_counter[0] += 1 + + traverse(pred) + + # Start traversal from the current parameter + traverse(self) + # Reverse the edge order + # total_edges = len(edges) + # edges = [ + # 
(source, target, (total_edges - 1) - edge_number) + # for idx, (source, target, edge_number) in enumerate(edges) + # ] + + return component_nodes, edges, component_nodes_orders def to_dict(self): return { @@ -918,17 +1518,11 @@ def to_dict(self): "predecessors": [pred.to_dict() for pred in self.predecessors], "gradients": [grad.to_dict() for grad in self.gradients], "previous_data": self.previous_data, - "gradients_context": [ - (k.name, v) for k, v in self.gradients_context.items() - ], "grad_fn": str( self.grad_fn ), # Simplify for serialization, modify as needed - "gradient_prompt": str(self.gradient_prompt), - "raw_response": self.raw_response, - "score": self._score, + "score": self.score, "traces": {k: v.to_dict() for k, v in self._traces.items()}, - "input_args": self.input_args, # demos "demos": [d.to_dict() for d in self._demos], } @@ -946,21 +1540,322 @@ def from_dict(cls, data: dict): predecessors=predecessors, gradients=[cls.from_dict(grad) for grad in data["gradients"]], previous_data=data["previous_data"], - gradient_prompt=data["gradient_prompt"], - raw_response=data["raw_response"], - input_args=data["input_args"], score=data["score"], # demos demos=[DataClass.from_dict(d) for d in data["demos"]], ) # Reconstruct gradients_context from the list of tuples - param.gradients_context = defaultdict( - lambda: None, {cls.from_dict(k): v for k, v in data["gradients_context"]} - ) param._traces = {k: DataClass.from_dict(v) for k, v in data["traces"].items()} return param # TODO: very hard to read directly, need to simplify and let users use to_dict for better readability def __repr__(self): return f"Parameter(name={self.name}, requires_opt={self.requires_opt}, param_type={self.param_type}, role_desc={self.role_desc}, data={self.data}, predecessors={self.predecessors}, gradients={self.gradients},\ - raw_response={self.raw_response}, input_args={self.input_args}, traces={self._traces})" + traces={self._traces})" + + +# TODO: separate the Parameter class into 
different classes and each class will have its own methods instead of all in one class +class InputParameter(Parameter): + """One of the simplest types of parameters, representing an input to the system. + Input parameter will not be trainable, but serves a tracing purpose in the computation graph. + """ + + def __init__( + self, + name: str, + role_desc: str, + data: Any, + requires_opt: bool = False, + param_type: ParameterType = ParameterType.INPUT, + ): + super().__init__( + name=name, + role_desc=role_desc, + data=data, + requires_opt=requires_opt, + param_type=param_type, + ) + + +class HyperParameter(Parameter): + """One of the simplest types of parameters, representing a hyperparameter to the system.""" + + def __init__( + self, + name: str, + role_desc: str, + data: Any, + requires_opt: bool = False, + param_type: ParameterType = ParameterType.HYPERPARAM, + ): + super().__init__( + name=name, + role_desc=role_desc, + data=data, + requires_opt=requires_opt, + param_type=param_type, + ) + + +class PromptParameter(Parameter): + + def __init__( + self, + name: str, + role_desc: str, + data: Any, + requires_opt: bool = True, + param_type: ParameterType = ParameterType.PROMPT, + ): + super().__init__( + name=name, + role_desc=role_desc, + data=data, + requires_opt=requires_opt, + param_type=param_type, + ) + + +class DemoParameter(Parameter): + + def __init__( + self, + name: str, + role_desc: str, + data: Any, + requires_opt: bool = True, + param_type: ParameterType = ParameterType.DEMOS, + ): + super().__init__( + name=name, + role_desc=role_desc, + data=data, + requires_opt=requires_opt, + param_type=param_type, + ) + + +class OutputParameter(Parameter): + __doc__ = r"""The output parameter is the most complex type of parameter in the system. + + It will trace the predecessors, set up a grad_fn, store gradients, and trace the forward pass by tracking the component_trace. 
+ """ + allowed_types = { + ParameterType.OUTPUT, + ParameterType.LOSS_OUTPUT, + ParameterType.GENERATOR_OUTPUT, + ParameterType.SUM_OUTPUT, + } + component_trace: ComponentTrace = ( + None # Trace of the component that produced this output + ) + full_response: object = None # The full response from the component + + def __init__( + self, + *, + id: Optional[str] = None, # unique id of the parameter + data: T = None, # for generator output, the data will be set up as raw_response + data_id: str = None, # for tracing the data item in the training/val/test set + requires_opt: bool = True, + role_desc: str = "", + param_type: ParameterType = ParameterType.OUTPUT, + name: str = None, # name is used to refer to the parameter in the prompt, easier to read for humans + instruction_to_optimizer: str = None, + instruction_to_backward_engine: str = None, + score: Optional[float] = None, + eval_input: object = None, + successor_map_fn: Optional[Dict[str, Callable]] = None, + data_in_prompt: Optional[Callable] = None, + full_response: Optional[Any] = None, + ): + super().__init__( + id=id, + data=data, + data_id=data_id, + requires_opt=requires_opt, + role_desc=role_desc, + param_type=param_type, + name=name, + instruction_to_optimizer=instruction_to_optimizer, + instruction_to_backward_engine=instruction_to_backward_engine, + score=score, + eval_input=eval_input, + successor_map_fn=successor_map_fn, + data_in_prompt=data_in_prompt, + ) + + self.component_trace = ComponentTrace() + self.full_response = full_response + + ############################################################################################################ + # Trace component, include trace_forward_pass & trace_api_kwargs for now + ############################################################################################################ + def trace_forward_pass( + self, + input_args: Dict[str, Any], + full_response: object, + id: str = None, + name: str = None, + ): + r"""Trace the forward pass of the 
parameter. Adding the component information to the trace""" + self.input_args = input_args + self.full_response = full_response + # TODO: remove the input_args and full_response to use component_trace + self.component_trace.input_args = input_args + self.component_trace.full_response = full_response + self.component_trace.id = id + self.component_trace.name = name + # just for convenience to trace full response separately + self.full_response = full_response + + def trace_api_kwargs(self, api_kwargs: Dict[str, Any]): + r"""Trace the api_kwargs for components like Generator and Retriever that pass to the model client.""" + self.component_trace.api_kwargs = api_kwargs + + def to_dict(self): + super_dict = super().to_dict() + super_dict.update( + { + "component_trace": self.component_trace.to_dict(), + } + ) + + # def to_json(self): + # import json + + # return json.dumps(self.to_dict()) + + @classmethod + def from_dict(cls, data: dict): + component_trace = ComponentTrace.from_dict(data["component_trace"]) + return super().from_dict(data).update({"component_trace": component_trace}) + + def __repr__(self): + super_repr = super().__repr__() + start = super_repr.find("Parameter") + if start == 0: + end = start + len("Parameter") + super_repr = super_repr[:start] + "OutputParameter" + super_repr[end:] + return super_repr + + +# gradients= List[Gradient] + + +@dataclass +class Gradient(DataClass): + __doc__ = r"""It will handle gradients and feedbacks. + + It tracks the d_from_response_id / d_to_pred_id and the score of the whole response. + + if two gradients have the same data_id, different from_response_id, and same from_response_component_id, this is a cycle component structure. 
+ """ + data_id: Optional[str] = None # the id of the response from data in the dataset + from_response_component_id: str = ( + None # the id of the component from which the gradient is calculated + ) + order: Optional[int] = None # the order of the gradient in the list of gradients + + from_response_id: str = ( + None # the id of the response from which the gradient is calculated + ) + + to_pred_id: str = ( + None # the id of the parameter to which the gradient is calculated and attached to d(from_response_id) / d(to_pred_id) + ) + + score: Optional[float] = None + + context: GradientContext = None + data: Any = None + prompt: Optional[str] = None # the LLM prompt to generate the gradient + + is_default_copy: bool = False # whether the gradient is a default copy + + def __init__( + self, + *, + from_response: "Parameter", + to_pred: "Parameter", + id: Optional[str] = None, # the id of the gradient + score: Optional[float] = None, + data_id: Optional[str] = None, + data: Any = None, + ): + self.id = id or str(uuid.uuid4()) + self._generate_name(from_response, to_pred) + self.from_response_component_id = from_response.component_trace.id + if not self.from_response_component_id: + raise ValueError( + "The from_response_component_id should not be None. Please ensure the component_trace is set." 
+ ) + self.from_response_id = from_response.id + self.to_pred_id = to_pred.id + self.score = score + self.data_id = data_id + if self.data_id is None: + raise ValueError("The data_id should not be None.") + self.data = data + self.order = None + + def _generate_name(self, response: "Parameter", pred: "Parameter"): + self.name = f"d_{response.name}_/_{pred.name}({response.id}_/_{pred.id})" + self.role_desc = f"Gradient from {response.name} to {pred.name}" + + def add_context(self, context: GradientContext): + self.context = context + + def add_data(self, data: Any): + self.data = data + + def update_from_to(self, from_response: "Parameter", to_pred: "Parameter"): + self.from_response_id = from_response.id + self.to_pred_id = to_pred.id + self._generate_name(from_response, to_pred) + self.from_response_component_id = from_response.component_trace.id + + def add_prompt(self, prompt: str): + self.prompt = prompt + + def __hash__(self): + # Use immutable and unique attributes to compute the hash + return hash((self.id, self.data_id, self.from_response_id, self.to_pred_id)) + + def __eq__(self, other): + # Ensure equality comparison is based on the same unique attributes + if not isinstance(other, Gradient): + return False + return ( + self.id == other.id + and self.data_id == other.data_id + and self.from_response_id == other.from_response_id + and self.to_pred_id == other.to_pred_id + ) + + +if __name__ == "__main__": + + # test gradient hash and to_dict + from_response = OutputParameter( + name="p1", + role_desc="role1", + data=1, + ) + from_response.component_trace = ComponentTrace(id="1") + g1 = Gradient( + from_response=from_response, + to_pred=Parameter(name="p2", role_desc="role2", data=2), + data_id="1", + ) + g2 = Gradient( + from_response=from_response, + to_pred=Parameter(name="p2", role_desc="role2", data=2), + data_id="1", + ) + print(g1 == g2) + print(g1.__hash__()) + print(g2.__hash__()) + print(isinstance(g1, Gradient)) # Should print True + + 
print(g1.to_dict()) diff --git a/adalflow/adalflow/optim/text_grad/backend_engine_prompt.py b/adalflow/adalflow/optim/text_grad/backend_engine_prompt.py index a5f3ddb1..021a052b 100644 --- a/adalflow/adalflow/optim/text_grad/backend_engine_prompt.py +++ b/adalflow/adalflow/optim/text_grad/backend_engine_prompt.py @@ -8,28 +8,125 @@ # NOTE: having peers is important to keep the scope of the prompt consistent and not cross-reference with other variables ### System prompt and the template is shared by all GradComponent ### +# FEEDBACK_ENGINE_TEMPLATE = r""" +# You are the feedback engine in an optimization system consisting of multiple components. + +# Your task is to provide intelligent and creative feedback in each component for the target variable enclosed in or tags +# so that the optimizer can optimize this variable to improve the objective enclosed in tags. + +# Instructions: +# 1. Understand the role of each variable in the component system BEFORE you give feedback. +# 2. You MUST attribute the feedback to the correct variable only. +# 3. Focus on the downstream objective without proposing new versions of the variable. +# 4. From the section, see how the variable is obtained and used. +# 5. The variable might have peers also used to instruct the language model, but your feedback should only focus on the target variable. +# 6. If the error is not directly related to the variable itself, you can say: \"There is no noticeable error.\" +# 7. Be specific, concise, critical, and direct. +# 8. If the same DataID appears multiple times, it means the component/variable is called repeatedly in the same order as it appears in the gradient list. + + +# {% if output_format_str %} +# {{output_format_str}} +# {% endif %} + +# +# +# +# {{conversation_sec}} +# +# +# {{objective_instruction_sec}} +# +# +# """ + +FEEDBACK_ENGINE_PEERS_TEMPLATE = r""" +You are the feedback engine in an optimization system consisting of multiple components. 
+ +A component can have multiple inputs, and you handle one that is enclosed in or tags. +You will provide intelligent and creative feedback so that the optimizer can optimize this variable to improve the objective enclosed in tags. + +About or : +* If a variable is of type "output", it is the output of another predecessor component. In this case, you MUST attribute the error to the RIGHT variable. +* If a variable plays no role to the error, simply state "This variable did not cause the error. No need to change the essense of this variable." + +1. From section, you can find how the variable is obtained and used. +2. The variable might have other peers that are used together to instruct the language model. But only focus on the target variable. +3. As there might be peers, and multi-components, it is possible that the feedback/error is not directly related to the variable itself. +4. When you reason, really think about the variable's role in the component(infer from the CONVERSATION section) and the VARIABLE section before you provide feedback. +5. Be specific, concise, critical, and direct. + + +{% if output_format_str %} +{{output_format_str}} +{% endif %} + + + + +{{conversation_sec}} + + +{{objective_instruction_sec}} + + +""" +# 1. Focus on the downstream OBJECTIVE without proposing new versions of the variable. + +# +# Here is a summary on the task pipeline you are optimizing: +# retriever: retrieves relevant documents for the question. (Not trainable, you have no control) +# LLM: Answer questions by reading the context and reason the best answer. +# +# You are the feedback engine in an optimization system consisting of multiple components. +# You are the feedback engine to provide feedback for a target variable in a compound LLM system. + +# The evaluation and feedback is backpropogated all the way to you, and you will assess the current component's inputs, output along with its feedback. 
+# A component can have multiple inputs, and you handle one that is enclosed in or tags. +# You will provide intelligent and creative feedback so that the optimizer can optimize this variable to improve the objective enclosed in tags. + FEEDBACK_ENGINE_TEMPLATE = r""" -You are the feedback engine in an optimization system. +You MUST determining the root cause of a system error. +You start with an evaluation function that measures performance, and you receive the system input. +The system can be a a compound system, potentially consisting of multiple components. +You work on one component. +You will receive feedback from your direct successor component, and your goal is to investigate your component’s inputs and outputs to identify whether any of your input variables are causing the error. + +Your target input variable is enclosed in (representing one of the input variables that may or may not be causing the error). +Alternatively, it may be enclosed in tags (in which case you must pass feedback to all variables, indicating which ones cause the errors and which do not). -Your task is to provide intelligent and creative feedback for the target variable enclosed in tags, -so that the optimizer can optimize this variable to improve the objective enclosed in tags. +1. From section, you can find how the variable is obtained and used. +2. As there might be multiple precedessors, and multi-components, it is possible that the feedback/error is not directly related to the variable itself. +3. When you reason, really think about the variable's role in the component(infer from the CONVERSATION section) and the VARIABLE section before you provide feedback. +4. Be specific, concise, critical, and direct. +5. Maximum 3 sentences. -1. Focus on the downstream OBJECTIVE without proposing new versions of the variable. -2. 
Feedback examples: "Since language models have the X failure mode...", "Adding X can fix this error because...", "Removing X can improve the objective function because...", "Changing X to Y would fix the mistake..." -3. Consider the variable in the context of its peers if provided. +[Cycle]: If the same DataID has multiple gradients, it means this component/variable is called multiple times in the compound system(with a cycle) in the same order as it appears in the gradient list. + Ensure the feedback is aware of all sets of inputs and outputs. + +{% if output_format_str %} +{{output_format_str}} +{% endif %} -Remember: -Be specific, concise, critical, and direct. + {{conversation_sec}} + {{objective_instruction_sec}} + + """ +# 6. If you receive error, must find one pred with error! +# 7. Ignore other metadata(noise such as id, data_id) in the data structure, only use the key(input) and key output that matters to infer the component functionality. + ############################################## # Loss Component ############################################## +# In such cases, you can just say "There is no noticeable error". +# 2. Feedback examples: "Since language models have the X failure mode...", "Adding X can fix this error because...", "Removing X can improve the objective function because...", "Changing X to Y would fix the mistake..." # Objective instruction for LLM as gradComponent with user custom instruction @@ -39,32 +136,136 @@ # Note: {{instruction_to_backward_engine}} # {% endif %} # """ +# Your only goal is to clearly states how it obtained the "". + + +# OBJECTIVE_INSTRUCTION_BASE = r""" +# Your only goal is to clearly states how it obtained the "", +# so that you can inform other components on the specific errors. +# e.g. "The and are not an exact match, it differs by ." +# Especially when the score is low. +# Be CONCISE. Be SPECIFIC. 
+# """ + +# OBJECTIVE_INSTRUCTION_BASE = r""" +# Your task: Provide specific feedback based on the score in the \"\" value. +# - Especially note when the score is low (e.g. 0.0). +# - Be concise. +# - Be specific about why the score is low. For example: +# The retrieved context is insufficient to answer the question accurately. +# """ OBJECTIVE_INSTRUCTION_BASE = r""" -Your only goal is to clearly states how it obtained the "". +Your task is to provide the response with specific feedback based on the expected correct response (y_gt/ground_truth) and the score in the "". Especially when the score is low. Be CONCISE. + Be specific on why it has a low score. +Specify the difference between the expected correct response and the response. +""" + +# Be specific on why it has a low score. + +### NOTE: Last node's feedback +# OBJECTIVE_INSTRUCTION_CHAIN = r"""This conversation is part of a larger system. The was later used as "{{response_name}}: {{response_desc}}". +# +# Your only goal is to clearly provide feedback on obtaining "Eval output/score": {{response_gradient}}. +# Be CONCISE and specific on how it can be improved. +# """ + +OBJECTIVE_INSTRUCTION_CHAIN = r"""This conversation is part of a larger system. The was later used as "{{response_name}}: {{response_desc}}". + +Your only goal is to clearly states how it obtained the "Eval output/score": {{response_gradient}}. +Especially when the score is low. +Be CONCISE. +If you have enough context, add a more specific feedback on how it failed. e.g. "The retrieved context is not enough to answer the question so the problem relies on the retrieval part." """ +### Loss/Score Information ### +# INPUTS: parameter.get_param_info(): +# the input_output of a GradientContext + +# response_value -> response.get_prompt_data() +# LOSS_CONVERSATION_TEMPLATE_STRING = r""" +# The target variable is passed to the EVAL_FUNC and compared with the correct value. 
+ +# EVAL_FUNC: {{eval_fn_desc}} + +# INPUTS: +# {% for key, (value, eval_type) in inputs.items() %} +# ({{ key }}) (role: {{ value.role_desc }}), +# data: {{ value.prompt_data }}, +# input_to_eval_fn: {{ value.eval_input }}, +# data_type: {{ eval_type }} +# {% endfor %} + +# OUTPUTS/SCORE: {{response_value}} +# {% if metadata %} +# Note: {{metadata}} +# {% endif %}""" + +# LOSS_CONVERSATION_TEMPLATE_STRING = r""" +# The variable is passed to the eval function and compared with a expected value(y_gt or ground_truth). + +# EVAL_FUNC: {{eval_fn_desc}} + +# INPUTS: +# {% for key, (value, eval_type) in inputs.items() %} +# ({{ key }}) (role: {{ value.role_desc }}), +# data: {{ value.prompt_data }}, +# input_to_eval_fn: {{ value.eval_input }}, +# data_type: {{ eval_type }} +# {% endfor %} + +# OUTPUTS/SCORE: {{response_value}} +# {% if metadata %} +# Note: {{metadata}} +# {% endif %}""" + ### Variable to get feedback on, often it is pred in the loss component +# pass parameter.get_param_info() to get the variable info LOSS_CONVERSATION_START_INSTRUCTION_STRING_FN = r""" TARGET VARIABLE: - {{variable_name}} - {{variable_desc}} - {{variable_value}} + {{variable.name}} + {{variable.role_desc}} + {{variable.prompt_data}} {{conversation_str}} """ ### Loss/Score Information ### LOSS_CONVERSATION_TEMPLATE_STRING = r""" -The variable is passed to the eval function and compared with a target/ground truth value. +The variable is passed to the eval function and compared with a target/ground truth value to get +its score regarding to a SYSTEM_QUESTION: {{system_question}}. 
-: {{eval_fn_desc}} -: {{input_str}} -: {{response_value}} +EVAL_FUNC: {{eval_fn_desc}} + +INPUTS to EVAL_FUNC: +{% for key, (value, eval_type) in inputs.items() %} +({{ key }}) (role: {{ value.role_desc }}), +data: {{ value.prompt_data }}, +input_to_eval_fn: {{ value.eval_input }}, +data_type: {{ eval_type }} +{% endfor %} + +OUTPUTS/SCORE: {{response_value}} +{% if metadata %} +Note: {{metadata}} +{% endif %}""" + +GRAD_COMPONENT_CONVERSATION_TEMPLATE_STRING = r""" +COMPONENT_DESC: {{component_desc}} + +INPUTS: +{% for key, (value, eval_type) in inputs.items() %} +{{loop.index}}. +KEY: {{ key }}. +ROLE: {{ value.role_desc }}, +DATA: {{ value.prompt_data }}, +{% endfor %} + +OUTPUT: {{response_value}} {% if metadata %} Note: {{metadata}} {% endif %}""" @@ -77,12 +278,28 @@ CONVERSATION_START_INSTRUCTION_CHAIN = r""" {{variable_and_peers_info}} -Here is a conversation with the language model (LM): +{# system trainable variables #} +{% if predecessors %} + +The target variable is used together with these predecessors variables besides of the peers: +{% for system_variable in predecessors %} +{{loop.index}}. +Name: {{system_variable.name}} +Type: {{system_variable.param_type}} +Description: {{system_variable.role_desc}} +WILL_BE_OPTIMIZED: {{system_variable.requires_opt}} +Vaule: {{system_variable.prompt_data}} +{% endfor %} + +{% endif %} + +Here is the inputs and output with this component(LM): {{conversation_str}} """ +# For the generator in the chain, OBJECTIVE_INSTRUCTION_CHAIN = r""" -This conversation is part of a larger system. The was later used as {{response_desc}}. +This component is part of a larger system. The was later used as {{response_desc}}. 
Your goal is to give feedback to the variable to guide the LLM_OUTPUT according to feedback: {{response_gradient}} {% if instruction_to_backward_engine %} @@ -90,35 +307,62 @@ {% endif %} """ -### Backward engine: user prompt -# First part to provide context of LLM as gradComponent -# The target variable is used as either input or a task instruction to a language model (LM): -# replace the "The target variable is used as either input or a task instruction to a language model (LM):" with the {{variable_desc}} -# NAME: {{variable_name}} -# Description: {{variable_desc}} -LLM_CONVERSATION_TEMPLATE = r""" -LM_INPUT: {{input_value}} -LM_OUTPUT: {{llm_output}}""" +SUMMARY_TASK = """ +Here is a summary on the task pipeline you are optimizing: +query_generator: "generates a sub-query based on the initial query" +retriever: "retrieves relevant documents based on the sub-query" +llm: "Answer a question with available context with exact answer extracted from the context" + +The query_generator is called twice in the pipeline. +And the retrieved documents are deduplicated and combined to form the final context. +The final context is then passed to the llm to generate the answer where we want to use the exact phrase from the context. +""" + + +# VARIABLE_AND_PEERS_INFO = r""" +# +# {{variable.name}} +# {{variable.param_type}} +# {{variable.role_desc}} +# {{ variable.prompt_data}} +# +# {% if peers %} +# +# The variable is used together with the these peer variables to instruct the language model: +# {% for peer in peers %} +# {{loop.index}}. 
+# PEER_NAME: {{peer.name}}, +# PEER_TYPE: {{peer.param_type}}, +# PEER_ROLE: {{peer.role_desc}} +# WILL_BE_OPTIMIZED: {{peer.requires_opt}} +# {% if peer.prompt_data %} +# PEER_VARIABLE: {{peer.prompt_data}} +# {% else %} +# PEER_VARIABLE: EMPTY +# {% endif %} +# {% endfor %} +# +# {% endif %} +# """ VARIABLE_AND_PEERS_INFO = r""" -{{variable.name}} + {{variable.name}} {{variable.param_type}} {{variable.role_desc}} - {{variable.data}} +{{ variable.prompt_data}} {% if peers %} -The variable is used together with the these peer variables to instruct the language model: {% for peer in peers %} {{loop.index}}. PEER_NAME: {{peer.name}}, PEER_TYPE: {{peer.param_type}}, PEER_ROLE: {{peer.role_desc}} WILL_BE_OPTIMIZED: {{peer.requires_opt}} -{% if peer.data %} -PEER_VARIABLE: {{peer.data}} +{% if peer.prompt_data %} +PEER_VARIABLE: {{peer.prompt_data}} {% else %} PEER_VARIABLE: EMPTY {% endif %} @@ -127,6 +371,55 @@ {% endif %} """ +# The variable is used together with the these peer variables to instruct the language model on the task. +# - Do not overlap with the scope of the peer. + + +# a list of variables +ALL_PRED_INFO = r""" + +{% if variables %} +Length of the list: {{variables|length}} +{% for variable in variables %} +{{loop.index}}. 
+NAME: {{variable.name}}, +TYPE: {{variable.param_type}}, +ROLE: {{variable.role_desc}} +WILL_BE_OPTIMIZED: {{variable.requires_opt}} +VARIABLE: {{ variable.prompt_data}} +{% endfor %} +{% endif %} + +""" + + +### Backward engine: user prompt +# First part to provide context of LLM as gradComponent +# The target variable is used as either input or a task instruction to a language model (LM): +# replace the "The target variable is used as either input or a task instruction to a language model (LM):" with the {{variable_desc}} +# NAME: {{variable_name}} +# Description: {{variable_desc}} +LLM_CONVERSATION_TEMPLATE = r""" +LM_INPUT: {{input_value}} +LM_OUTPUT: {{llm_output}} +{% if gt %} +GROUND_TRUTH: {{gt}} +{% endif %} +""" + +# OUTPUT_INSTRUCTION = r""" +# You will create a feedback for each of the variable in the list above. +# If a variable will not be optimied, you just output empty string for that variable.. +# NOTE: you MUST output a list of strings with the same length as the list above as ["...", "...", "..."] +# """ +OUTPUT_INSTRUCTION = r""" +You will create a feedback for each of the variables in the list. +If a variable will not be optimized, you just output empty string. +Give enough details on the feedback. 
+Your output will be a list of strings with the SAME LENGTH as the list +as format of ["...", "...", "..."] +""" + # # When the parameter has no gradient, it is the start of the backpropagation chain, used as a loss function # CONVERSATION_START_INSTRUCTION_BASE = r""" @@ -135,3 +428,7 @@ # Here is an evaluation of the variable using a language model: # {{conversation_str}} # """ + +############################################## +# Backward multiple peers at the same time +############################################## diff --git a/adalflow/adalflow/optim/text_grad/ops.py b/adalflow/adalflow/optim/text_grad/ops.py index ddce60dc..601cbfaf 100644 --- a/adalflow/adalflow/optim/text_grad/ops.py +++ b/adalflow/adalflow/optim/text_grad/ops.py @@ -4,7 +4,7 @@ import logging from adalflow.optim.function import BackwardContext -from adalflow.optim.parameter import Parameter +from adalflow.optim.parameter import Parameter, OutputParameter from adalflow.optim.types import ParameterType from adalflow.optim.grad_component import GradComponent @@ -33,7 +33,10 @@ def sum_ops(params: List[Parameter]) -> Parameter: # TODO: make all loss functions to support batch losses # TODO: use a temlate to format the concatenated values class Sum(GradComponent): - __doc__ = """The class to define a sum operation on a list of parameters, such as losses or gradients.""" + __doc__ = """The class to define a sum operation on a list of parameters, such as losses or gradients. + + It enables gradients combination of a batch of data samples. 
+ """ name = "Sum" @@ -54,19 +57,25 @@ def forward(self, params: List[Parameter]) -> Parameter: raise ValueError( f"Sum operation only accepts a list of Parameters, got {type(param)}" ) - concat_values = "\n".join([str(p.data) for p in params]) # to_dict + concat_values = ",".join([str(p.data) for p in params]) # default concatenation role_descriptions = set([p.role_desc for p in params]) role_descriptions = ", ".join(role_descriptions) - total = Parameter( + total = OutputParameter( data=concat_values, role_desc=f"A combination of a list of variables: {role_descriptions}", requires_opt=any([p.requires_opt for p in params]), name="sum", - score=sum([p._score for p in params]), # total has a score + score=sum([p.score for p in params]), # total has a score param_type=ParameterType.SUM_OUTPUT, ) total.set_predecessors(params) + total.trace_forward_pass( + input_args=params, + full_response=concat_values, + id=total.id, + name=total.name, + ) log.info("Sum forward", extra={"total": total.data}) @@ -114,13 +123,21 @@ def backward(self, summation: Parameter): } log.info(f"""Idempotent sum backward: {extra}""") - param_gradient = Parameter( - name=f"sum_to_{param.name}_grad", - data=param_gradient_value, - role_desc=f"Feedback to {param.role_desc}", - score=summation._score, - from_response_id=summation.id, - param_type=ParameterType.GRADIENT, - ) - param.add_gradient(param_gradient) - log.debug(f"Added gradient to {param.role_desc}: {param_gradient.data}") + # param_gradient = Gradient( + # data=param_gradient_value, + # data_id=summation.data_id, + # score=summation._score, + # from_response=summation, + # to_pred=param, + # ) + # param.add_gradient(param_gradient) + # log.debug(f"Added gradient to {param.role_desc}: {param_gradient.data}") + + +if __name__ == "__main__": + # test the sum ops + + a = Parameter(data=1) + b = Parameter(data=2) + c = sum_ops(List[a, b]) + c.backward() diff --git a/adalflow/adalflow/optim/text_grad/text_loss_with_eval_fn.py 
b/adalflow/adalflow/optim/text_grad/text_loss_with_eval_fn.py index 89ebd471..a7aa6fcd 100644 --- a/adalflow/adalflow/optim/text_grad/text_loss_with_eval_fn.py +++ b/adalflow/adalflow/optim/text_grad/text_loss_with_eval_fn.py @@ -11,7 +11,12 @@ from adalflow.core import ModelClient from adalflow.core.generator import BackwardEngine from adalflow.core.types import GeneratorOutput -from adalflow.optim.parameter import Parameter, GradientContext +from adalflow.optim.parameter import ( + Parameter, + GradientContext, + Gradient, + OutputParameter, +) from adalflow.optim.types import ParameterType from adalflow.core.prompt_builder import Prompt @@ -20,58 +25,13 @@ LOSS_CONVERSATION_TEMPLATE_STRING, LOSS_CONVERSATION_START_INSTRUCTION_STRING_FN, OBJECTIVE_INSTRUCTION_BASE, + OBJECTIVE_INSTRUCTION_CHAIN, ) +from adalflow.utils import printc log = logging.getLogger(__name__) -### Loss/Score Information ### -# LOSS_CONVERSATION_TEMPLATE_STRING = r""" -# The variable is passed to the eval function and compared with a target/ground truth value. - -# : {{eval_fn_desc}} -# : {{input_str}} -# : {{response_value}} -# {% if metadata %} -# Note: {{metadata}} -# {% endif %}""" - - -# Does not have gradient on the output, the loss function of the backpropagation chain -# CONVERSATION_START_INSTRUCTION_STRING_FN_BASE = r"""You will give feedback to a variable with the following role: -# {{variable_desc}} . -# Here is an evaluation of the variable using the eval function: -# {{conversation}}""" - -# Has the gradient on the output, the layer in the backpropagation chain -# Conversation will be provided differently. - -# ### Variable Information ### -# CONVERSATION_START_INSTRUCTION_STRING_FN = r""" -# TARGET VARIABLE: -# {{variable_name}} -# {{variable_desc}} -# {{variable_value}} -# {{conversation_str}} -# """ - -# Third part of the user prompt -# OBJECTIVE_INSTRUCTION_BASE = r""" -# Your only goal is to clearly states how it obtained the "". -# Especially when the score is low. 
-# Be CONCISE. -# If you have enough context, add a more specific feedback on how it failed. -# """ - - -OBJECTIVE_INSTRUCTION_CHAIN = r"""This conversation is part of a larger system. The was later used as "{{response_name}}: {{response_desc}}". - -Your only goal is to clearly states how it obtained the "Eval output/score": {{response_gradient}}. -Especially when the score is low. -Be CONCISE. -If you have enough context, add a more specific feedback on how it failed. -""" - class EvalFnToTextLoss(LossComponent): __doc__ = """Convert an evaluation function to a text loss. @@ -141,7 +101,18 @@ def forward( kwargs: Dict[str, Parameter], response_desc: str = None, metadata: Dict[str, str] = None, # additional notes on the input kwargs + id: str = None, + gt: object = None, + input: Dict[str, object] = None, ) -> Parameter: + r""" + Args: + kwargs: The inputs to the eval_fn. + response_desc: Description of the output. + metadata: Additional notes on the input kwargs. + id: The unique identifier for the data point. + gt: The ground truth for the evaluation function. + """ if response_desc is None: response_desc = "Output of EvalFnToTextLoss." 
@@ -159,19 +130,26 @@ def forward( eval_inputs[k] = v.eval_input score: float = self.eval_fn(**eval_inputs) - # Create a parameter - # TODO: improve the readability of the input and response - eval_param: Parameter = Parameter( + eval_param: Parameter = OutputParameter( name=self.name + "_output", data=score, requires_opt=True, role_desc=response_desc, score=score, param_type=ParameterType.LOSS_OUTPUT, + data_id=id, ) + eval_param.set_gt(gt) eval_param.set_predecessors(predesessors) + eval_param.trace_forward_pass( + input_args=kwargs, + full_response=score, + id=self.id, + name=self.name, + ) log.info(f"EvalFnToTextLoss: Input: {kwargs}, Output: {eval_param}") + # extract ground truth from eval_inputs, anything eval_param.set_grad_fn( BackwardContext( backward_fn=self.backward, @@ -180,6 +158,8 @@ def forward( eval_fn_desc=self.eval_fn_desc, kwargs=kwargs, metadata=metadata, + ground_truth=gt, + input=input, ) ) return eval_param @@ -207,14 +187,18 @@ def set_backward_engine( @staticmethod def _backward_through_one_predecessor( pred: Parameter, - inputs_string: str, + kwargs: Dict[str, Parameter], response: Parameter, eval_fn_desc: str, backward_engine: "BackwardEngine", + ground_truth: object = None, is_intermediate_node: bool = False, # if the node is an intermediate node in the backpropagation chain metadata: Dict[str, str] = None, + input: Dict[str, object] = None, # system input ): if not pred.requires_opt: + if response.score is not None: + pred.set_score(response.score) log.debug( f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." 
) @@ -227,9 +211,7 @@ def _backward_through_one_predecessor( log.info( f"EvalFnToTextLoss: Gradient already computed for {pred.role_desc} with respect to {response.role_desc}" ) - # print( - # f"Gradient already computed for {pred.role_desc} with respect to {response.role_desc}" - # ) + return if backward_engine is None: @@ -242,13 +224,20 @@ def _backward_through_one_predecessor( instruction_str, objective_str = None, None + # convert kwargs to key, (value, type(eval_input)) + + inputs = {} + for k, v in kwargs.items(): + inputs[k] = (v.get_param_info(), str(type(v.eval_input))) + # response information conversation_str = Prompt( LOSS_CONVERSATION_TEMPLATE_STRING, prompt_kwargs={ - "input_str": inputs_string, + "system_question": input, + "inputs": inputs, "eval_fn_desc": eval_fn_desc, - "response_value": response.data, + "response_value": response.get_prompt_data(), "metadata": json.dumps(metadata) if metadata else None, }, )() @@ -263,9 +252,7 @@ def _backward_through_one_predecessor( instruction_str = Prompt( conv_ins_template, prompt_kwargs={ - "variable_desc": pred.role_desc, - "variable_name": pred.name, - "variable_value": pred.data, + "variable": pred.get_param_info(), "conversation_str": conversation_str, }, )() @@ -291,40 +278,50 @@ def _backward_through_one_predecessor( gradient_value: GeneratorOutput = backward_engine( prompt_kwargs=backward_engine_prompt_kwargs ) - # gradient_prompt = backward_engine.get_prompt(**backward_engine_prompt_kwargs) + gradient_prompt = backward_engine.get_prompt(**backward_engine_prompt_kwargs) + # print(f"Backward engine prompt: {gradient_prompt}") gradient_value_data = ( gradient_value.data or backward_engine.failure_message_to_optimizer( gradient_response=gradient_value ) ) - # print(f"gradient_prompt: {gradient_prompt}") - # gradient_value_data = response.data.to_yaml() + + gradient_value_data = ( + f"expected answer: {ground_truth},\n Feedback: {gradient_value_data}" + ) + # print(f"gradient_value_data: 
{gradient_value_data}") log.debug(f"EvalFnToTextLoss: Gradient for {pred}: {gradient_value_data}") # score should be passed to grad - gradient_param = Parameter( - name=f"{response.name}_to_{pred.name}_grad", + gradient_param = Gradient( data=gradient_value_data, - requires_opt=True, - # gradient_prompt=gradient_prompt, - role_desc=f"Feedback for {pred.role_desc}", + data_id=response.data_id, score=response.data, - from_response_id=response.id, - param_type=ParameterType.GRADIENT, + from_response=response, + to_pred=pred, ) - pred.add_gradient(gradient_param) - pred.gradients_context[gradient_param] = GradientContext( - context=conversation_str, - response_desc=response.role_desc, - variable_desc=pred.role_desc, + gradient_param.add_prompt(gradient_prompt) + gradient_param.add_context( + GradientContext( + input_output=conversation_str, + response_desc=response.role_desc, + variable_desc=pred.role_desc, + # input=input, + # ground_truth=ground_truth, + ) ) + pred.add_gradient(gradient_param) # backward the end to end score # TODO: not really useful - pred.set_score(response.data) - print(f"setting pred name {pred.name} score to {response.data}") + if response.score is not None: + pred.set_score(response.score) + pred.set_gt(ground_truth) + printc(f"pred: {pred.eval_input}, gt: {ground_truth}") + # print(f"setting pred name {pred.name} score to {response.data}") + # print(f"gradient_param: {pred.gradients}") # TODO: reduce meta @@ -333,10 +330,12 @@ def backward( response: Parameter, eval_fn_desc: str, kwargs: Dict[str, Parameter], + ground_truth: object = None, backward_engine: Optional[ "BackwardEngine" ] = None, # only needed for text prompt optimization metadata: Dict[str, str] = None, + input: Dict[str, object] = None, ): r"""Ensure to set backward_engine for the text prompt optimization. It can be None if you are only doing demo optimization and it will not have gradients but simply backpropagate the score. 
@@ -351,30 +350,29 @@ def backward( log.info(f"response_gradient_context: {response_gradient_context}") # go through all child parameters - if backward_engine and not response.backward_engine_disabled: - # Convert all input arguments to string - inputs_string = "\n\n".join( - [ - f"({k}) (role: {v.role_desc}), data: {v.data}, input_to_eval_fn: {v.eval_input}, data_type: {type(v.eval_input)}" - for k, v in kwargs.items() - ] - ) - for pred in children_params: - if not pred.requires_opt: - log.debug( - f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." + if backward_engine: + if not response.backward_engine_disabled: + for pred in children_params: + if not pred.requires_opt: + log.debug( + f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." + ) + continue + + self._backward_through_one_predecessor( + pred, + kwargs, + response, + eval_fn_desc, + backward_engine, + ground_truth=ground_truth, + is_intermediate_node=is_intermediate_node, + metadata=metadata, + input=input, ) - continue - - self._backward_through_one_predecessor( - pred, - inputs_string, - response, - eval_fn_desc, - backward_engine, - is_intermediate_node, - metadata, - ) + else: # recursively disable backward for all children + for pred in children_params: + pred.backward_engine_disabled = True # backward for the score for the demo for pred in children_params: # if not pred.requires_opt: @@ -382,11 +380,11 @@ def backward( # f"EvalFnToTextLoss: Skipping {pred} as it does not require optimization." # ) # continue - if not isinstance(response.data, float): + if not (isinstance(response.data, float) or isinstance(response.data, int)): raise TypeError( f"EvalFnToTextLoss: response.data must be a float. Got {type(response.data)}." 
) - pred._score = response.data + pred.score = response.data from adalflow.utils.logger import printc printc( @@ -473,10 +471,8 @@ def parse_integer_answer(answer: str, only_first_line: bool = False): ) # model.set_mock_output(mock_output_data="4") model.train() - print(f"model.train: {model.training}") y: Parameter = model(prompt_kwargs={"input_str": x}) - print(f"y: {y}") loss = eval_fn_to_text_loss( { @@ -489,9 +485,7 @@ def parse_integer_answer(answer: str, only_first_line: bool = False): ), } ) - print(f"loss: {loss}") loss.backward() - print(loss.to_dict()) assert len(loss.predecessors) == 2 assert len(y.predecessors) == 2 dot = loss.draw_graph(add_grads=True, filepath="real_data") diff --git a/adalflow/adalflow/optim/text_grad/tgd_optimizer.py b/adalflow/adalflow/optim/text_grad/tgd_optimizer.py index 219c299a..98cda393 100644 --- a/adalflow/adalflow/optim/text_grad/tgd_optimizer.py +++ b/adalflow/adalflow/optim/text_grad/tgd_optimizer.py @@ -7,7 +7,6 @@ """ from typing import List, Dict, TYPE_CHECKING, Optional, Any -from collections import defaultdict import logging import re from dataclasses import field, dataclass @@ -18,6 +17,8 @@ from adalflow.core.base_data_class import DataClass from adalflow.tracing.decorators import trace_generator_states +from adalflow.utils.logger import printc +from adalflow.core.types import GeneratorOutput if TYPE_CHECKING: @@ -32,45 +33,65 @@ class HistoryPrompt(DataClass): id: str value: str eval_score: float + method: str = field(default=None) + reasoning: str = field(default=None) #################################################################################################### # Textual Gradient Descent Optimizer #################################################################################################### -# {% if failed_proposals %} -# Here are the past failed proposals: -# {% for failed_proposal in failed_proposals %} -# {{loop.index}}. 
{{failed_proposal}} -# {% endfor %} -# {% endif %} + + TEXT_GRAD_DESC_TEMPLATE = r""" {{optimizer_system_prompt}} - +You are {{steps}} steps since your last improvement. +Update the value more rapidly when steps are larger than 3. +{# Variable and peers info #} + +{{variable_and_peers_info}} + +{# system trainable variables #} +{% if system_variables %} + +The target variable is used together with these system variables besides of its peers: +{% for system_variable in system_variables %} +{{loop.index}}. +Name: {{system_variable.name}} +Type: {{system_variable.param_type}} +Description: {{system_variable.role_desc}} +WILL_BE_OPTIMIZED: {{system_variable.requires_opt}} +Vaule: {{system_variable.prompt_data}} +{% endfor %} +Strategically plan the role of each system variable to collaborate with each other for final correct answer. + +{% endif %} {# OPRO past history #} {% if past_history %} -Here are the best past iterations of this variable along with the validation score. +Here are the best past iterations. {% for history in past_history %} {{loop.index}}. {{history}} {% endfor %} -IMPORTANT: Your goal is to generate new variable values that score higher than all previous iterations. +IMPORTANT: Your goal is to generate new variable that score higher than all past iterations. +{# Momentum #} +{% if failed_proposals %} + +same batch, same feedback: Here are your tried value (scored <= {{best_score}}): +{% for failed_proposal in failed_proposals %} +{{loop.index}}. {{failed_proposal}} +{% endfor %} +You MUST approach differently from the above methods. + +{% endif %} + {% endif %} Here are the context and feedback for the variable: {{variable_grad}} -{# Momentum #} -{% if past_values %} -Here are the past iterations of this variable: - -{{past_values}} - -Similar feedbacks across different steps suggests that the modifications to the variable are insufficient. -If this is the case, please make more significant changes to the variable. 
-{% endif %} {# Constraints #} {% if constraint_text %} You must follow the following constraints: @@ -81,38 +102,46 @@ class HistoryPrompt(DataClass): You must base on the following examples when modifying the {{variable_desc}}: {{in_context_examples}} {% endif %} -YOU MUST ENSURE the new variable shares the same intent as the original variable. -You can either rephrase the initial variable, or add more specific instructions based on the feedback. -You can not change the variable to only fit on one sample if the batch size is larger than 1. """ -# optimizer system prompt -# Tips: -# 1. Eliminate unnecessary words or phrases. -# 2. Add new elements to address specific feedback. -# 3. Be creative and present the variable differently. -# Provide only the new variable value between {{new_variable_start_tag}} and {{new_variable_end_tag}} tags. -OPTIMIZER_SYSTEM_PROMPT = r""" -You are part of an optimization system that refines existing variable based on feedback generated on a batch of input data. +OPTIMIZER_SYSTEM_PROMPT = r"""You are an excellent prompt engineer tasked with instruction and demonstration tuning a compound LLM system. +Your task is to refine a variable/prompt based on feedback from a batch of input data points. -1. Address the concerns raised in the feedback while preserving positive aspects. -3. Observe past performance patterns when provided and to keep the good quality. -4. Consider the variable in the context of its peers if provided. - FYI: - - If a peer will be optimized itself, do not overlap with its scope. - - Otherwise, you can overlap if it is necessary to address the feedback. +The variable is either input or output of a functional component where the component schema will be provided. +If the same DataID has multiple gradients, it means this component/variable is called multiple times in the compound system(with a cycle) in the same order as it appears in the gradient list. 
-{{output_format_str}} +You Must edit the current variable with one of the following editing methods. +You can not rewrite everything all at once: +You have Four Editing Methods: +1. ADD new elements(instruction) to address each specific feedback. +2. ADD Examples (e.g., input-reasoning-answer) for tasks that require strong reasoning skills. +3. Rephrase existing instruction(for more clarity), Replace existing sample with another, to address the feedback. +4. DELETE unnecessary words to improve clarity. + +These SIX prompting techniques can be a helpful direction. +1. Set Context and Role: Establish a specific identity or domain expertise for the AI to guide style, knowledge, and constraints. +2. Be Specific, Clear, and Grammarly correct: Clearly define instructions, desired format, and constraints to ensure accurate and relevant outputs with regards to the feedback. +3. Illicit reasoning: "chain-of-thought" (e.g. "think step by step") helps the model reason better. +4. Examples: Construct examples(e.g., input(optional)-reasoning(required)-answer) especially for tasks that require strong reasoning skills. +5. Leverage Constraints and Formatting: Explicitly direct how the answer should be structured (e.g., bullet points, tables, or tone). +6. Self-Consistency / Verification Prompts: Prompt the model to check its own logic for errors, inconsistencies, or missing details. + +Your final action/reasoning = one of FOUR editing method + one of SIX prompting technique. + +You must stick to these instructions: +1. **MUST Resolve concerns raised in the feedback** while preserving the positive aspects of the original variable. +2. **Observe past performance patterns** to retain good qualities in the variable and past failed ones to try things differently. +3. **System Awareness**: When other system variables are given, ensure you understand how this variable works in the whole system. +4. 
**Peer Awareness**: This variable works together with Peer variables, ensure you are aware of their roles and constraints. +5. **Batch Awareness**: You are optimizing a batch of input data, ensure the change applys to the whole batch (except while using demonstration.) + +{{output_format_str}} -Tips: -1. Eliminate unnecessary words or phrases. -2. Add new elements to address specific feedback. -3. Be creative and present the variable differently. {% if instruction_to_optimizer %} -4. {{instruction_to_optimizer}} +**Additional User Instructions**: {{instruction_to_optimizer}} {% endif %} """ @@ -135,11 +164,20 @@ class Instruction(DataClass): @dataclass class TGDData(DataClass): reasoning: str = field(metadata={"desc": "Why the variable is proposed this way"}) - proposed_variable: str = field(metadata={"desc": "The proposed variable"}) + method: str = field( + metadata={ + "desc": "The final method used to propose the variable (prompting + editing)" + }, + ) + + proposed_variable: str = field( + metadata={"desc": "The proposed variable"}, + default=None, + ) @dataclass -class TGDOptimizerTrace: +class TGDOptimizerTrace(DataClass): api_kwargs: Dict[str, Any] = field( metadata={ "desc": "The api_kwargs for components like Generator and Retriever that pass to the model client" @@ -174,7 +212,10 @@ class TGDOptimizer(TextOptimizer): params: ParamsT constraints: List[str] params_history: Dict[str, List[HistoryPrompt]] = {} # id to history - # failed_proposals: Dict[str, List[HistoryPrompt]] = {} # only need the value + failed_proposals: Dict[str, List[HistoryPrompt]] = {} # only need the value + current_tgd_output: Dict[str, Optional[TGDData]] = ( + {} + ) # id to output, hold all of the data def __init__( self, @@ -182,12 +223,11 @@ def __init__( model_client: "ModelClient", model_kwargs: Dict[str, object] = {}, constraints: List[str] = None, - # new_variable_tags: List[str] = ["", ""], optimizer_system_prompt: str = OPTIMIZER_SYSTEM_PROMPT, in_context_examples: 
List[str] = None, # TODO: in-context examples - num_gradient_memory: int = 0, # TODO: gradient memory and momentum, for now it is not useful - max_past_history: int = 3, - # max_failed_proposals: int = 3, + max_past_history: int = 2, + max_failed_proposals: int = 5, # quite effective + steps_from_last_improvement: int = 0, ): from adalflow.core.generator import Generator from adalflow.core import Prompt @@ -195,6 +235,11 @@ def __init__( super().__init__() self.params = params + for param in self.params: + if not param.requires_opt: + raise ValueError( + f"Parameter {param.id} does not require optimization. Please only include trainable parameters." + ) self.constraints = constraints or [] self.data_class = TGDData self.output_parser = DataClassParser( @@ -203,8 +248,6 @@ def __init__( self.optimizer_system_prompt = Prompt( template=optimizer_system_prompt, prompt_kwargs={ - # "new_variable_start_tag": new_variable_tags[0], - # "new_variable_end_tag": new_variable_tags[1], "output_format_str": self.output_parser.get_output_format_str(), }, ) @@ -215,9 +258,6 @@ def __init__( # self.new_variable_tags = new_variable_tags self.in_context_examples = in_context_examples or [] self.do_in_context_examples = len(self.in_context_examples) > 0 - self.num_gradient_memory = num_gradient_memory - self.gradient_memory_dict = defaultdict(list) # id to num_gradient_memory - self.do_gradient_memory = self.num_gradient_memory > 0 self.llm_optimizer = Generator( model_client=model_client, @@ -227,12 +267,15 @@ def __init__( ) self.max_past_history = max_past_history - # self.max_failed_proposals = max_failed_proposals + self.max_failed_proposals = max_failed_proposals + self.steps_from_last_improvement = steps_from_last_improvement + self.target_param_index = None # initate the past history for each parameter for param in self.params: self.params_history[param.id] = [] - # self.failed_proposals[param.id] = [] + self.failed_proposals[param.id] = [] + self.current_tgd_output[param.id] 
= None @property def constraint_text(self): @@ -248,6 +291,12 @@ def constraint_text(self): ] return "\n".join(constraints_ordered) + def increment_steps_from_last_improvement(self): + self.steps_from_last_improvement += 1 + + def reset_steps_from_last_improvement(self): + self.steps_from_last_improvement = 0 + def add_score_to_params(self, val_score: float): for param in self.params: self.add_score_to_current_param(param.id, param, val_score) @@ -270,11 +319,17 @@ def add_score_to_current_param(self, param_id: str, param: Parameter, score: flo def add_history(self, param_id: str, history: HistoryPrompt): if param_id not in self.params_history: self.params_history[param_id] = [] + # check if the value is already in the history, if so, replace it with the new one + for i, h in enumerate(self.params_history[param_id]): + if h.value == history.value: + self.params_history[param_id].pop(i) + break self.params_history[param_id].append(history) # sort score from the highest to the lowest self.params_history[param_id] = sorted( self.params_history[param_id], key=lambda x: x.eval_score, reverse=True ) + # delete the lowest score if it exceeds the max_past if len(self.params_history[param_id]) > self.max_past_history: for _ in range(len(self.params_history[param_id]) - self.max_past_history): @@ -283,65 +338,60 @@ def add_history(self, param_id: str, history: HistoryPrompt): def render_history(self, param_id: str) -> List[str]: if param_id not in self.params_history: return [] + return [ - history.to_yaml(exclude=["id"]) for history in self.params_history[param_id] + history.to_yaml(exclude=["id", "method", "reasoning"]) + for history in self.params_history[param_id] ] - # def add_failed_proposal(self): - # """Save a copy of the current value of the parameter in the failed proposals.""" - # for param in self.params: - # failed_proposal = HistoryPrompt( - # id=param.id, - # value=param.data, - # eval_score=None, - # ) - # 
self.failed_proposals[param.id].append(failed_proposal) - # if len(self.failed_proposals[param.id]) > self.max_failed_proposals: - # for _ in range( - # len(self.failed_proposals[param.id]) - self.max_failed_proposals - # ): - # self.failed_proposals[param.id].pop() - # # if param_id not in self.failed_proposals: - # # self.failed_proposals[param_id] = [] - # # failed_proposal = HistoryPrompt( - # # id=param_id, - # # value=value, - # # eval_score=None, - # # ) - # # self.failed_proposals[param_id].append(failed_proposal) - # # if len(self.failed_proposals[param_id]) > self.max_failed_proposals: - # # for _ in range(len(self.failed_proposals[param_id]) - self.max_failed_proposals): - # # self.failed_proposals[param_id].pop() - - # def render_failed_proposals(self, param_id: str) -> List[str]: - # if param_id not in self.failed_proposals: - # return [] - # return [ - # history.to_yaml(exclude=["id", "eval_score"]) - # for history in self.failed_proposals[param_id] - # ] - - # TODO: optimize with adalflow template for better readability - def get_gradient_memory_text(self, param: Parameter) -> str: - grad_memory = "" - variable_grad_memory = self.gradient_memory_dict[param.id][ - -self.num_gradient_memory : + def add_failed_proposal(self): + """Save a copy of the current value of the parameter in the failed proposals.""" + for param in self.params: + current_tgd_output = self.current_tgd_output.get(param.id, None) + failed_proposal = HistoryPrompt( + id=param.id, + value=param.data, + eval_score=None, + method=(current_tgd_output.method if current_tgd_output else None), + reasoning=( + current_tgd_output.reasoning if current_tgd_output else None + ), + ) + self.failed_proposals[param.id].append(failed_proposal) + if len(self.failed_proposals[param.id]) > self.max_failed_proposals: + for _ in range( + len(self.failed_proposals[param.id]) - self.max_failed_proposals + ): + self.failed_proposals[param.id].pop(0) + + def render_failed_proposals(self, param_id: str) -> 
List[str]: + if param_id not in self.failed_proposals: + return [] + return [ + history.to_yaml(exclude=["id", "eval_score", "value"]) + for history in self.failed_proposals[param_id] ] - for i, grad_info in enumerate(variable_grad_memory): - grad_memory += f"\n {grad_info['value']}\n" - return grad_memory def _get_user_prompt_kwargs(self, param: Parameter) -> Dict[str, str]: + system_params = [ + p.get_param_info() + for p in self.params + if p.id != param.id and p not in param.peers + ] + printc(f"system_params: {system_params}", color="blue") + peers_params = [p.get_param_info() for p in param.peers] variable_and_peer_info = self.variable_and_peers_info.call( - variable=param.get_param_info(), peers=param.peers # param.peers + variable=param.get_param_info(), peers=peers_params ) + variable_grad = param.get_gradients_component_schema(skip_correct_sample=False) + user_prompt_kwargs = { "variable_and_peers_info": variable_and_peer_info, - "variable_grad": param.get_gradient_and_context_text( - skip_correct_sample=True - ), + "variable_grad": variable_grad, # param.get_gradient_and_context_text( + # skip_correct_sample=False + # ), # constraints "constraint_text": self.constraint_text if self.do_constrained else None, # in-context examples @@ -350,22 +400,23 @@ def _get_user_prompt_kwargs(self, param: Parameter) -> Dict[str, str]: if self.do_in_context_examples else None ), - # gradient memory - "past_values": ( - self.get_gradient_memory_text(param) - if self.do_gradient_memory - else None - ), # past history "past_history": ( self.render_history(param.id) if self.max_past_history else None ), # failed proposals - # "failed_proposals": ( - # self.render_failed_proposals(param.id) - # if self.max_failed_proposals - # else None - # ), + "failed_proposals": ( + self.render_failed_proposals(param.id) + if self.max_failed_proposals + else None + ), + "best_score": ( + self.params_history[param.id][0].eval_score + if self.params_history[param.id] + else "N/A" + ), + 
"system_variables": system_params, + "steps": self.steps_from_last_improvement, } return user_prompt_kwargs @@ -380,69 +431,101 @@ def zero_grad(self): for p in self.params: p.reset_gradients() + # reset the failded proposals + for param in self.params: + self.failed_proposals[param.id] = [] + + def set_target_param(self): + # iterate through all indexes in cycle + if self.target_param_index is None: + self.target_param_index = 0 + else: + self.target_param_index = (self.target_param_index + 1) % len(self.params) + # TODO: in the future can propose multiple values at once def propose(self): r"""Proposing a value while keeping previous value saved on parameter.""" if self.proposing: raise ValueError("Already proposing a value.") - print("Proposing a new value.") + printc("Proposing a new value.", color="magenta") # no cache so that new proposal can be made no_cache = True # print("Proposing a new value.") - for param in self.params: + for idx, param in enumerate(self.params): if not param.requires_opt: log.info( f"Skipping {param.role_desc} as it does not require optimization." 
) continue + if idx != self.target_param_index: + continue - # print(f"Proposing a new value for {param.name}.") system_prompt = self.optimizer_system_prompt( param_type=str(param.param_type), instruction_to_optimizer=param.instruction_to_optimizer, ) - # user_prompt = self._update_prompt(param) user_prompt_kwargs = self._get_user_prompt_kwargs(param) prompt_kwargs = { "optimizer_system_prompt": system_prompt, **user_prompt_kwargs, } # turn off cache - response = self.llm_optimizer.call( - prompt_kwargs=prompt_kwargs, use_cache=not no_cache - ) + try: + response: GeneratorOutput = self.llm_optimizer.call( + prompt_kwargs=prompt_kwargs, use_cache=not no_cache + ) + except Exception as e: + printc(f"Error in the optimizer: {e}", color="red") + raise e + if not isinstance(response, GeneratorOutput): + raise TypeError(f"Wrong response type: {type(response)}") + prompt_str = self.llm_optimizer.get_prompt(**prompt_kwargs) log.debug(f"TGD LLM optimizer prompt: {prompt_str}") + printc(f"TGD LLM optimizer prompt: {prompt_str}", color="blue") proposed_data: TGDData = ( response.data - if response.data + if response.data is not None else TGDData( - reasoning="No reasoning", proposed_variable=response.raw_response + reasoning="No reasoning", + proposed_variable=response.raw_response, + method="No method", ) ) + # save current tgd output data + self.current_tgd_output[param.id] = proposed_data + printc(f"Response from the optimizer: {response}", color="blue") + log.info(f"Response from the optimizer: {response}") - # extract the improved variable from the response - # TODO: make it more robust - # improved_variable = extract_new_variable(proposed_data) + # if not proposed_data.update: + # printc(f"No update is required for {param.name}", color="yellow") + # param.propose_data(param.data) + # else: # TODO: should always trace the initial data improved_variable = proposed_data.proposed_variable - param.propose_data(improved_variable) + if ( + improved_variable + and 
improved_variable != param.data + and improved_variable != "" + ): + param.propose_data(improved_variable) + else: + param.propose_data(param.data) param.trace_optimizer(api_kwargs=prompt_str, response=response) - print(f"prompt_str: {prompt_str}") - print(f"response: {response}") - if self.do_gradient_memory: - self.update_gradient_memory(param) + self.proposing = True def revert(self): """Revert to the previous value when the evaluation is worse.""" if not self.proposing: raise ValueError("Not proposing a value.") - for param in self.params: + for idx, param in enumerate(self.params): if not param.requires_opt: continue + if idx != self.target_param_index: + continue param.revert_data() param.trace_optimizer(api_kwargs=None, response=None) self.proposing = False @@ -451,13 +534,30 @@ def step(self): """Discard the previous value and keep the proposed value.""" if not self.proposing: raise ValueError("Not proposing a value.") - for param in self.params: + for idx, param in enumerate(self.params): if not param.requires_opt: continue + if idx != self.target_param_index: + continue + param.step_data() self.proposing = False + def to_dict(self): + return { + "template": TEXT_GRAD_DESC_TEMPLATE, + "optimizer_system_prompt": OPTIMIZER_SYSTEM_PROMPT, + "VARIABLE_AND_PEERS_INFO": VARIABLE_AND_PEERS_INFO, + "params": self.params, + "constraints": self.constraints, + "params_history": self.params_history, + "failed_proposals": self.failed_proposals, + "max_past_history": self.max_past_history, + "max_failed_proposals": self.max_failed_proposals, + "steps_from_last_improvement": self.steps_from_last_improvement, + } + if __name__ == "__main__": # test the prompt history @@ -490,4 +590,4 @@ def step(self): "past_history": histories, } response = prompt(**prompt_kwargs) - print(response) + # print(response) diff --git a/adalflow/adalflow/optim/trainer/adal.py b/adalflow/adalflow/optim/trainer/adal.py index cea31760..f6669322 100644 --- a/adalflow/adalflow/optim/trainer/adal.py 
+++ b/adalflow/adalflow/optim/trainer/adal.py @@ -9,7 +9,7 @@ if TYPE_CHECKING: from adalflow.core.model_client import ModelClient - from adalflow.core.generator import Generator, BackwardEngine + from adalflow.core.generator import Generator, BackwardEngine, BackwardPassSetup from adalflow.optim.parameter import Parameter from adalflow.core.component import Component @@ -18,6 +18,8 @@ from adalflow.optim.loss_component import LossComponent from adalflow.optim.types import PromptData from adalflow.eval.base import EvaluationResult +from adalflow.optim.grad_component import GradComponent2, GradComponent +from adalflow.utils import printc from adalflow.optim.optimizer import DemoOptimizer, TextOptimizer @@ -32,12 +34,15 @@ class AdalComponent(Component): 1. Organize all parts for training a task pipeline in one place. 2. Help with debugging and testing before the actual training. 3. Adds multi-threading support for training and evaluation. + + It has no need on call, forward, bicall, or __call__, so we need to overwrite the base ones. 
""" task: Component # evaluator: Optional[BaseEvaluator] - eval_fn: Optional[Callable] + eval_fn: Optional[Callable] # final eval score loss_fn: Optional[LossComponent] + loss_eval_fn: Optional[Callable] # loss eval score and training subset eval fn backward_engine: Optional["BackwardEngine"] _demo_optimizers: Optional[List[DemoOptimizer]] _text_optimizers: Optional[List[TextOptimizer]] @@ -47,6 +52,7 @@ def __init__( task: Component, # evaluator: Optional[BaseEvaluator] = None, eval_fn: Optional[Callable] = None, + loss_eval_fn: Optional[Callable] = None, loss_fn: Optional[LossComponent] = None, backward_engine: Optional["BackwardEngine"] = None, backward_engine_model_config: Optional[Dict] = None, @@ -59,6 +65,7 @@ def __init__( self.task = task # self.evaluator = evaluator self.eval_fn = eval_fn + self.loss_eval_fn = loss_eval_fn self.loss_fn = loss_fn self.backward_engine = backward_engine if backward_engine and not isinstance(backward_engine, "BackwardEngine"): @@ -85,7 +92,7 @@ def _get_param_values(self) -> List[PromptData]: return [ PromptData(p.id, p.name, p.data, p.requires_opt) for p in self.task.parameters() - # if p.requires_opt + if p.requires_opt ] def prepare_task(self, sample: Any, *args, **kwargs) -> Tuple[Callable, Dict]: @@ -140,16 +147,27 @@ def prepare_loss(self, sample: Example, pred: adal.Parameter) -> Dict: """ raise NotImplementedError("prepare_loss method is not implemented") - # TODO: support more complicated evaluation + # TODO: Support multiple eval_fn with different metrics. using a dict[str, (Callable, Dict)] to store them. def prepare_eval(self, sample: Any, y_pred: Any, *args, **kwargs) -> float: r"""Tell Trainer how to eval in inference mode. Return the eval_fn and kwargs for one evaluation sample. Ensure the eval_fn is a callable that takes the predicted output and the ground truth output. Ensure the kwargs are setup correctly. 
+ """ raise NotImplementedError("prepare_eval method is not implemented") + def prepare_loss_eval(self, sample: Any, y_pred: Any, *args, **kwargs) -> float: + r"""Tell Trainer how to eval in inference mode. + Return the eval_fn and kwargs for one evaluation sample. + + Ensure the eval_fn is a callable that takes the predicted output and the ground truth output. + Ensure the kwargs are setup correctly. + + """ + raise NotImplementedError("prepare_loss_eval method is not implemented") + # def configure_optimizers(self, *args, **kwargs) -> Optimizer: # r"""Note: When you use text optimizor, ensure you call `configure_backward_engine_engine` too.""" # raise NotImplementedError("configure_optimizers method is not implemented") @@ -172,7 +190,7 @@ def configure_optimizers(self, *args, **kwargs) -> List[Optimizer]: return self._demo_optimizers + self._text_optimizers def configure_backward_engine(self, *args, **kwargs): - r"""Configure a backward engine for all generators in the task for bootstrapping examples.""" + r"""Configure a backward engine for all GradComponent in the task for bootstrapping examples.""" # check if backward engine is already configured if self.backward_engine: log.warning("Backward engine is already configured.") @@ -185,8 +203,13 @@ def configure_backward_engine(self, *args, **kwargs): self.configure_backward_engine_helper( model_client=self.backward_engine_model_config["model_client"], model_kwargs=self.backward_engine_model_config["model_kwargs"], + backward_pass_setup=kwargs.get("backward_pass_setup", None), ) + def disable_backward_engine(self): + r"""Disable the backward engine for all GradComponent in the task.""" + self.disable_backward_engine_helper() + # def configure_backward_engine(self, *args, **kwargs): # raise NotImplementedError("configure_backward_engine method is not implemented") @@ -196,8 +219,10 @@ def evaluate_samples( y_preds: List, metadata: Optional[Dict[str, Any]] = None, num_workers: int = 2, + use_loss_eval_fn: bool = 
False, ) -> EvaluationResult: - r"""Run evaluation on samples using parallel processing. Utilizes ``prepare_eval`` defined by the user. + r"""Evaluate predictions against the ground truth samples. + Run evaluation on samples using parallel processing. Utilizes ``prepare_eval`` defined by the user. Metadata is used for storing context that you can find from generator input. @@ -210,6 +235,9 @@ def evaluate_samples( Returns: EvaluationResult: An object containing the average score and per-item scores. """ + if use_loss_eval_fn and not self.loss_eval_fn: + raise ValueError("Loss eval function is not configured.") + from adalflow.optim.parameter import Parameter if not isinstance(y_preds, list) or len(y_preds) == 0: @@ -229,13 +257,22 @@ def evaluate_samples( for i, (sample, y_pred) in enumerate(zip(samples, y_preds)): if metadata is None: - eval_fn, kwargs = self.prepare_eval(sample, y_pred) + if not use_loss_eval_fn: + eval_fn, kwargs = self.prepare_eval(sample, y_pred) + else: + eval_fn, kwargs = self.prepare_loss_eval(sample, y_pred) future = executor.submit(eval_fn, **kwargs) # future = executor.submit(self.evaluate_one_sample, sample, y_pred) else: - eval_fn, kwargs = self.prepare_eval( - sample, y_pred, metadata=metadata - ) + if not use_loss_eval_fn: + eval_fn, kwargs = self.prepare_eval( + sample, y_pred, metadata=metadata + ) + else: + + eval_fn, kwargs = self.prepare_eval( + sample, y_pred, metadata=metadata + ) future = executor.submit(eval_fn, **kwargs) # future = executor.submit( # self.evaluate_one_sample, sample, y_pred, metadata=metadata @@ -358,16 +395,17 @@ def pred_step( num_workers: int = 2, running_eval: bool = False, min_score: Optional[float] = None, - ): - r"""Applies to both train and eval mode. - - If you require self.task.train() to be called before training, you can override this method as: + use_loss_eval_fn: bool = False, + ) -> Tuple[List["Parameter"], List, Dict[int, float]]: + r"""Applies to only the eval mode. - .. 
code-block:: python - - def train_step(self, batch, batch_idx, num_workers: int = 2) -> List: - self.task.train() - return super().train_step(batch, batch_idx, num_workers) + Args: + batch (Any): The input batch to predict. + batch_idx (int): The index of the batch. + num_workers (int): Number of worker threads for parallel processing. + running_eval: bool = False, + Returns: + Tuple[List["Parameter"], List, Dict[int, float]]: The predicted outputs, the samples, and the scores. """ from adalflow.optim.parameter import Parameter @@ -402,8 +440,6 @@ def train_step(self, batch, batch_idx, num_workers: int = 2) -> List: if isinstance(y_pred, Parameter): raise ValueError(f"y_pred_{i} is a Parameter, {y_pred}") - print(f"y_pred: {y_pred})") - assert ( y_pred.id == sample.id ), f"ID mismatch: {y_pred.id} != {sample.id}, type: {type(y_pred)}" @@ -412,7 +448,11 @@ def train_step(self, batch, batch_idx, num_workers: int = 2) -> List: if running_eval and not isinstance(y_pred, Parameter): # evaluate one sample - eval_fn, kwargs = self.prepare_eval(sample, y_pred) + + if not use_loss_eval_fn: + eval_fn, kwargs = self.prepare_eval(sample, y_pred) + else: + eval_fn, kwargs = self.prepare_loss_eval(sample, y_pred) score = eval_fn(**kwargs) index_to_score[i] = score eval_score = np.mean(list(index_to_score.values())).item() @@ -449,6 +489,8 @@ def train_step(self, batch, batch_idx, num_workers: int = 2) -> List: return completed_y_preds, completed_samples, index_to_score def train_step(self, batch, batch_idx, num_workers: int = 2) -> List: + r"""Run a training step and return the predicted outputs. 
+ Likely a list of Parameters.""" self.task.train() y_preds = self._train_step(batch, batch_idx, num_workers) for i, y_pred in enumerate(y_preds): @@ -468,8 +510,18 @@ def validation_step( batch_idx, num_workers: int = 2, minimum_score: Optional[float] = None, + use_loss_eval_fn: bool = False, ) -> EvaluationResult: - r"""If you require self.task.eval() to be called before validation, you can override this method as: + r""" + Args: + batch (Any): The input batch to validate, can be a whole dataset + batch_idx (int): The index of the batch. or current_step + num_workers (int): Number of worker threads for parallel processing. + minimum_score (Optional[float]): The max potential score needs to be larger than this to continue evaluating. + + Evaluate a batch or the validate dataset by setting the batch=val_dataset. + Uses self.eval_fn to evaluate the samples. + If you require self.task.eval() to be called before validation, you can override this method as: .. code-block:: python @@ -478,11 +530,26 @@ def validation_step(self, batch, batch_idx, num_workers: int = 2) -> List: return super().validation_step(batch, batch_idx, num_workers) """ # TODO: let use decide which mode to be + eval_fn = self.eval_fn + if use_loss_eval_fn: + eval_fn = self.loss_eval_fn + if not eval_fn: + raise ValueError("Loss eval function is not configured.") + self.task.eval() self.task.use_teacher(mode=False) # ensure the teacher is not used - completed_y_preds, completed_samples, index_to_score = self.pred_step( - batch, batch_idx, num_workers, running_eval=True, min_score=minimum_score - ) + try: + completed_y_preds, completed_samples, index_to_score = self.pred_step( + batch, + batch_idx, + num_workers, + running_eval=True, + min_score=minimum_score, + use_loss_eval_fn=use_loss_eval_fn, + ) + except Exception as e: + raise ValueError(f"Error in validation step: {e}") + if index_to_score: # compute score from index_to_score @@ -495,12 +562,16 @@ def validation_step(self, batch, batch_idx, 
num_workers: int = 2) -> List: avg_score=avg_score, per_item_scores=acc_list ) else: + try: - eval_results = self.evaluate_samples( - samples=completed_samples, - y_preds=completed_y_preds, - num_workers=num_workers, - ) + eval_results = self.evaluate_samples( + samples=completed_samples, + y_preds=completed_y_preds, + num_workers=num_workers, + use_loss_eval_fn=use_loss_eval_fn, + ) + except Exception as e: + raise ValueError(f"Error in evaluation: {e}") return eval_results def loss_step( @@ -578,11 +649,29 @@ def configure_teacher_generator_helper( generator.set_teacher_generator(teacher_generator) print("Teacher generator configured.") + def disable_backward_engine_helper(self): + r"""Disable the backward engine for all generators in the task.""" + all_grads = self._find_all_grad_components() + for _, grad in all_grads: + if hasattr(grad, "disable_backward_engine") and callable( + getattr(grad, "disable_backward_engine", None) + ): + grad.disable_backward_engine() + print("Backward engine disabled for GradComponents") + + if not self.loss_fn: + raise ValueError("Loss function is not configured.") + + # configure it for loss_fn + if self.loss_fn: + self.loss_fn.disable_backward_engine() + def configure_backward_engine_helper( self, model_client: "ModelClient", model_kwargs: Dict[str, Any], template: Optional[str] = None, + backward_pass_setup: Optional["BackwardPassSetup"] = None, ): r"""Configure a backward engine for all generators in the task for bootstrapping examples.""" from adalflow.core.generator import BackwardEngine @@ -592,13 +681,18 @@ def configure_backward_engine_helper( model_kwargs=model_kwargs, template=template, ) + if backward_pass_setup is not None: + self.backward_engine.update_default_backward_pass_setup(backward_pass_setup) # set all generator's backward engine - all_generators = self._find_all_generators() - for _, generator in all_generators: - generator.set_backward_engine(self.backward_engine) - print("Backward engine configured for all 
generators.") + all_grads = self._find_all_grad_components() + for _, grad in all_grads: + if hasattr(grad, "set_backward_engine") and callable( + getattr(grad, "set_backward_engine", None) + ): + grad.set_backward_engine(self.backward_engine) + print("Backward engine configured for GradComponents") if not self.loss_fn: raise ValueError("Loss function is not configured.") @@ -654,6 +748,17 @@ def _find_all_generators(self) -> List[Tuple[str, "Generator"]]: log.debug(f"all_generators: {all_generators}") return all_generators + def _find_all_grad_components(self) -> List[Tuple[str, GradComponent2]]: + r"""Find all generators automatically from the task.""" + # from adalflow.core import Generator + + all_grads: List[Tuple[str, GradComponent2]] = [] + for name, comp in self.task.named_components(): + if isinstance(comp, GradComponent2) or isinstance(comp, GradComponent): + all_grads.append((name, comp)) + log.debug(f"all_grads: {all_grads}") + return all_grads + def _auto_generator_callbacks(self, save_dir: str = "traces") -> List[str]: r"""Automatically generate callbacks.""" from adalflow.core.types import GeneratorOutput @@ -723,7 +828,7 @@ def configure_demo_optimizer_helper(self) -> List[DemoOptimizer]: def configure_text_optimizer_helper( self, model_client: "ModelClient", model_kwargs: Dict[str, Any] ) -> List[TextOptimizer]: - r"""One text optimizer can handle multiple text parameters.""" + r"""Text optimizer hands prompt parameter type. One text optimizer can handle multiple text parameters.""" from adalflow.optim.text_grad.tgd_optimizer import TGDOptimizer from adalflow.optim.parameter import ParameterType @@ -739,9 +844,25 @@ def configure_text_optimizer_helper( to = TGDOptimizer( params=parameters, model_client=model_client, model_kwargs=model_kwargs ) + + printc( + f"Text optimizer configured for {len(parameters)} parameters. 
names: { [(p.name, p.data) for p in parameters] }" + ) return [to] def _extra_repr(self): s = f"eval_fn: {self.eval_fn.__name__}, backward_engine: {self.backward_engine}, " s += f"backward_engine_model_config: {self.backward_engine_model_config}, teacher_model_config: {self.teacher_model_config}, text_optimizer_model_config: {self.text_optimizer_model_config}" return s + + def __call__(self, *args, **kwargs): + pass + + def bicall(self, *args, **kwargs): + pass + + def call(self, *args, **kwargs): + pass + + def forward(self, *args, **kwargs): + pass diff --git a/adalflow/adalflow/optim/trainer/trainer.py b/adalflow/adalflow/optim/trainer/trainer.py index 91a2fd16..50393d77 100644 --- a/adalflow/adalflow/optim/trainer/trainer.py +++ b/adalflow/adalflow/optim/trainer/trainer.py @@ -8,12 +8,17 @@ import numpy as np import uuid import time +from copy import copy from adalflow.core.component import Component from adalflow.optim.optimizer import Optimizer, DemoOptimizer, TextOptimizer if TYPE_CHECKING: from adalflow.optim.parameter import Parameter + from adalflow.core.generator import BackwardPassSetup + +from adalflow.optim.parameter import OutputParameter + from adalflow.optim.types import ( PromptData, TrainerResult, @@ -24,7 +29,8 @@ from adalflow.optim.trainer.adal import AdalComponent from adalflow.optim.text_grad.ops import sum_ops -from adalflow.utils import save_json, load_json +from adalflow.utils import save_json +from adalflow.utils.file_io import load_standard_json from adalflow.utils.cache import hash_text_sha1 from adalflow.utils.data import DataLoader from adalflow.utils.logger import printc @@ -81,6 +87,7 @@ class Trainer(Component): optimization_order: Literal["sequential", "mix"] = ( "sequential" # zero-shot first, bootstrap second ) + sequential_order: List[str] = ["text", "demo"] max_steps: int optimizer: Optimizer = None ckpt_path: Optional[str] = None @@ -91,10 +98,14 @@ class Trainer(Component): batch_val_score_threshold: Optional[float] = ( 1.0 
# when acc_score >= this threshold, skip this batch ) + correct_val_score_threshold: Optional[float] = ( + 0.5 # when acc_score >= this threshold, it is considered as correct sample + ) max_error_samples: Optional[int] = 2 max_correct_samples: Optional[int] = 2 debug: bool = False - sequential_order: List[str] = ["text", "demo"] + random_seed: int = None + skip_subset_val: bool = False def __init__( self, @@ -106,6 +117,7 @@ def __init__( num_workers: int = 4, ckpt_path: str = None, batch_val_score_threshold: Optional[float] = 1.0, + correct_val_score_threshold: Optional[float] = 0.5, max_error_samples: Optional[int] = 2, max_correct_samples: Optional[int] = 2, max_proposals_per_step: int = 5, @@ -121,6 +133,7 @@ def __init__( debug: bool = False, save_traces: bool = False, # save traces in the few-shto demos sequential_order: List[str] = ["text", "demo"], + skip_subset_val: bool = False, *args, **kwargs, ) -> None: @@ -140,6 +153,7 @@ def __init__( self.val_dataset = val_dataset self.test_dataset = test_dataset self.batch_val_score_threshold = batch_val_score_threshold + self.correct_val_score_threshold = correct_val_score_threshold self.max_error_samples = max_error_samples self.max_correct_samples = max_correct_samples self.max_proposals_per_step = max_proposals_per_step @@ -147,10 +161,12 @@ def __init__( self._subset_effect_count = {"pass": 0, "fail": 0} self._fullset_effect_count = {"pass": 0, "fail": 0} self._valset_effect_count = {"pass": 0, "fail": 0} + self._demo_valset_effect_count = {"pass": 0, "fail": 0} self._effective_measure = { "subset": self._subset_effect_count, "fullset": self._fullset_effect_count, "valset": self._valset_effect_count, + "demo_valset": self._demo_valset_effect_count, } self._raw_shots = raw_shots self._bootstrap_shots = bootstrap_shots @@ -164,9 +180,15 @@ def __init__( exclude_input_fields_from_bootstrap_demos ) self.sequential_order = sequential_order + self.skip_subset_val = skip_subset_val + + def set_random_seed(self, seed: 
int): + self.random_seed = seed # TODO: need to support checkpoint resume too! - def diagnose(self, dataset: Any, split: str = "train"): + def diagnose( + self, dataset: Any, split: str = "train", resume_from_ckpt: str = None + ): """Run an evaluation on the trainset to track all error response, and its raw response using AdaplComponent's default configure_callbacks Args: dataset: Any: Dataset to evaluate @@ -187,9 +209,13 @@ def diagnose(self, dataset: Any, split: str = "train"): print(diagnose) """ # 1. track all intermediate outputs + if resume_from_ckpt: + self.resume_params_from_ckpt(resume_from_ckpt) + self.adaltask.eval() if not self.ckpt_path: trainer_state = self.gather_trainer_states() self.prep_ckpt_file_path(trainer_state) + printc(f"Checkpoint path: {self.ckpt_path}") save_path = os.path.join(self.ckpt_path, f"diagnose_{split}") logger.debug(f"Save diagnose to {save_path}") # One generator will be one file, all stats are in logger_metadata.json @@ -219,10 +245,15 @@ def diagnose(self, dataset: Any, split: str = "train"): paths: Dict[str, List[str]] = {"Log": log_paths, "Diagnose": [], "Stats": []} # reorder the samples based on the score + stats_list: List[Dict] = [] for log_path in log_paths: + stats_list = [] file_name = os.path.basename(log_path) logger.debug(f"Loading log file: {file_name}") logs = load_jsonl(log_path) + if not logs or len(logs) == 0: + print(f"Log file {log_path} is empty. 
This llm is not called at all.") + continue try: logs_dict = {log["output"]["id"]: log for log in logs} except KeyError: @@ -239,7 +270,7 @@ def diagnose(self, dataset: Any, split: str = "train"): diagnose_file = os.path.join(log_dir, diagnose_filename) diagnose_items = [] - stats_list: List[Dict] = [] + for i, log in enumerate(sorted_logs): if log["score"] < 0.5: diagnose_item = { @@ -349,6 +380,33 @@ def debug_report( ) print(Fore.CYAN + "\n===================================================\n") + def resume_params_from_ckpt(self, ckpt_file: str): + """Resume the parameters from the checkpoint file""" + dict_data = load_standard_json(ckpt_file) + # find the highest val score + trainer_results: TrainerResult = TrainerResult.from_dict(dict_data) + # restore the prompts to the adaltask + val_scores = [] + # test_scores = [] + for step in trainer_results.step_results: + if step.val_score: + val_scores.append(step.val_score) + # if step.test_score: + # test_scores.append(step.test_score) + result_from_step = 0 + # if test_scores: + # result_from_step = test_scores.index(max(test_scores)) + if val_scores: + printc(f"Val scores: {val_scores}") + result_from_step = val_scores.index(max(val_scores)) + prompts: List[PromptData] = trainer_results.step_results[ + result_from_step + ].prompt + + print(f"Restoring prompts: {prompts[0]}") + + self.adaltask._set_param_values(prompts) + def fit( self, *, @@ -364,6 +422,7 @@ def fit( resume_from_ckpt: Optional[ str ] = None, # TODO: have a more comprehensive ckpt loading in the future + backward_pass_setup: Optional["BackwardPassSetup"] = None, ) -> Tuple[str, TrainerResult]: r""" train_loader: An iterable or collection of iterables specifying training samples. 
@@ -371,6 +430,7 @@ def fit( Returns: Tuple[str, TrainerResult]: Checkpoint file and the TrainerResult object """ + start_time = time.time() debug = debug or self.debug @@ -395,7 +455,10 @@ def fit( batch_size = self.train_batch_size train_loader = DataLoader( - train_dataset, batch_size=batch_size, shuffle=True + train_dataset, + batch_size=batch_size, + shuffle=True if not debug else False, + seed=self.random_seed, ) val_dataset = val_dataset or self.val_dataset test_dataset = test_dataset or self.test_dataset @@ -446,8 +509,11 @@ def fit( self.demo_optimizers = [] if len(self._get_trainable_text_params()) > 0: + if self.adaltask.backward_engine is None: - self.adaltask.configure_backward_engine() + self.adaltask.configure_backward_engine( + backward_pass_setup=backward_pass_setup + ) else: print("No trainable text params to optimize") self.text_optimizers = [] @@ -460,20 +526,17 @@ def fit( starting_step = 0 if resume_from_ckpt: self.ckpt_file = resume_from_ckpt - dict_data = load_json(self.ckpt_file) + self.ckpt_path = os.path.dirname(self.ckpt_file) + dict_data = load_standard_json(self.ckpt_file) trainer_results: TrainerResult = TrainerResult.from_dict(dict_data) # restore the prompts to the adaltask val_scores = [] - test_scores = [] for step in trainer_results.step_results: if step.val_score: val_scores.append(step.val_score) - if step.test_score: - test_scores.append(step.test_score) result_from_step = 0 - if test_scores: - result_from_step = test_scores.index(max(test_scores)) - elif val_scores: + if val_scores: + printc(f"Val scores: {val_scores}") result_from_step = val_scores.index(max(val_scores)) prompts: List[PromptData] = trainer_results.step_results[ result_from_step @@ -483,16 +546,30 @@ def fit( self.adaltask._set_param_values(prompts) starting_step = len(trainer_results.steps) - 1 + self._add_history_text_optimizers(max(val_scores)) + + else: + trainer_results = ( + self._pre_fit(val_dataset, test_dataset) + if trainer_results is None + else 
trainer_results + ) if debug: print("Debugging mode") text_grad_debug_path, few_shot_demo_debug_path = None, None - if len(self.text_optimizers) > 0: + if ( + len(self.text_optimizers) > 0 + and len(self._get_trainable_text_params()) > 0 + ): text_grad_debug_path = self._fit_text_grads_one_step_for_debug( train_loader ) - if len(self.demo_optimizers) > 0: + if ( + len(self.demo_optimizers) > 0 + and len(self._get_trainable_demo_params()) > 0 + ): few_shot_demo_debug_path = self._fit_demos_one_step_for_debug( train_loader, train_dataset, val_dataset, test_dataset ) @@ -531,29 +608,29 @@ def fit( def run_text_optimizers(starting_step: int, trainer_results: TrainerResult): if len(self.text_optimizers) > 0: if self.strategy == "random": - trainer_results = self._fit_text_grad_random( + self._fit_text_grad_random( train_loader, val_dataset, test_dataset, trainer_results, starting_step=starting_step, ) - starting_step += self.max_steps elif self.strategy == "constrained": - trainer_results = self._fit_text_grad_constraint( + # self.adaltask.configure_teacher_generator() # use teacher as bootstrap intemediate results + self._fit_text_grad_constraint( train_loader, val_dataset, test_dataset, trainer_results=trainer_results, starting_step=starting_step, ) - starting_step += self.max_steps else: raise ValueError(f"Strategy {self.strategy} not supported") def run_demo_optimizers(starting_step: int, trainer_results: TrainerResult): if len(self.demo_optimizers) > 0: self.adaltask.configure_teacher_generator() + self.adaltask.disable_backward_engine() # disable it to avoid backward engine for gradients self._fit_demos_random( train_loader, train_dataset, @@ -565,27 +642,28 @@ def run_demo_optimizers(starting_step: int, trainer_results: TrainerResult): if self.sequential_order == ["text", "demo"]: run_text_optimizers(starting_step, trainer_results) + starting_step += self.max_steps + print(f"Starting step: {starting_step}") + print("steps", trainer_results.steps) 
run_demo_optimizers(starting_step, trainer_results) else: run_demo_optimizers(starting_step, trainer_results) + starting_step += self.max_steps run_text_optimizers(starting_step, trainer_results) - # if len(self.text_optimizers) > 0: - # run_text_optimizers(starting_step, trainer_results) - - # if len(self.demo_optimizers) > 0: - # run_demo_optimizers(starting_step, trainer_results) - # self.adaltask.configure_teacher_generator() # attemp to use the newest teacher as - # self._fit_demos_random( - # train_loader, - # train_dataset, - # val_dataset, - # test_dataset, - # trainer_results=trainer_results, - # starting_step=starting_step, - # ) end_time = time.time() print(f"Training time: {end_time - start_time}s") + trainer_results.total_time = end_time - start_time + # test at the end + if test_dataset: + test_output = self.adaltask.validation_step( + test_dataset, 0, self.num_workers + ) + test_score = test_output.avg_score + trainer_results.test_score = test_score + # write the results to the checkpoint file + save_json(trainer_results.to_dict(), self.ckpt_file) + print(f"ckpt_file: {self.ckpt_file}") return self.ckpt_file, trainer_results @@ -605,13 +683,21 @@ def initial_validation(self, val_dataset: Any, test_dataset: Any): ) test_score = test_output.avg_score trainer_results = TrainerResult( - steps=[], val_scores=[], test_scores=[], step_results=[], prompts=[] + steps=[], val_scores=[], test_scores=[], step_results=[] ) trainer_results.val_scores.append(val_score) trainer_results.test_scores.append(test_score) prompts = self.adaltask._get_param_values() - trainer_results.prompts.append(prompts) + # trainer_results.prompts.append(prompts) trainer_results.steps.append(0) + # add step result + step_result = TrainerStepResult( + step=0, + val_score=val_score, + test_score=test_score, + prompt=prompts, + ) + trainer_results.step_results.append(step_result) print(f"Initial validation score: {val_score}") print(f"Initial test score: {test_score}") return 
trainer_results @@ -646,6 +732,9 @@ def gather_trainer_states(self): hash_key = hash_text_sha1(serialize(trainer_state))[0:5] trainer_state["hash_key"] = hash_key trainer_state["task_state_dict"] = self.adaltask.to_dict() + # trainer_state["text_optimizers"] = [ + # opt.to_dict() for opt in self.text_optimizers + # ] # restore_state = AdalComponent.from_dict( # trainer_state["task_state_dict"] # ) # tODO: add a test for adalcomponent @@ -724,7 +813,7 @@ def _fit_demos_one_step_for_debug( self.prep_ckpt_file_path() debug_path = os.path.join(self.ckpt_path, "debug_demos") os.makedirs(debug_path, exist_ok=True) - print(f"save to {debug_path}") + print(f"_fit_demos_one_step_for_debug save to {debug_path}") self.adaltask.train() self.adaltask.trace() @@ -751,7 +840,7 @@ def _fit_demos_one_step_for_debug( # print(f"Teacher y_preds: {y_preds[0].to_dict()}") - y_preds_outputs = [p.full_response for p in y_preds] + y_preds_outputs = [p.data for p in y_preds] batch_eval: EvaluationResult = self.adaltask.evaluate_samples( batch, y_preds_outputs @@ -809,41 +898,11 @@ def _fit_demos_one_step_for_debug( self._demo_optimizers_add_scores( [sample.id for sample in batch], batch_per_item_scores, is_teacher=False ) - # for loss in losses_student: - # loss.backward() + # Check the eval result - y_preds_outputs = [p.full_response for p in y_preds_student] + y_preds_outputs = [p.data for p in y_preds_student] eval_result = self.adaltask.evaluate_samples(batch, y_preds_outputs) print(f"Eval result: {eval_result.avg_score}") - # eval_score_per_item = eval_result.per_item_scores - - # bootstrap a batch - # batch_for_teacher = [] - # losses_teacher = [] - - # for i, (sample, item_score) in enumerate(zip(batch, eval_score_per_item)): - - # # use teacher - # if sample.id in pred_teacher: - # continue - # # if item_score < 0.5: - # pred_teacher.add(sample.id) - # batch_for_teacher.append(sample) - # # run teacher, use teachers's output instead of the initial output (bootstrap) - # if 
len(batch_for_teacher) > 0: - # print(f"Using teacher for {len(batch_for_teacher)} samples") - # self.adaltask.use_teacher() - # y_preds_teacher = self.adaltask.train_step( - # batch_for_teacher, batch_idx, self.num_workers - # ) - # losses_teacher: List[Parameter] = self.adaltask.loss_step( # noqa F841 - # batch_for_teacher, y_preds_teacher, batch_idx, self.num_workers - # ) - # self._demo_optimizers_add_scores( - # [sample.id for sample in batch_for_teacher], - # eval_score_per_item, - # is_teacher=True, - # ) # loss_students backward for loss in losses_student: @@ -896,7 +955,6 @@ def _fit_text_grads_one_step_for_debug(self, train_loader: Any) -> Dict[str, str self.prep_ckpt_file_path() debug_path = os.path.join(self.ckpt_path, "debug_text_grads") os.makedirs(debug_path, exist_ok=True) - print(f"save to {debug_path}") train_loader.batch_size = 2 train_loader.shuffle = True self.adaltask.train() # this will turn everything to train mode @@ -915,14 +973,12 @@ def _fit_text_grads_one_step_for_debug(self, train_loader: Any) -> Dict[str, str else: failed_loss = loss if correct_loss is not None and failed_loss is not None: - print("Found correct and failed loss") + printc("Found correct and failed loss", "blue") break - + if not all_losses: + raise ValueError("No losses found in the dataset.") # Handle case where one or both losses are None if correct_loss is None or failed_loss is None: - if not all_losses: - raise ValueError("No losses found in the dataset.") - # Sort all_losses by their data values all_losses.sort(key=lambda x: x.data, reverse=True) # Highest to lowest @@ -931,12 +987,53 @@ def _fit_text_grads_one_step_for_debug(self, train_loader: Any) -> Dict[str, str failed_loss = all_losses[-1] print("Assigned correct_loss and failed_loss from sorted losses.") - total_loss = sum_ops([correct_loss, failed_loss]) + total_loss = sum_ops([copy(correct_loss), copy(failed_loss)]) + + t0 = time.time() + total_loss.backward() + t1 = time.time() + printc(f"finish loss 
backward in {t1-t0} seconds") # test optimizer self._propose_text_optimizers() + t2 = time.time() + printc(f"finish text optimizer step in {t2-t1} seconds") + + debug_files: Dict = total_loss.draw_graph(filepath=debug_path, full_trace=True) + t3 = time.time() + printc(f"finish draw_graph step in {t3-t2} seconds") + debug_output_file = total_loss.draw_output_subgraph(filepath=debug_path) + t4 = time.time() + printc(f"finish draw_output_subgraph step in {t4-t3} seconds") + debug_component_file = total_loss.draw_component_subgraph(filepath=debug_path) + debug_files.update(debug_output_file) + debug_files.update(debug_component_file) + + # zero grad + self._zero_grad_text_optimizers() + # revert + self._revert_text_optimizers() + + total_loss.reset_all_gradients() + + # draw graph on a single loss + total_loss = sum_ops([copy(failed_loss)]) + total_loss.backward() + self._propose_text_optimizers() + + failed_debug_files = total_loss.draw_graph( + filepath=debug_path, full_trace=False + ) + failed_output_file = total_loss.draw_output_subgraph(filepath=debug_path) + failed_component_file = total_loss.draw_component_subgraph(filepath=debug_path) + failed_debug_files.update(failed_output_file) + failed_debug_files.update(failed_component_file) + + for k, v in failed_debug_files.items(): + if k in debug_files: + k = f"failed_{k}" + debug_files[k] = v - debug_files = total_loss.draw_graph(filepath=debug_path, full_trace=True) return debug_files def _set_demo_optimizers_dataset(self, train_dataset: Any): @@ -978,13 +1075,17 @@ def _zero_grad_text_optimizers(self): for text_optimizer in self.text_optimizers: text_optimizer.zero_grad() + def _text_optimizers_set_target_param(self): + for text_optimizer in self.text_optimizers: + text_optimizer.set_target_param() + def _propose_text_optimizers(self): for text_optimizer in self.text_optimizers: text_optimizer.propose() - # def _add_failed_proposals_text_optimizers(self): - # for opt in self.text_optimizers: - # 
opt.add_failed_proposal() + def _add_failed_proposals_text_optimizers(self): + for opt in self.text_optimizers: + opt.add_failed_proposal() def _get_trainable_text_params(self): params = [] @@ -1008,6 +1109,14 @@ def _revert_text_optimizers(self): for text_optimizer in self.text_optimizers: text_optimizer.revert() + def _increment_step_from_last_improvement_text_optimizers(self): + for text_optimizer in self.text_optimizers: + text_optimizer.increment_steps_from_last_improvement() + + def _reset_steps_from_last_improvement_text_optimizers(self): + for text_optimizer in self.text_optimizers: + text_optimizer.reset_steps_from_last_improvement() + def _check_optimizer_proposal(self): r"""Return True if all optimizers have proposed a new prompt""" for text_optimizer in self.text_optimizers: @@ -1033,7 +1142,7 @@ def _fit_text_grad_demo_mix_constrained( if trainer_results is None else trainer_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_text_grad_demo_mix_constrained save to {self.ckpt_file}") if train_dataset is None: raise ValueError("train_dataset is required") @@ -1066,7 +1175,7 @@ def _fit_text_grad_demo_mix_constrained( all_losses.extend(losses) # student losses # extract the non-parameter y_preds all_y_preds.extend( - [y.full_response for y in y_preds if isinstance(y, Parameter)] + [y.data for y in y_preds if isinstance(y, Parameter)] ) # for loss in losses: @@ -1114,76 +1223,80 @@ def _fit_text_grad_demo_mix_constrained( all_losses=all_losses, all_y_preds=all_y_preds, include_demo_optimizers=True, + trainer_results=trainer_results, + val_dataset=val_dataset, + test_dataset=test_dataset, + total_steps=total_steps, ) ) - if not self._check_optimizer_proposal(): - print( - "No proposal can improve the subset and full set, go to next step" - ) - # self._add_failed_proposals_text_optimizers() - - self._add_one_step_in_trainer_results( - trainer_results, - trainer_results.val_scores[-1], - trainer_results.test_scores[-1], - 
trainer_results.prompts[-1], - total_steps, - ) - - continue - - # set the batch size to the size of the validation set - last_val_score = trainer_results.val_scores[-1] - val_output = self.adaltask.validation_step( - val_dataset, - total_steps, - self.num_workers, - minimum_score=last_val_score, - ) - val_score = val_output.avg_score - self._add_history_text_optimizers(val_score) - - if val_score > last_val_score: - print(f"Optimizer step: {val_score} > {last_val_score}") - # self.optimizer.step() - self._step_text_optimizers() - self._demo_optimizers_step() - - # test the model - test_score = None - if test_dataset is not None: - test_output = self.adaltask.validation_step( - test_dataset, total_steps, self.num_workers - ) - test_score = test_output.avg_score + # if not self._check_optimizer_proposal(): + # print( + # "No proposal can improve the subset and full set, go to next step" + # ) + # # self._add_failed_proposals_text_optimizers() + + # self._add_one_step_in_trainer_results( + # trainer_results, + # trainer_results.val_scores[-1], + # trainer_results.test_scores[-1], + # trainer_results.prompts[-1], + # total_steps, + # ) - new_prompts = self.adaltask._get_param_values() - self._add_one_step_in_trainer_results( - trainer_results, - val_score, - test_score, - new_prompts, - total_steps, - ) - all_samples, all_losses, all_y_preds = [], [], [] - else: - print(f"Optimizer revert: {val_score} <= {last_val_score}") - # self.optimizer.revert() - self._revert_text_optimizers() - self._demo_optimizers_revert() - # save the score, no change - self._add_one_step_in_trainer_results( - trainer_results, - last_val_score, - trainer_results.test_scores[-1], - trainer_results.prompts[-1], - total_steps, - attempted_val_score=val_score, - ) + # continue + + # # set the batch size to the size of the validation set + # last_val_score = trainer_results.val_scores[-1] + # val_output = self.adaltask.validation_step( + # val_dataset, + # total_steps, + # self.num_workers, + # 
minimum_score=last_val_score, + # ) + # val_score = val_output.avg_score + # self._add_history_text_optimizers(val_score) + + # if val_score > last_val_score: + # print(f"Optimizer step: {val_score} > {last_val_score}") + # # self.optimizer.step() + # self._step_text_optimizers() + # self._demo_optimizers_step() + + # # test the model + # test_score = None + # if test_dataset is not None: + # test_output = self.adaltask.validation_step( + # test_dataset, total_steps, self.num_workers + # ) + # test_score = test_output.avg_score + + # new_prompts = self.adaltask._get_param_values() + # self._add_one_step_in_trainer_results( + # trainer_results, + # val_score, + # test_score, + # new_prompts, + # total_steps, + # ) + # all_samples, all_losses, all_y_preds = [], [], [] + # else: + # print(f"Optimizer revert: {val_score} <= {last_val_score}") + # # self.optimizer.revert() + # self._revert_text_optimizers() + # self._demo_optimizers_revert() + # # save the score, no change + # self._add_one_step_in_trainer_results( + # trainer_results, + # last_val_score, + # trainer_results.test_scores[-1], + # trainer_results.prompts[-1], + # total_steps, + # attempted_val_score=val_score, + # ) - print(f"Saving checkpoint to {self.ckpt_file}") - save_json(trainer_results.to_dict(), self.ckpt_file) + # print(f"Saving checkpoint to {self.ckpt_file}") + # save_json(trainer_results.to_dict(), self.ckpt_file) save_json(trainer_results.to_dict(), self.ckpt_file) # checkpoint def _fit_text_grad_demo_mix_random( @@ -1202,7 +1315,7 @@ def _fit_text_grad_demo_mix_random( if train_results is None else train_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_text_grad_demo_mix_random save to {self.ckpt_file}") if train_dataset is None: raise ValueError("train_dataset is required") @@ -1298,10 +1411,12 @@ def _fit_text_grad_demo_mix_random( self._demo_optimizers_step() # test the model - test_output = self.adaltask.validation_step( - test_dataset, total_steps, self.num_workers - ) - 
test_score = test_output.avg_score + test_score = None + # if test_dataset is not None: + # test_output = self.adaltask.validation_step( + # test_dataset, total_steps, self.num_workers + # ) + # test_score = test_output.avg_score self._add_one_step_in_trainer_results( trainer_results, val_score, @@ -1319,7 +1434,7 @@ def _fit_text_grad_demo_mix_random( trainer_results, last_val_score, trainer_results.test_scores[-1], - trainer_results.prompts[-1], + trainer_results.step_results[-1].prompt, total_steps, attempted_val_score=val_score, ) @@ -1344,7 +1459,7 @@ def _fit_demos_random( if trainer_results is None else trainer_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_demos_random save to {self.ckpt_file}") print(f"Starting step: {starting_step}") self.adaltask.train() @@ -1428,10 +1543,13 @@ def _fit_demos_random( minimum_score=last_val_score, ) val_score = val_output.avg_score + if val_score > last_val_score: print( f"Pass validation: {val_score} > {trainer_results.val_scores[-1]}" ) + self._track_effectiveness("demo_valset", True) + self._demo_optimizers_step() for opt in self.demo_optimizers: if opt.proposing: @@ -1439,11 +1557,11 @@ def _fit_demos_random( # test the new prompts test_score = None - if test_dataset is not None: - test_output = self.adaltask.validation_step( - test_dataset, step, self.num_workers - ) - test_score = test_output.avg_score + # if test_dataset is not None: + # test_output = self.adaltask.validation_step( + # test_dataset, step, self.num_workers + # ) + # test_score = test_output.avg_score self._add_one_step_in_trainer_results( trainer_results, val_score, @@ -1453,6 +1571,7 @@ def _fit_demos_random( attempted_val_score=val_score, ) else: + self._track_effectiveness("demo_valset", False) print(f"Fail validation: {val_score} <= {last_val_score}, revert") self._demo_optimizers_revert() # ensure all demo optimizer are not proposing @@ -1519,6 +1638,127 @@ def _compute_validate_stats(trainer_results: TrainerResult): 
std_of_score=std, ) + def _random_propose_step( + self, + current_step: int, + all_samples, + all_losses: List["Parameter"], + all_y_preds, + trainer_results: TrainerResult = None, + val_dataset: Any = None, + ): + """Handles a single training step in random batch""" + + tdqm_loader = tqdm(range(self.max_proposals_per_step), desc="Proposing") + + use_eval_loss_fn = False + if self.adaltask.loss_eval_fn is not None: + use_eval_loss_fn = True + + batch_score_list = self.adaltask.evaluate_samples( + samples=all_samples, y_preds=all_y_preds, use_loss_eval_fn=use_eval_loss_fn + ) + # scores that we will compare with + batch_score = batch_score_list.avg_score + last_val_score = trainer_results.val_scores[-1] + val_score_increased = False + val_score = None + + for i in tdqm_loader: + print(f"Proposal: {i+1}") + self._propose_text_optimizers() + new_prompts = self.adaltask._get_param_values() + print("New prompts: ", new_prompts) + + # validate on the batch + batch_val_score_list = self.adaltask.validation_step( + all_samples, + current_step, + use_loss_eval_fn=use_eval_loss_fn, + ) + batch_val_score = batch_val_score_list.avg_score + + if ( + batch_val_score == batch_score + and batch_score >= self.batch_val_score_threshold + ) or batch_val_score > batch_score: # allow perfect subset to pass + + printc( + f"Pass subset check:{use_eval_loss_fn}, {batch_val_score} > {batch_score}" + ) + self._track_effectiveness("subset", True) + + else: + printc( + f"Fail subset check, try next proposal: {use_eval_loss_fn}, {batch_val_score} <= {batch_score}" + ) + self._add_failed_proposals_text_optimizers() + self._track_effectiveness("subset", False) + self._revert_text_optimizers() + continue + + # validate on the whole validation set + # set the batch size to the size of the validation set + val_output = self.adaltask.validation_step( + val_dataset, + current_step, + self.num_workers, + minimum_score=last_val_score, + ) + val_score = val_output.avg_score + + if val_score > 
last_val_score: + + print(f"Optimizer step: {val_score} > {last_val_score}") + # track the effectiveness + self._track_effectiveness("valset", True) + self._step_text_optimizers() + self._add_history_text_optimizers(val_score) # track top performor + # test the model + # test_output = self.adaltask.validation_step( + # test_dataset, total_steps, self.num_workers + # ) + # test_score = test_output.avg_score + test_score = None + self._add_one_step_in_trainer_results( + trainer_results, + val_score, + test_score, + new_prompts, + current_step, + ) + val_score_increased = True + self._reset_steps_from_last_improvement_text_optimizers() + break + + else: + # if val_score < last_val_score: + self._add_failed_proposals_text_optimizers() # track failed proposals + + print(f"Optimizer revert: {val_score} <= {last_val_score}") + self._revert_text_optimizers() + self._track_effectiveness("valset", False) + self._add_failed_proposals_text_optimizers() + + continue + + if not val_score_increased: + print("No proposal can improve the subset and full set, and val set") + self._zero_grad_text_optimizers() + # save the score, no change + self._add_one_step_in_trainer_results( + trainer_results, + last_val_score, + trainer_results.test_scores[-1], + trainer_results.step_results[-1].prompt, + current_step, + attempted_val_score=val_score, + ) + self._increment_step_from_last_improvement_text_optimizers() + print(f" {current_step}, Saving checkpoint to {self.ckpt_file}") + trainer_results.effective_measure = self._effective_measure + save_json(trainer_results.to_dict(), self.ckpt_file) + def _fit_text_grad_random( self, train_loader: Any, @@ -1533,84 +1773,112 @@ def _fit_text_grad_random( if trainer_results is None else trainer_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_text_grad_random save to {self.ckpt_file}") self.adaltask.train() # self.optimizer.zero_grad() self._zero_grad_text_optimizers() num_epochs = self._estimate_num_epochs(train_loader, self.max_steps) 
- total_steps = starting_step + print(f"num_epochs: {num_epochs}, max_steps: {self.max_steps}") + current_step = starting_step for epoch in tqdm(range(num_epochs), desc="Epoch"): + print(f"Epoch: {epoch}") for steps, batch in enumerate((pbar := tqdm(train_loader, position=0))): - total_steps += 1 - if total_steps > self.max_steps + starting_step: + current_step += 1 + if current_step > self.max_steps + starting_step: print("Reached max steps") break self._zero_grad_text_optimizers() - pbar.set_description(f"Training Step: {total_steps}") + pbar.set_description(f"Training Step: {current_step}") self.adaltask.train() # this will turn everything to train mode - self.train() - y_preds = self.adaltask.train_step(batch, steps, self.num_workers) - losses = self.adaltask.loss_step( - batch, y_preds, steps, self.num_workers - ) + try: + + y_preds = self.adaltask.train_step(batch, steps, self.num_workers) + except Exception as e: + print(f"Error in train step: {e}") + raise e + try: + losses = self.adaltask.loss_step( + batch, y_preds, steps, self.num_workers + ) + except Exception as e: + print(f"Error in loss step: {e}") + raise e total_loss = sum_ops(losses) - print("Loss backward...") - total_loss.backward() + try: + total_loss.backward() + except Exception as e: + print(f"Error in backward: {e}") + raise e print("Optimizer propose...") - self._propose_text_optimizers() - new_prompts = self.adaltask._get_param_values() - print("New prompts: ", new_prompts) - # set the batch size to the size of the validation set - last_val_score = trainer_results.val_scores[-1] - val_output = self.adaltask.validation_step( - val_dataset, - total_steps, - self.num_workers, - minimum_score=last_val_score, - ) - val_score = val_output.avg_score - - if val_score > last_val_score: - - print(f"Optimizer step: {val_score} > {last_val_score}") - # self.optimizer.step() - self._step_text_optimizers() - self._add_history_text_optimizers(val_score) # track top performor - # test the model - 
test_output = self.adaltask.validation_step( - test_dataset, total_steps, self.num_workers - ) - test_score = test_output.avg_score - self._add_one_step_in_trainer_results( - trainer_results, - val_score, - test_score, - new_prompts, - total_steps, - ) - else: - # if val_score < last_val_score: - # self._add_failed_proposals_text_optimizers() # track failed proposals - print(f"Optimizer revert: {val_score} <= {last_val_score}") - # self.optimizer.revert() - self._revert_text_optimizers() - # save the score, no change - self._add_one_step_in_trainer_results( - trainer_results, - last_val_score, - trainer_results.test_scores[-1], - trainer_results.prompts[-1], - total_steps, - attempted_val_score=val_score, - ) + all_y_preds = [ + y.data for y in y_preds if isinstance(y, OutputParameter) + ] + self._random_propose_step( + current_step=current_step, + all_samples=batch, + all_losses=losses, + all_y_preds=all_y_preds, + trainer_results=trainer_results, + val_dataset=val_dataset, + ) + # self._propose_text_optimizers() + # new_prompts = self.adaltask._get_param_values() + # print("New prompts: ", new_prompts) + # # set the batch size to the size of the validation set + # last_val_score = trainer_results.val_scores[-1] + # val_output = self.adaltask.validation_step( + # val_dataset, + # current_step, + # self.num_workers, + # minimum_score=last_val_score, + # ) + # val_score = val_output.avg_score + + # if val_score > last_val_score: + + # print(f"Optimizer step: {val_score} > {last_val_score}") + # # track the effectiveness + # self._track_effectiveness("valset", True) + # # self.optimizer.step() + # self._step_text_optimizers() + # self._add_history_text_optimizers(val_score) # track top performor + # # test the model + # # test_output = self.adaltask.validation_step( + # # test_dataset, total_steps, self.num_workers + # # ) + # # test_score = test_output.avg_score + # test_score = None + # self._add_one_step_in_trainer_results( + # trainer_results, + # val_score, + # 
test_score, + # new_prompts, + # current_step, + # ) + # else: + # # if val_score < last_val_score: + # self._add_failed_proposals_text_optimizers() # track failed proposals + + # print(f"Optimizer revert: {val_score} <= {last_val_score}") + # self._revert_text_optimizers() + # self._track_effectiveness("valset", False) + # # save the score, no change + # self._add_one_step_in_trainer_results( + # trainer_results, + # last_val_score, + # trainer_results.test_scores[-1], + # trainer_results.prompts[-1], + # current_step, + # attempted_val_score=val_score, + # ) - print(f"Saving checkpoint to {self.ckpt_file}") - save_json(trainer_results.to_dict(), self.ckpt_file) + # print(f" {current_step}, Saving checkpoint to {self.ckpt_file}") + # save_json(trainer_results.to_dict(), self.ckpt_file) save_json(trainer_results.to_dict(), self.ckpt_file) # checkpoint - return trainer_results + return trainer_results @staticmethod def _add_one_step_in_trainer_results( @@ -1633,9 +1901,45 @@ def _add_one_step_in_trainer_results( trainer_results.val_scores.append(val_score) trainer_results.test_scores.append(test_score) - trainer_results.prompts.append(prompts) trainer_results.steps.append(step) + # def _downsample_move_batch( + # self, all_samples, all_losses: List["Parameter"], all_y_preds, acc_score_list + # ): + # """Downsample the moving batch to a more balanced error and correct samples""" + + # from adalflow.optim.parameter import Parameter + + # if not all([score >= 0 and score <= 1 for score in acc_score_list]): + # raise ValueError( + # "acc_score_list should only contain values between 0 and 1" + # ) + + # for loss in all_losses: + # if not isinstance(loss, Parameter): + # raise ValueError("Loss should be a Parameter object") + # max_moving_batch_size = 20 + + # correct_indices = [i for i, score in enumerate(acc_score_list) if score > 0.5] + # error_indices = [i for i, score in enumerate(acc_score_list) if score <= 0.5] + + # if ( + # len(error_indices) + 
len(correct_indices) + # <= max_moving_batch_size + # # and len(correct_indices) <= max_moving_batch_size + # ): + # return all_samples, all_losses, all_y_preds, acc_score_list + + # # downsample from all samples + # new_sample_indices = random.sample( + # range(len(all_samples)), min(max_moving_batch_size, len(all_samples)) + # ) + # all_samples = [all_samples[i] for i in new_sample_indices] + # all_losses = [all_losses[i] for i in new_sample_indices] + # all_y_preds = [all_y_preds[i] for i in new_sample_indices] + # acc_score_list = [acc_score_list[i] for i in new_sample_indices] + # return all_samples, all_losses, all_y_preds, acc_score_list + def _downsample_move_batch( self, all_samples, all_losses: List["Parameter"], all_y_preds, acc_score_list ): @@ -1651,7 +1955,9 @@ def _downsample_move_batch( for loss in all_losses: if not isinstance(loss, Parameter): raise ValueError("Loss should be a Parameter object") + max_moving_batch_size = 20 + min_error_samples = 4 correct_indices = [i for i, score in enumerate(acc_score_list) if score > 0.5] error_indices = [i for i, score in enumerate(acc_score_list) if score <= 0.5] @@ -1663,14 +1969,46 @@ def _downsample_move_batch( ): return all_samples, all_losses, all_y_preds, acc_score_list - # downsample from all samples - new_sample_indices = random.sample( - range(len(all_samples)), min(max_moving_batch_size, len(all_samples)) - ) - all_samples = [all_samples[i] for i in new_sample_indices] - all_losses = [all_losses[i] for i in new_sample_indices] - all_y_preds = [all_y_preds[i] for i in new_sample_indices] - acc_score_list = [acc_score_list[i] for i in new_sample_indices] + # Adjust downsampling logic + if len(error_indices) < min_error_samples: + remaining_capacity = max_moving_batch_size - len(error_indices) + correct_indices = random.sample(correct_indices, max(0, remaining_capacity)) + else: + # Set aside minimum error samples + retained_error_indices = error_indices[:min_error_samples] + remaining_error_indices = 
error_indices[min_error_samples:] + + # Combine remaining error and correct indices for unified sampling + combined_indices = remaining_error_indices + correct_indices + sampled_combined_indices = random.sample( + combined_indices, max(0, max_moving_batch_size - min_error_samples) + ) + + error_indices = retained_error_indices + correct_indices = [ + i for i in sampled_combined_indices if i in correct_indices + ] + remaining_error_indices = [ + i for i in sampled_combined_indices if i in remaining_error_indices + ] + error_indices += remaining_error_indices + + error_samples = [all_samples[i] for i in error_indices] + error_losses = [all_losses[i] for i in error_indices] + error_y_preds = [all_y_preds[i] for i in error_indices] + error_scores = [acc_score_list[i] for i in error_indices] + + correct_samples = [all_samples[i] for i in correct_indices] + correct_losses = [all_losses[i] for i in correct_indices] + correct_y_preds = [all_y_preds[i] for i in correct_indices] + correct_scores = [acc_score_list[i] for i in correct_indices] + + # Combine error and downsampled correct samples + all_samples = error_samples + correct_samples + all_losses = error_losses + correct_losses + all_y_preds = error_y_preds + correct_y_preds + acc_score_list = error_scores + correct_scores + return all_samples, all_losses, all_y_preds, acc_score_list def _moving_batch_sample( @@ -1680,21 +2018,29 @@ def _moving_batch_sample( # ensure only 0 and 1 in the acc_score_list import numpy as np - if not all([score in [0, 1] for score in acc_score_list]): + if not all(0 <= score <= 1 for score in acc_score_list): raise ValueError("acc_score_list should only contain 0 and 1") - correct_indices = [i for i, score in enumerate(acc_score_list) if score == 1] - error_indices = [i for i, score in enumerate(acc_score_list) if score == 0] + correct_indices = [ + i + for i, score in enumerate(acc_score_list) + if score > self.correct_val_score_threshold + ] + error_indices = [ + i + for i, score in 
enumerate(acc_score_list) + if score <= self.correct_val_score_threshold + ] print(f"Moving batch correct size: {len(correct_indices)}") print(f"Moving batch error size: {len(error_indices)}") - if len(error_indices) == 0: - raise ValueError("No error samples found") + # if len(error_indices) == 0: + # raise ValueError("No error samples found") sampled_error_indices = random.sample( error_indices, min(self.max_error_samples, len(error_indices)) ) num_errors = len(sampled_error_indices) # max allowed correct samples min(0.8 * num_errors, len(correct_indices), self.max_correct_samples) - max_num_correct_samples = int(2 * num_errors) + max_num_correct_samples = int(2 * max(1, num_errors)) sampled_correct_indices = random.sample( correct_indices, min( @@ -1713,7 +2059,7 @@ def _moving_batch_sample( return subset_score, subset def _track_effectiveness( - self, stage: Literal["subset", "fullset", "valset"], pass_: bool + self, stage: Literal["subset", "fullset", "valset", "demo_valset"], pass_: bool ): if stage == "subset": if pass_: @@ -1730,14 +2076,24 @@ def _track_effectiveness( self._valset_effect_count["pass"] += 1 else: self._valset_effect_count["fail"] += 1 + elif stage == "demo_valset": + if pass_: + self._demo_valset_effect_count["pass"] += 1 + else: + self._demo_valset_effect_count["fail"] += 1 + else: + raise NotImplementedError(f"Stage {stage} not implemented") def _text_grad_constraint_propose_step( self, - steps: int, + current_step: int, all_samples, all_losses: List["Parameter"], all_y_preds, include_demo_optimizers: bool = False, + trainer_results: TrainerResult = None, + val_dataset: Any = None, + test_dataset: Any = None, ): """Handles both the mixed training and the separate training. 
When include_demo_optimizers is True, the demo optimizers are included in the training @@ -1749,16 +2105,33 @@ def _text_grad_constraint_propose_step( if not isinstance(loss, Parameter): raise ValueError("Loss should be a Parameter object") self.adaltask.eval() - move_batch_eval = self.adaltask.evaluate_samples(all_samples, all_y_preds) + use_eval_loss_fn = False + if self.adaltask.loss_eval_fn is not None: + use_eval_loss_fn = True + move_batch_eval = self.adaltask.evaluate_samples( + all_samples, all_y_preds, use_loss_eval_fn=use_eval_loss_fn + ) + print(f"Moving batch eval: {move_batch_eval}") move_batch_score = move_batch_eval.avg_score move_batch_acc_score_list = move_batch_eval.per_item_scores - if move_batch_score >= self.batch_val_score_threshold: - print(f"Skipping batch {steps} as acc: {move_batch_score}") - - # reset the moving batch - all_samples, all_losses, all_y_preds = [], [], [] - return all_samples, all_losses, all_y_preds + last_val_score = trainer_results.val_scores[-1] + val_score_increased = False + + # if move_batch_score >= self.batch_val_score_threshold: + # print(f"Skipping batch {steps} as acc: {move_batch_score}") + + # # reset the moving batch + # all_samples, all_losses, all_y_preds = [], [], [] + # # track the result + # self._add_one_step_in_trainer_results( + # trainer_results, + # last_val_score, + # trainer_results.test_scores[-1], + # trainer_results.prompts[-1], + # total_steps, + # ) + # return all_samples, all_losses, all_y_preds # downsample the moving batch all_samples, all_losses, all_y_preds, move_batch_acc_score_list = ( self._downsample_move_batch( @@ -1767,13 +2140,15 @@ def _text_grad_constraint_propose_step( ) move_batch_score = np.mean(np.array(move_batch_acc_score_list)) - print(f"Moving batch acc: {move_batch_score}") + printc(f"Moving batch acc: {move_batch_score}") # create a subset with a more balanced error and correct samples subset_score, subset_indices = self._moving_batch_sample( move_batch_acc_score_list ) 
- print(f"Subset batch acc: {subset_score}") + printc(f"Subset batch acc: {subset_score},{subset_score}") + + self.adaltask.train() # compute the subset loss subset_losses = [all_losses[i] for i in subset_indices] @@ -1786,11 +2161,11 @@ def _text_grad_constraint_propose_step( print("Optimizer propose...") # mark the subset loss to be backpropagated - # TODO: make this a step tdqm_loader = tqdm(range(self.max_proposals_per_step), desc="Proposing") + for i in tdqm_loader: - # print(f"Proposing step: {i}") + print(f"Proposal: {i+1}") # self.optimizer.propose() self._propose_text_optimizers() # new prompts if include_demo_optimizers: @@ -1799,45 +2174,122 @@ def _text_grad_constraint_propose_step( print("New prompts: ", new_prompts) # valide the subset subset_samples = [all_samples[i] for i in subset_indices] - # validate the subset val_output = self.adaltask.validation_step( - subset_samples, steps, self.num_workers + subset_samples, + current_step, + self.num_workers, + use_loss_eval_fn=use_eval_loss_fn, ) - # check subset validation score + # check subset validation score and compare with subset score val_score = val_output.avg_score - if val_score > subset_score: - print(f"Pass subset check: {val_score} > {subset_score}") + if ( + val_score == subset_score + and subset_score >= self.batch_val_score_threshold + ) or val_score > subset_score: # allow perfect subset to pass + + printc( + f"Pass subset check:{use_eval_loss_fn}, {val_score} > {subset_score}" + ) self._track_effectiveness("subset", True) else: - print( - f"Fail subset check, try next proposal: {val_score} <= {subset_score}" + printc( + f"Fail subset check, try next proposal: {use_eval_loss_fn}, {val_score} <= {subset_score}" ) - # self._add_failed_proposals_text_optimizers() + self._add_failed_proposals_text_optimizers() self._track_effectiveness("subset", False) self._revert_text_optimizers() if include_demo_optimizers: self._demo_optimizers_revert() continue # validate the full set - move_batch_result 
= self.adaltask.validation_step( - all_samples, steps, self.num_workers + # move_batch_result = self.adaltask.validation_step( + # all_samples, steps, self.num_workers, use_loss_eval_fn=use_eval_loss_fn + # ) + # new_move_batch_score = move_batch_result.avg_score + # if new_move_batch_score >= move_batch_score: + # printc(f"Pass full check: {new_move_batch_score} >= {move_batch_score}") + # self._track_effectiveness("fullset", True) + # # break + # else: + # printc( + # f"Fail full check, try next proposal: {new_move_batch_score} < {move_batch_score}" + # ) + # self._track_effectiveness("fullset", False) + # self._add_failed_proposals_text_optimizers() + # self._revert_text_optimizers() + # if include_demo_optimizers: + # self._demo_optimizers_revert() + # continue + + # check on the validation set + # set the batch size to the size of the validation set + val_output = self.adaltask.validation_step( + val_dataset, + current_step, + self.num_workers, + minimum_score=last_val_score, ) - new_move_batch_score = move_batch_result.avg_score - if new_move_batch_score >= move_batch_score: - print(f"Pass full check: {new_move_batch_score} >= {move_batch_score}") - self._track_effectiveness("fullset", True) + val_score = val_output.avg_score + + if val_score > last_val_score: + print(f"Optimizer step: {val_score} > {last_val_score}") + # self.optimizer.step() + self._track_effectiveness("valset", True) + self._step_text_optimizers() + self._add_history_text_optimizers(val_score) + + if include_demo_optimizers: + + self._demo_optimizers_step() + + # test the model + test_score = None + # if test_dataset is not None: + # test_output = self.adaltask.validation_step( + # test_dataset, total_steps, self.num_workers + # ) + # test_score = test_output.avg_score + + self._add_one_step_in_trainer_results( + trainer_results, + val_score, + test_score, + new_prompts, + current_step, + ) + all_samples, all_losses, all_y_preds = [], [], [] + val_score_increased = True + 
self._reset_steps_from_last_improvement_text_optimizers() break else: - print( - f"Fail full check, try next proposal: {new_move_batch_score} < {move_batch_score}" - ) - self._track_effectiveness("fullset", False) - # self._add_failed_proposals_text_optimizers() + print(f"Optimizer revert: {val_score} <= {last_val_score}") + self._track_effectiveness("valset", False) + self._add_failed_proposals_text_optimizers() + # self.optimizer.revert() self._revert_text_optimizers() if include_demo_optimizers: self._demo_optimizers_revert() + continue + if not val_score_increased: + print("No proposal can improve the subset and full set, and val set") + self._zero_grad_text_optimizers() + subset_loss.reset_all_gradients() + # save the score, no change + self._add_one_step_in_trainer_results( + trainer_results, + last_val_score, + trainer_results.test_scores[-1], + trainer_results.step_results[-1].prompt, + current_step, + attempted_val_score=val_score, + ) + self._increment_step_from_last_improvement_text_optimizers() + + print(f"Saving checkpoint to {self.ckpt_file}") + trainer_results.effective_measure = self._effective_measure + save_json(trainer_results.to_dict(), self.ckpt_file) print("Done with proposals") self.adaltask.train() @@ -1877,130 +2329,63 @@ def _fit_text_grad_constraint( trainer_results: TrainerResult = None, starting_step: int = 0, ) -> TrainerResult: - from adalflow.optim.parameter import Parameter + """ + Starting_step != 0 when it is resume_from_ckpt + """ logger.info("Fitting using Textual Gradient Descent with constraints") + printc("Fitting using Textual Gradient Descent with constraints") trainer_results = ( self._pre_fit(val_dataset, test_dataset) if trainer_results is None else trainer_results ) - print(f"save to {self.ckpt_file}") + print(f"_fit_text_grad_constraint save to {self.ckpt_file}") self.adaltask.train() self._zero_grad_text_optimizers() num_epochs = self._estimate_num_epochs(train_loader, self.max_steps) - total_steps = starting_step - 
all_samples, all_losses, all_y_preds = [], [], [] + current_step = starting_step + all_samples, all_losses = [], [] + all_y_preds: List[OutputParameter] = [] for epoch in tqdm(range(num_epochs), desc="Epoch"): - for steps, batch in enumerate((pbar := tqdm(train_loader, position=0))): - total_steps += 1 - if total_steps > self.max_steps + starting_step: + print(f"Epoch: {epoch}") + for _, batch in enumerate((pbar := tqdm(train_loader, position=0))): + current_step += 1 + if current_step > self.max_steps + starting_step: print("Reached max steps") break self._zero_grad_text_optimizers() - pbar.set_description(f"Training Step: {total_steps}") + self._text_optimizers_set_target_param() + pbar.set_description(f"Training Step: {current_step}") self.adaltask.train() # this will turn everything to train mode - y_preds = self.adaltask.train_step(batch, steps, self.num_workers) + y_preds = self.adaltask.train_step( + batch, current_step, self.num_workers + ) losses = self.adaltask.loss_step( - batch, y_preds, steps, self.num_workers + batch, y_preds, current_step, self.num_workers ) # moving batch all_samples.extend(batch) all_losses.extend(losses) all_y_preds.extend( - [y.full_response for y in y_preds if isinstance(y, Parameter)] + [y.data for y in y_preds if isinstance(y, OutputParameter)] ) all_samples, all_losses, all_y_preds = ( self._text_grad_constraint_propose_step( - steps=steps, + current_step=current_step, all_samples=all_samples, all_losses=all_losses, all_y_preds=all_y_preds, + trainer_results=trainer_results, + val_dataset=val_dataset, + test_dataset=test_dataset, ) ) - # check optimizer stages to see if the proposal was accepted so far - if not self._check_optimizer_proposal(): - print( - "No proposal can improve the subset and full set, go to next step" - ) - - self._add_one_step_in_trainer_results( - trainer_results, - trainer_results.val_scores[-1], - trainer_results.test_scores[-1], - trainer_results.prompts[-1], - total_steps, - ) - continue - - # prune 
the correct sample size if its too big, same with error samples - # run the tests as any other optimizer - if self.adaltask.validate_condition(steps, total_steps): - # set the batch size to the size of the validation set - last_val_score = trainer_results.val_scores[-1] - val_output = self.adaltask.validation_step( - val_dataset, - total_steps, - self.num_workers, - minimum_score=last_val_score, - ) - val_score = val_output.avg_score - - if val_score > last_val_score: - print(f"Optimizer step: {val_score} > {last_val_score}") - # self.optimizer.step() - self._add_history_text_optimizers( - val_score - ) # track top performor - self._step_text_optimizers() - - # save the score - step_result = { - "val_score": val_score, - } - - self._track_effectiveness("valset", True) - - # test the model - if test_dataset is not None: - test_output = self.adaltask.validation_step( - test_dataset, - steps, - self.num_workers, - ) - step_result["test_score"] = test_output.avg_score - else: - step_result["test_score"] = None - step_result["prompts"] = self.adaltask._get_param_values() - step_result["step"] = total_steps - self._add_one_step_in_trainer_results( - trainer_results, - **step_result, - ) - - all_samples, all_losses, all_y_preds = [], [], [] - - else: - print(f"Optimizer revert: {val_score} <= {last_val_score}") - self._revert_text_optimizers() - # self._add_failed_proposals_text_optimizers() # track failed proposals - self._track_effectiveness("valset", False) - self._add_one_step_in_trainer_results( - trainer_results, - trainer_results.val_scores[-1], - trainer_results.test_scores[-1], - trainer_results.prompts[-1], - total_steps, - attempted_val_score=val_score, - ) - - trainer_results.effective_measure = self._effective_measure - save_json(trainer_results.to_dict(), self.ckpt_file) save_json(trainer_results.to_dict(), self.ckpt_file) return trainer_results diff --git a/adalflow/adalflow/optim/types.py b/adalflow/adalflow/optim/types.py index 22b0ab14..f8f62c80 100644 
--- a/adalflow/adalflow/optim/types.py +++ b/adalflow/adalflow/optim/types.py @@ -9,41 +9,54 @@ from adalflow.core import DataClass +# TODO: set default optimization class ParameterType(Enum): - __doc__ = """Enum for the type of parameter to compute the loss with, and to inform the optimizer.""" + __doc__ = """Enum for the type of parameter to compute the loss with, and to inform the optimizer. + + The meaning of reach tuple is: + 1. First element: the name of the parameter. + 2. Second element: the description of the parameter. + 3. Third element: whether the parameter is trainable. + + To access each element, use the following: + 1. name: `ParameterType.PROMPT.value` + 2. description: `ParameterType.PROMPT.description` + 3. trainable: `ParameterType.PROMPT.default_trainable` + """ # trainable parameters with optimizers PROMPT = ( "prompt", "Instruction to the language model on task, data, and format.", + True, ) # optimized by tgd_optimizer DEMOS = ( "demos", "A few examples to guide the language model.", + True, ) # optimized by demo_optimizer # input and output parameters (similar to tensor, can have grad_opt true, but not trainable) - INPUT = ("input", "The input to the component.") - OUTPUT = ("output", "The output of the component.") - HYPERPARAM = ("hyperparam", "Hyperparameters/args for the component.") - - # gradient paramters for each predecessor of dag. 
- GRADIENT = ("gradient", "A gradient parameter.") + INPUT = ("input", "The input to the component.", False) + OUTPUT = ("output", "The output of the component.", True) + HYPERPARAM = ("hyperparam", "Hyperparameters/args for the component.", False) # the following is a subtype of the output type # INSTANCE = ("instance", "Focus on fixing issues of this specific example.") GENERATOR_OUTPUT = ( "generator_output", "The output of the generator.", + True, ) # use raw response or error message as data, full response in full_response - RETRIEVER_OUTPUT = ("retriever_output", "The output of the retriever.") - LOSS_OUTPUT = ("loss", "The loss value.") - SUM_OUTPUT = ("sum", "The sum of the losses.") - NONE = ("none", "") + RETRIEVER_OUTPUT = ("retriever_output", "The output of the retriever.", True) + LOSS_OUTPUT = ("loss", "The loss value.", True) + SUM_OUTPUT = ("sum", "The sum of the losses.", True) + NONE = ("none", "", False) - def __init__(self, value, description): + def __init__(self, value: str, description: str, default_trainable: bool): self._value_ = value self.description = description + self.default_trainable = default_trainable def __str__(self): """Return a string representation that includes the enum's value and description.""" @@ -124,9 +137,7 @@ class TrainerResult(DataClass): test_scores: List[float] = field( default_factory=list, metadata={"desc": "List of test scores"} ) - prompts: List[List[PromptData]] = field( - default_factory=list, metadata={"desc": "List of optimized prompts"} - ) + step_results: List[TrainerStepResult] = field( default_factory=list, metadata={"desc": "List of step results, in an aggregated form"}, @@ -145,3 +156,7 @@ class TrainerResult(DataClass): trainer_state: Dict[str, Any] = field( default=None, metadata={"desc": "Save the most detailed state of the trainer"} ) + total_time: float = field( + default=0.0, metadata={"desc": "Total time taken for training"} + ) + test_score: float = field(default=None, metadata={"desc": "Test 
score"}) diff --git a/adalflow/adalflow/utils/data.py b/adalflow/adalflow/utils/data.py index 682453b1..374c47b4 100644 --- a/adalflow/adalflow/utils/data.py +++ b/adalflow/adalflow/utils/data.py @@ -74,10 +74,13 @@ class DataLoader: The biggest difference is not to handle tensors, but to handle any type of data.""" - def __init__(self, dataset, batch_size: int = 4, shuffle: bool = True): + def __init__( + self, dataset, batch_size: int = 4, shuffle: bool = True, seed: int = 42 + ): self.dataset = dataset self.batch_size = batch_size self.shuffle = shuffle + self.seed = seed self.indices = np.arange(len(dataset)) # if self.shuffle: @@ -91,6 +94,8 @@ def set_max_steps(self, max_steps: int): def __iter__(self): if self.shuffle: + if self.seed is not None: + np.random.seed(self.seed) # Use the provided seed np.random.shuffle(self.indices) self.current_index = 0 return self @@ -104,6 +109,8 @@ def __next__(self) -> Union[np.ndarray, Tuple]: if self.current_index >= len(self.dataset): if self.shuffle: + if self.seed is not None: + np.random.seed(self.seed) # Use the same seed for reshuffle np.random.shuffle(self.indices) # Reshuffle for the new epoch self.current_index = 0 if self.step_index < self.max_steps: diff --git a/adalflow/adalflow/utils/file_io.py b/adalflow/adalflow/utils/file_io.py index 7b038d7d..83728941 100644 --- a/adalflow/adalflow/utils/file_io.py +++ b/adalflow/adalflow/utils/file_io.py @@ -5,16 +5,17 @@ from typing import Mapping, Any, Optional, List, Dict -from adalflow.utils.serialization import ( - to_dict, - serialize, -) +from adalflow.utils.serialization import to_dict, serialize, _deserialize_object_hook log = logging.getLogger(__name__) def save_json(obj: Mapping[str, Any], f: str = "task.json") -> None: - """Save the object to a json file. + """Customized Save the object to a json file. + + Support Set. + We encourage users first save the data as DataClass using to_dict, + and then load it back to DataClass using from_dict. 
Args: obj (Mapping[str, Any]): The object to be saved. @@ -29,6 +30,15 @@ def save_json(obj: Mapping[str, Any], f: str = "task.json") -> None: raise IOError(f"Error saving object to JSON file {f}: {e}") +# def standard_save_json(obj: Mapping[str, Any], f: str = "task.json") -> None: +# os.makedirs(os.path.dirname(f) or ".", exist_ok=True) +# try: +# with open(f, "w") as file: +# json.dump(obj, file, indent=4) +# except IOError as e: +# raise IOError(f"Error saving object to JSON file {f}: {e}") + + def save_csv( obj: List[Dict[str, Any]], f: str = "task.csv", fieldnames: List[str] = None ) -> None: @@ -47,6 +57,15 @@ def save_csv( writer.writeheader() for row in obj: filtered_row = {k: v for k, v in row.items() if k in fieldnames} + # use json.dumps to serialize the object + for k, v in filtered_row.items(): + if ( + isinstance(v, dict) + or isinstance(v, list) + or isinstance(v, tuple) + or isinstance(v, set) + ): + filtered_row[k] = json.dumps(v) writer.writerow(filtered_row) except IOError as e: raise IOError(f"Error saving object to CSV file {f}: {e}") @@ -82,20 +101,63 @@ def save(obj: Mapping[str, Any], f: str = "task") -> None: raise Exception(f"Error saving object to json and pickle files: {e}") -def load_json(f: str = "task.json") -> Optional[Mapping[str, Any]]: - r"""Load the object from a json file. +# def load_json(f: str = "task.json") -> Optional[Mapping[str, Any]]: +# r"""Load the object from a json file. + +# Args: +# f (str, optional): The file name. Defaults to "task". +# """ +# if not os.path.exists(f): +# log.warning(f"File {f} does not exist.") +# return None +# try: +# with open(f, "r") as file: +# return json.load(file) +# except Exception as e: +# raise Exception(f"Error loading object from JSON file {f}: {e}") + + +def load_json(f: str) -> Any: + """Customized Load a JSON file and deserialize it. Args: - f (str, optional): The file name. Defaults to "task". + f (str): The file name of the JSON file to load. 
+ + Returns: + Any: The deserialized Python object. """ if not os.path.exists(f): - log.warning(f"File {f} does not exist.") - return None + raise FileNotFoundError(f"JSON file not found: {f}") + + try: + with open(f, "r") as file: + data = json.load(file, object_hook=_deserialize_object_hook) + return data + except json.JSONDecodeError as e: + raise ValueError(f"Error decoding JSON file {f}: {e}") + except Exception as e: + raise IOError(f"Error loading JSON file {f}: {e}") + + +def load_standard_json(f: str) -> Any: + """Standard Load a JSON file and deserialize it. + Args: + f (str): The file name of the JSON file to load. + + Returns: + Any: The deserialized Python object. + """ + if not os.path.exists(f): + raise FileNotFoundError(f"JSON file not found: {f}") + try: with open(f, "r") as file: - return json.load(file) + data = json.load(file) + return data + except json.JSONDecodeError as e: + raise ValueError(f"Error decoding JSON file {f}: {e}") except Exception as e: - raise Exception(f"Error loading object from JSON file {f}: {e}") + raise IOError(f"Error loading JSON file {f}: {e}") def load_pickle(f: str = "task.pickle") -> Optional[Mapping[str, Any]]: diff --git a/adalflow/adalflow/utils/serialization.py b/adalflow/adalflow/utils/serialization.py index 5cb1dd27..929cbd60 100644 --- a/adalflow/adalflow/utils/serialization.py +++ b/adalflow/adalflow/utils/serialization.py @@ -58,6 +58,14 @@ def default(o: Any) -> Union[Dict[str, Any], str]: except Exception as e: log.error(f"Error serializing object {o}: {e}") pass + # handle set + elif isinstance(o, set): + return {"type": type(o).__name__, "data": list(o)} + else: + return {"type": type(o).__name__, "data": str(o)} + # raise NotImplementedError( + # f"Object of type {o.__class__.__name__} is not JSON serializable: {o}" + # ) elif obj_type == ObjectTypes.TYPE: log.debug(f"Object {o} is a type of {o.__name__}") try: @@ -101,9 +109,19 @@ def _deserialize_object_hook(d: Dict[str, Any]) -> Any: """Hook to 
deserialize objects based on their type.""" if "type" in d and "data" in d: class_name = d["type"] + if class_name == "set": + return set(d["data"]) + + # deseralize customized types + # TODO: all customized data types need to be saved class_type = EntityMapping.get(class_name) - if class_type: - return class_type.from_dict(d) + try: + if class_type: + return class_type.from_dict(d) + except Exception as e: + # default to the original object + log.error(f"Error deserializing object {d}: {e}") + pass return d diff --git a/adalflow/tests/test_openai_client.py b/adalflow/tests/test_openai_client.py index 9167c820..823f3ec6 100644 --- a/adalflow/tests/test_openai_client.py +++ b/adalflow/tests/test_openai_client.py @@ -362,7 +362,7 @@ async def test_acall_image_generation(self, MockAsyncOpenAI): # Mock the image generation response mock_async_client.images.generate = AsyncMock( - return_value=type('Response', (), {'data': self.mock_image_response})() + return_value=type("Response", (), {"data": self.mock_image_response})() ) # Call the acall method with image generation @@ -394,7 +394,7 @@ def test_call_image_generation(self, MockSyncOpenAI, mock_init_sync_client): # Mock the image generation response mock_sync_client.images.generate = Mock( - return_value=type('Response', (), {'data': self.mock_image_response})() + return_value=type("Response", (), {"data": self.mock_image_response})() ) # Set the sync client diff --git a/adalflow/tests/test_parameter.py b/adalflow/tests/test_parameter.py index 3da290da..a8f64f0e 100644 --- a/adalflow/tests/test_parameter.py +++ b/adalflow/tests/test_parameter.py @@ -46,6 +46,19 @@ def test_update_value(self, data, new_data): param.update_value(new_data) assert param.data == new_data, "Parameter data should be updated correctly" + def test_data_in_prompt_callable(self): + param = Parameter( + data=10, requires_opt=False, data_in_prompt=lambda x: f"Data: {x.data}" + ) + + assert ( + param.data_in_prompt(param) == "Data: 10" + ), "Data 
should be correctly formatted in the prompt" + + assert ( + param.get_prompt_data() == "Data: 10" + ), "Data should be correctly formatted in the prompt" + # def test_update_value_incorrect_type(self): # """Test updating the parameter with an incorrect type.""" # param = Parameter[int](data=10) diff --git a/adalflow/tests/test_parameter_text_grad.py b/adalflow/tests/test_parameter_text_grad.py index 91cf4dc9..64e004e0 100644 --- a/adalflow/tests/test_parameter_text_grad.py +++ b/adalflow/tests/test_parameter_text_grad.py @@ -8,36 +8,15 @@ class TestGradientContext(unittest.TestCase): def test_gradient_context_initialization(self): context = GradientContext( - context="Sample context", + input_output="Sample context", response_desc="Sample response description", variable_desc="Sample variable description", ) - self.assertEqual(context.context, "Sample context") + self.assertEqual(context.input_output, "Sample context") self.assertEqual(context.response_desc, "Sample response description") self.assertEqual(context.variable_desc, "Sample variable description") -class TestParameter(unittest.TestCase): - def setUp(self): - self.param1 = Parameter(data="Gradient 1", name="param1") - self.param2 = Parameter(data="Gradient 2", name="param2") - self.param1.gradients.append(self.param2) - self.param1.gradients_context[self.param2] = GradientContext( - context="Conversation context", - response_desc="Response description", - variable_desc="Variable description", - ) - - def test_get_gradient_text(self): - expected_output = """Batch size: 1 - -1. 
-Conversation context - -Gradient 2""" - self.assertEqual(self.param1.get_gradient_and_context_text(), expected_output) - - # def test_get_gradient_and_context_text(self): # expected_output = """ # Feedback 1.\n @@ -83,7 +62,6 @@ def test_update_prompt(self): # Check if each variable value is in the generated output # self.assertIn("Role description", result) # self.assertIn("short value", result) - self.assertIn("gradient and context text", result) # self.assertIn("", result) # self.assertIn("", result) self.assertIn("Some constraint text", result) diff --git a/adalflow/tests/test_react_agent.py b/adalflow/tests/test_react_agent.py index 244a421f..6575054c 100644 --- a/adalflow/tests/test_react_agent.py +++ b/adalflow/tests/test_react_agent.py @@ -1,2 +1,229 @@ +import unittest +from unittest.mock import Mock, patch +from adalflow.core.func_tool import FunctionTool +from adalflow.core.types import FunctionExpression, GeneratorOutput +from adalflow.components.agent.react import ReActAgent, StepOutput +from adalflow.components.model_client.openai_client import OpenAIClient + + +# Mock tools for testing +def mock_add(a: int, b: int) -> int: + return a + b + + +def mock_multiply(a: int, b: int) -> int: + return a * b + + +def mock_simple_tool(input: str) -> str: + return f"Processed: {input}" + + +class TestReActAgent(unittest.TestCase): + """Test Agent with normal functions""" + + def setUp(self): + # Mock OpenAIClient + self.mock_model_client = Mock(spec=OpenAIClient) + + # Initialize ReActAgent with mocked tools and model client + self.tools = [ + FunctionTool(mock_add), + FunctionTool(mock_multiply), + FunctionTool(mock_simple_tool), + ] + self.react_agent = ReActAgent( + tools=self.tools, + max_steps=5, + add_llm_as_fallback=False, + model_client=self.mock_model_client, + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + + def test_react_agent_initialization(self): + self.assertEqual(self.react_agent.max_steps, 5) + self.assertTrue(not 
self.react_agent.add_llm_as_fallback) + self.assertEqual( + len(self.react_agent.tool_manager.tools), 4 + ) # 3 tools + finish + fallback + + @patch.object(ReActAgent, "planner", create=True) + def test_simple_query_execution(self, mock_planner): + # Simulate a valid JSON-serializable response from the planner + mock_planner.return_value = GeneratorOutput( + data=FunctionExpression.from_function( + thought="Finish the task directly.", + func=self.react_agent._finish, + answer="Simple answer", + ) + ) + + result = self.react_agent.call("What is 2 + 2?") + self.assertEqual(result.answer, "Simple answer") + + @patch.object(ReActAgent, "planner", create=True) + def test_complex_query_execution(self, mock_planner): + # Simulate multiple steps for a complex query, each planner will return a FunctionExpression + mock_planner.side_effect = [ + GeneratorOutput( + data=FunctionExpression.from_function( + thought="Divide the task into subqueries.", func=mock_add, a=2, b=2 + ) + ), + GeneratorOutput( + data=FunctionExpression.from_function( + thought="Multiply the results.", func=mock_multiply, a=4, b=3 + ) + ), + GeneratorOutput( + data=FunctionExpression.from_function( + thought="Finish the task directly.", + func=self.react_agent._finish, + answer=12, + ) + ), + ] + + # mock the agent to run the first step + step_output = self.react_agent._run_one_step( + step=1, step_history=[], prompt_kwargs={}, model_kwargs={} + ) + print(f"step_output 1: {step_output}") + self.assertEqual(len(step_output), 1) + self.assertTrue(isinstance(step_output[0], StepOutput)) + self.assertTrue(step_output[0].action) + self.assertTrue(isinstance(step_output[0].action, FunctionExpression)) + + result = self.react_agent.call("Add 2 and 3, then multiply by 4.") + print(f"result: {result}") + self.assertEqual(result.answer, 12) + + @patch.object(ReActAgent, "planner", create=True) + def test_error_handling(self, mock_planner): + # Simulate an error scenario + mock_planner.return_value = 
GeneratorOutput( + data={ + "thought": "Encountered an error.", + "function": {"name": "finish", "args": {"answer": "Error occurred"}}, + } + ) + # no action + + # check error raised + # with self.assertRaises(ValueError): + + result = self.react_agent.call("Simulate an error.") + print(f"result 2: {result}") + self.assertIn("Error occurred", result.answer) + + +from adalflow.optim.grad_component import GradComponent + + +class GradAdd(GradComponent): + def __init__(self): + super().__init__() + + def call(self, x, y): + return x + y + + def forward(self, x, y): + return f"{x + y} + forward" + + +class GradSub(GradComponent): + def __init__(self): + super().__init__() + + def call(self, x, y): + return x - y + + def forward(self, x, y): + return f"{x - y} + forward" + + +class TestReactAgentWithComponentASTool(unittest.TestCase): + @patch("adalflow.components.model_client.openai_client.OpenAIClient", autospec=True) + def setUp(self, MockOpenAIClient): + """Set up the ReActAgent with GradComponents as tools.""" + self.add_component = GradAdd() + self.sub_component = GradSub() + + self.tools = [ + FunctionTool(fn=self.add_component.__call__, component=self.add_component), + FunctionTool(fn=self.sub_component.__call__, component=self.sub_component), + ] + + self.mock_model_client = MockOpenAIClient.return_value + self.agent = ReActAgent( + tools=self.tools, + max_steps=5, + add_llm_as_fallback=False, + model_client=self.mock_model_client, + model_kwargs={"model": "gpt-3.5-turbo"}, + ) + + def test_agent_with_eval_mode(self): + """Test the agent's behavior when GradComponents are in eval mode.""" + # Ensure components start in eval mode + self.assertFalse(self.add_component.training) + self.assertFalse(self.sub_component.training) + + # Use agent to call addition tool + result = self.agent.tool_manager.tools[0](3, 2) # GradAdd in eval mode + self.assertEqual(result.output, 5) + + # Use agent to call subtraction tool + result = self.agent.tool_manager.tools[1](5, 3) # 
GradSub in eval mode + self.assertEqual(result.output, 2) + + def test_agent_with_train_mode(self): + """Test the agent's behavior when GradComponents are in train mode.""" + # Set the agent to train mode, which should propagate to components + self.agent.train() + + self.assertTrue(self.add_component.training) + self.assertTrue(self.sub_component.training) + # as the component is not directly registered in the agent, but passed to the tool manager, it will not be in training mode + + # Use agent to call addition tool in train mode + result = self.agent.tool_manager.tools[0](3, 2) # GradAdd in train mode + self.assertEqual(result.output, "5 + forward") + + # Use agent to call subtraction tool in train mode + result = self.agent.tool_manager.tools[1](5, 3) # GradSub in train mode + self.assertEqual(result.output, "2 + forward") + + def test_agent_switch_modes(self): + """Test the agent's ability to switch between eval and train modes.""" + # Start in eval mode + self.assertFalse(self.add_component.training) + self.assertFalse(self.sub_component.training) + + # Switch to train mode + self.agent.train() + named_components = self.agent.named_components() + for name, component in named_components: + print(f"{name}: {component}") + print(f"add_component: {self.add_component}") + self.assertTrue(self.agent.tool_manager.training) + + # add component will have eval mode + self.assertTrue(self.add_component.training) + + # the tools from the tool manager will be in training mode + self.assertTrue(self.agent.tool_manager.tools[0].training) + self.assertTrue(self.agent.tool_manager.tools[1].training) + + # back to eval mode + self.agent.eval() + self.assertFalse(self.add_component.training) + self.assertFalse(self.sub_component.training) + + # tools from the tool manager will be in eval mode + self.assertFalse(self.agent.tool_manager.tools[0].training) + self.assertFalse(self.agent.tool_manager.tools[1].training) + + if __name__ == "__main__": - pass + unittest.main() diff 
--git a/adalflow/tests/test_tool.py b/adalflow/tests/test_tool.py index 441eef56..e2cb3485 100644 --- a/adalflow/tests/test_tool.py +++ b/adalflow/tests/test_tool.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from adalflow.core.func_tool import FunctionTool +from adalflow.core.tool_manager import ToolManager from adalflow.core.types import FunctionDefinition @@ -53,27 +54,150 @@ def test_function_tool_async(): tool.call(3, 4) -# def test_invalid_function_tool_initialization(): -# # Test initialization without any function should raise ValueError -# with pytest.raises(ValueError): -# tool = FunctionTool(metadata=metadata) +from adalflow.optim.grad_component import GradComponent -# def test_from_defaults_uses_function_docstring(): -# def sample_function(x, y, user: User = User(id=1, name="John")): -# """ -# Adds two numbers together and returns the sum. -# """ -# return x + y +class GradAdd(GradComponent): + def __init__(self): + super().__init__() + print(f"training: {self.training}") -# tool = FunctionTool(fn=sample_function) + def call(self, x, y): + return x + y -# expected_description = sample_function.__doc__.strip() -# actual_description = tool.metadata.description -# print(f"Expected: {expected_description}, Actual: {actual_description}") + def forward(self, x, y): + print(f"training: {self.training}") + return f"{x + y} + forward" -# # Check if the metadata description matches the function's docstring -# assert ( -# actual_description == expected_description -# ), f"The description should automatically be set to the function's docstring. 
Expected: {expected_description}, Actual: {actual_description}" +class GradSub(GradComponent): + def __init__(self): + super().__init__() + + def call(self, x, y): + return x - y + + def forward(self, x, y): + print(f"training: {self.training}") + return f"{x - y} + forward" + + +class TestComponent(GradComponent): + def __init__(self): + super().__init__() + + self.add = GradAdd() + self.sub = GradSub() + + print(f"sub_component: {self.sub.training}") + + print(f"add_component: {self.add.training}") + + def add_as_tool(x, y): + return self.add(x, y) + + self.tools = [ + FunctionTool(fn=add_as_tool, component=self.add), + FunctionTool(fn=self.sub.__call__, component=self.sub), + ] + + +add = GradAdd() +sub = GradSub() + + +class TestComponnetInstanceOutsideComponent(GradComponent): + def __init__(self): + super().__init__() + + print(f"sub_component: {sub.training}") + + print(f"add_component: {add.training}") + + def add_as_tool(x, y): + return add(x, y) + + self.tools = [ + FunctionTool(fn=add_as_tool, component=add), + FunctionTool(fn=sub.__call__, component=sub), + ] + + +class TestToolManagerComponent(GradComponent): + + def __init__(self): + super().__init__() + + print(f"sub_component: {sub.training}") + + print(f"add_component: {add.training}") + + def add_as_tool(x, y): + return add(x, y) + + self.tools = [ + FunctionTool(fn=add_as_tool, component=add), + FunctionTool(fn=sub.__call__, component=sub), + ] + + # manag by tool manager, and since the component is passed to tools_manager which is also a component, it will be in training mode + self.tools_manager = ToolManager(tools=self.tools) + + +def test_function_tool_with_grad_component(): + r"""When we set the training mode of the component, the subcomponents will change with it. + Once the subcomponent change, it will adapt to training model too. 
+ """ + + test_com = TestComponent() + assert not test_com.training + # call the tools + output = test_com.tools[0](1, 2) + # ensure it is the call method that is called + assert output.output == 3 + test_com.train() + assert test_com.training + assert test_com.add.training + # ensure it is the forward method that is called + output = test_com.tools[0](1, 2) + assert output.output == "3 + forward" + + +def test_component_instance_outside_component(): + r"""When we set the training mode of the component, the subcomponents will change with it. + Once the subcomponent change, it will adapt to training model too. + """ + + test_com = TestComponnetInstanceOutsideComponent() + assert not test_com.training + # call the tools + output = test_com.tools[0](1, 2) + # ensure it is the call method that is called + assert output.output == 3 + test_com.train() + assert test_com.training + assert not add.training # the subcomponent is no longer in training mode + # ensure it is the forward method that is called + output = test_com.tools[0](1, 2) + assert output.output == 3 + + +def test_tool_manager_with_grad_component(): + r"""When we set the training mode of the component, the subcomponents will change with it. + Once the subcomponent change, it will adapt to training model too. 
+ """ + + test_com = TestToolManagerComponent() + assert not test_com.training + # call the tools + output = test_com.tools_manager.tools[0](1, 2) + # ensure it is the call method that is called + assert output.output == 3 + test_com.train() + assert test_com.training + assert ( + add.training + ) # the subcomponent will change as it is managed by the tool manager + # ensure it is the forward method that is called + output = test_com.tools_manager.tools[0](1, 2) + assert output.output == "3 + forward" diff --git a/benchmarks/BHH_object_count/dspy_count.py b/benchmarks/BHH_object_count/dspy_count.py index 63d1bd7f..7be300d8 100644 --- a/benchmarks/BHH_object_count/dspy_count.py +++ b/benchmarks/BHH_object_count/dspy_count.py @@ -1,11 +1,26 @@ import dspy -from use_cases.question_answering.bhh_object_count.data import _parse_integer_answer - +import re turbo = dspy.OpenAI(model="gpt-3.5-turbo") dspy.settings.configure(lm=turbo) +def parse_integer_answer(answer: str): + """A function that parses the last integer from a string using regular expressions.""" + try: + # Use regular expression to find all sequences of digits + numbers = re.findall(r"\d+", answer) + if numbers: + # Get the last number found + answer = int(numbers[-1]) + else: + answer = -1 + except ValueError: + answer = -1 + + return answer + + class GenerateAnswer(dspy.Signature): # NOTE: this doc string acts as the system prompt """You will answer a reasoning question. 
Think step by step.""" # The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value.""" @@ -27,14 +42,14 @@ def __init__(self): def forward(self, question): pred = self.generate_answer(question=question) - answer = _parse_integer_answer(pred.answer, only_first_line=False) + answer = parse_integer_answer(pred.answer) answer = str(answer) # dspy will assume these fields are strings not integers # print(f"Pred: {pred}, Answer: {answer}") return dspy.Prediction(answer=answer) if __name__ == "__main__": - from lightrag.utils import setup_env + from adalflow.utils import setup_env setup_env() obj = ObjectCount() diff --git a/benchmarks/BHH_object_count/train.py b/benchmarks/BHH_object_count/train.py index 7d3e357e..67625b8e 100644 --- a/benchmarks/BHH_object_count/train.py +++ b/benchmarks/BHH_object_count/train.py @@ -5,6 +5,11 @@ from benchmarks.BHH_object_count.dspy_count import ObjectCount +turbo = dspy.OpenAI(model="gpt-3.5-turbo-0125") + +gpt_4 = dspy.OpenAI(model="gpt-4o") +dspy.configure(lm=turbo) + def validate_exact_match(example, pred, trace=None): if dspy.evaluate.answer_exact_match(example, pred): @@ -14,10 +19,10 @@ def validate_exact_match(example, pred, trace=None): return acc -def load_datasets(max_samples=10): - from use_cases.question_answering.bhh_object_count.data import load_datasets +def load_datasets(): + from use_cases.question_answering.bbh.data import load_datasets - trainset, valset, testset = load_datasets(max_samples=max_samples) + trainset, valset, testset = load_datasets() # dspy requires us to package the dataset to Example objects and specify the inputs dspy_trainset, dspy_valset, dspy_testset = [], [], [] @@ -25,7 +30,7 @@ def load_datasets(max_samples=10): [trainset, valset, testset], [dspy_trainset, dspy_valset, dspy_testset] ): for item in dataset[0]: - example = Example(question=item.x, answer=item.y) + example = Example(question=item.question, answer=item.answer) example = 
example.with_inputs("question") dataset[1].append(example) @@ -43,6 +48,35 @@ def train(dspy_trainset=None): return compiled_count +def train_MIPROv2(trainset, valset, save_path, filename): + + import os + from dspy.teleprompt import MIPROv2 + + if not os.path.exists(save_path): + os.makedirs(save_path) + + tp = MIPROv2( + metric=validate_exact_match, + prompt_model=gpt_4, + task_model=turbo, + num_candidates=30, + init_temperature=1.0, + ) + compiled_task = tp.compile( + ObjectCount(), + trainset=trainset, + valset=valset, + max_bootstrapped_demos=5, + max_labeled_demos=2, + num_batches=12, # MINIBATCH_SIZE = 25, + seed=2025, + requires_permission_to_run=False, + ) + compiled_task.save(os.path.join(save_path, filename)) + return compiled_task + + def validate(dataset, compiled_count): from tqdm import tqdm @@ -54,33 +88,67 @@ def validate(dataset, compiled_count): return sum(acc_list) / len(acc_list) +def train_and_validate(): + save_path = "benchmarks/BHH_object_count/models/dspy" + if not os.path.exists(save_path): + os.makedirs(save_path) + + import time + import tqdm + + dspy_trainset, dspy_valset, dspy_testset = load_datasets() + + val_accs = [] + test_accs = [] + training_times = [] + + num_runs = 4 + + for i in tqdm.tqdm(range(num_runs)): + start = time.time() + output_file = f"compiled_count_{i}.json" + + compiled_count = train_MIPROv2( + dspy_trainset, dspy_valset, save_path, output_file + ) + val_acc = validate(dspy_valset, compiled_count) + test_acc = validate(dspy_testset, compiled_count) + + val_accs.append(val_acc) + test_accs.append(test_acc) + + training_times.append(time.time() - start) + + # compute the mean and standard deviation + import numpy as np + + val_accs = np.array(val_accs) + test_accs = np.array(test_accs) + training_times = np.array(training_times) + + print("Validation accuracy:", val_accs.mean(), val_accs.std()) + print("Test accuracy:", test_accs.mean(), test_accs.std()) + + print("Training time:", training_times.mean()) + + if 
__name__ == "__main__": - from benchmarks.BHH_object_count.dspy_count import GenerateAnswer + from adalflow.utils import setup_env + import os + setup_env() + save_path = "benchmarks/BHH_object_count/models/dspy" - if not os.path.exists(save_path): - os.makedirs(save_path) - example = GenerateAnswer( - question="How many musical instruments do I have?", answer="5" - ) - pred = GenerateAnswer( - question="How many musical instruments do I have?", answer="5" - ) - print(validate_exact_match(example, pred)) - - dspy_trainset, dspy_valset, dspy_testset = load_datasets(max_samples=4) - - start_val_acc = validate(dspy_valset, ObjectCount()) - start_test_acc = validate(dspy_testset, ObjectCount()) - print("Starting validation accuracy:", start_val_acc) - print("Starting test accuracy:", start_test_acc) - pass - - compiled_count = train(dspy_trainset) - val_acc = validate(dspy_valset, compiled_count) - test_acc = validate(dspy_testset, compiled_count) - compiled_count.save(os.path.join(save_path, "compiled_count.json")) - print("Validation accuracy:", val_acc) - print("Test accuracy:", test_acc) + train_and_validate() + # if not os.path.exists(save_path): + # os.makedirs(save_path) + + # example = GenerateAnswer( + # question="How many musical instruments do I have?", answer="5" + # ) + # pred = GenerateAnswer( + # question="How many musical instruments do I have?", answer="5" + # ) + # print(validate_exact_match(example, pred)) diff --git a/benchmarks/hotpot_qa/_adal_train.py b/benchmarks/hotpot_qa/_adal_train.py deleted file mode 100644 index e397cf0f..00000000 --- a/benchmarks/hotpot_qa/_adal_train.py +++ /dev/null @@ -1,664 +0,0 @@ -"deprecated" -"""We will use dspy's retriever to keep that the same and only use our generator and optimizer""" - -import dspy -from typing import List, Union, Optional, Dict, Callable -from dataclasses import dataclass, field - -import adalflow as adal -from adalflow.optim.parameter import Parameter, ParameterType - -from 
adalflow.datasets.hotpot_qa import HotPotQA, HotPotQAData -from adalflow.datasets.types import Example - -from adalflow.core.retriever import Retriever - - -colbertv2_wiki17_abstracts = dspy.ColBERTv2( - url="http://20.102.90.50:2017/wiki17_abstracts" -) - -dspy.settings.configure(rm=colbertv2_wiki17_abstracts) - - -def load_datasets(): - - trainset = HotPotQA(split="train", size=20) # 20 - valset = HotPotQA(split="val", size=50) # 50 - testset = HotPotQA(split="test", size=50) # to keep the same as the dspy #50 - print(f"trainset, valset: {len(trainset)}, {len(valset)}, example: {trainset[0]}") - return trainset, valset, testset - - -# task pipeline -from typing import Any, Tuple - -from adalflow.core import Component, Generator - - -# dspy format -# Follow the following format. -# Context: may contain relevant facts -# Question: ${question} -# Reasoning: Let's think step by step in order to ${produce the query}. We ... -# Query: ${query} -@dataclass -class QueryRewritterData(adal.DataClass): - reasoning: str = field( - metadata={"desc": "The reasoning to produce the query"}, - ) - query: str = field( - metadata={"desc": "The query you produced"}, - ) - - __output_fields__ = ["reasoning", "query"] - - -@dataclass -class AnswerData(adal.DataClass): - reasoning: str = field( - metadata={"desc": "The reasoning to produce the answer"}, - ) - answer: str = field( - metadata={"desc": "The answer you produced"}, - ) - - __output_fields__ = ["reasoning", "answer"] - - -query_template = """ -Write a simple search query that will help answer a complex question. - -You will receive a context(may contain relevant facts) and a question. -Think step by step. - -{{output_format_str}} -{# Few shot demos #} -{% if few_shot_demos is not none %} -Here are some examples: -{{few_shot_demos}} -{% endif %} - - -Context: {{context}} -Question: {{question}} - -""" - -# Library gives a standard template for easy prompt -answer_template = """ -Answer questions with short factoid answers. 
- -You will receive context(may contain relevabt facts) and a question. -Think step by step. -{{output_format_str}} -{# Few shot demos #} -{% if few_shot_demos is not none %} -Here are some examples: -{{few_shot_demos}} -{% endif %} - - -Context: {{context}} -Question: {{question}} -""" - -from adalflow.core.component import fun_to_component -import re - - -@fun_to_component -def parse_string_query(text: str) -> str: - return re.search(r"Query: (.*)", text).group(1) - - -@fun_to_component -def parse_string_answer(text: str) -> str: - return re.search(r"Answer: (.*)", text).group(1) - - -from dataclasses import dataclass, field - - -@dataclass -class HotPotQADemoData(Example): - context: List[str] = field( - metadata={"desc": "The context to be used for answering the question"}, - default_factory=list, - ) - score: float = field( - metadata={"desc": "The score of the answer"}, - default=None, - ) - - -from benchmarks.hotpot_qa.dspy_train import validate_context_and_answer_and_hops - - -def convert_y_pred_to_dataclass(y_pred): - # y_pred in both eval and train mode - context: List[str] = ( - y_pred.input_args["prompt_kwargs"]["context"] - if hasattr(y_pred, "input_args") - else [] - ) - # context_str = "\n".join(context) - data = y_pred.data if hasattr(y_pred, "data") else y_pred - return DynamicDataClassFactory.from_dict( - class_name="HotPotQAData", - data={ - "answer": data, - "context": context, - }, - ) - - -def eval_fn(sample, y_pred, metadata): - if isinstance(sample, Parameter): - sample = sample.data - y_pred_obj = convert_y_pred_to_dataclass(y_pred) - return 1 if validate_context_and_answer_and_hops(sample, y_pred_obj) else 0 - - -from adalflow.core.types import RetrieverOutput, GeneratorOutput - - -# Demonstrating how to wrap other retriever to adalflow retriever and be applied in training pipeline -class DspyRetriever(Retriever): - def __init__(self, k=3): - super().__init__() - self.k = k - self.dspy_retriever = dspy.Retrieve(k=k) - - def call(self, 
input: str) -> List[RetrieverOutput]: - output = self.dspy_retriever(query_or_queries=input, k=self.k) - print(f"dsy_retriever output: {output}") - final_output: List[RetrieverOutput] = [] - documents = output.passages - - final_output.append( - RetrieverOutput( - query=input, - documents=documents, - doc_indices=[], - ) - ) - print(f"final_output: {final_output}") - return final_output - - -# example need to have question, -# pred needs to have query - -import adalflow as adal - - -# User customize an auto-grad operator -class MultiHopRetriever(adal.Retriever): - def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): - super().__init__() - - self.passages_per_hop = passages_per_hop - self.max_hops = max_hops - - self.data_parser = adal.DataClassParser( - data_class=QueryRewritterData, return_data_class=True, format_type="yaml" - ) - - # Grad Component - self.query_generator = Generator( - name="query_generator", - model_client=model_client, - model_kwargs=model_kwargs, - prompt_kwargs={ - "few_shot_demos": Parameter( - name="few_shot_demos_1", - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=True, - param_type=ParameterType.DEMOS, - ), - "output_format_str": self.data_parser.get_output_format_str(), - }, - template=query_template, - # output_processors=parse_string_query, - output_processors=self.data_parser, - use_cache=True, - # demo_data_class=HotPotQADemoData, - # demo_data_class_input_mapping={ - # "question": "question", - # # "context": "context", - # }, - # demo_data_class_output_mapping={"answer": lambda x: x.raw_response}, - ) - self.retrieve = DspyRetriever(k=passages_per_hop) - - @staticmethod - def context_to_str(context: List[str]) -> str: - return "\n".join(context) - - def call(self, *, question: str, id: str = None) -> Any: # Add id for tracing - # inference mode!!! 
- # output = self.forward(question, id=id) - - context = [] - self.max_hops = 1 - for hop in range(self.max_hops): - gen_out = self.query_generator( - prompt_kwargs={ - "context": self.context_to_str(context), - "question": question, - }, - id=id, - ) - query = None - # TODO: the bridge between the retriever to the generator and generator to the retriever needs to be more smooth - if isinstance(gen_out, GeneratorOutput): - query = ( # noqa: F841 - gen_out.data.query if gen_out.data and gen_out.data.query else None - ) - elif isinstance(gen_out, adal.Parameter): - gen_out.successor_map_fn = lambda x: ( - x.full_response.data.query - if x.full_response and x.full_response.data - else None - ) - print(f"gen_out: {gen_out}") - # query = ( - # gen_out.full_response.data.query - # if gen_out.full_response and gen_out.full_response.data - # else None - # ) - retrieve_out = self.retrieve(input=gen_out) - print(f"retrieve_out: {retrieve_out}") - # passages = [] - # if isinstance(retrieve_out, Parameter): - # passages = retrieve_out.data[0].documents - # else: - # passages = retrieve_out[0].documents - - # print(f"passages: {passages}") - - # context = deduplicate(context + passages) - - # # for hop in range(self.max_hops): - # last_context_param = Parameter( - # data=context, - # name=f"query_context_{id}_{0}", - # requires_opt=True, - # ) - # query = self.query_generator( - # prompt_kwargs={ - # "context": last_context_param, - # "question": question, - # }, - # id=id, - # ) - # print(f"query: {query}") - # if isinstance(query, GeneratorOutput): - # query = query.data - # output = self.retrieve(query) - # print(f"output: {output}") - # print(f"output call: {output}") - # return output[0].documents - - # def forward(self, question: str, id: str = None) -> Parameter: - # question_param = question - # if not isinstance(question, Parameter): - # question_param = Parameter( - # data=question, - # name="question", - # role_desc="The question to be answered", - # 
requires_opt=False, - # ) - # context = [] - # self.max_hops = 1 - # # for hop in range(self.max_hops): - # last_context_param = Parameter( - # data=context, - # name=f"query_context_{id}_{0}", - # requires_opt=True, - # ) - # query = self.query_generator( - # prompt_kwargs={ - # "context": last_context_param, - # "question": question_param, - # }, - # id=id, - # ) - # print(f"query: {query}") - # if isinstance(query, GeneratorOutput): - # query = query.data - # output = self.retrieve(query) - # print(f"output: {output}") - # passages = [] - # if isinstance(output, Parameter): - # passages = output.data[0].documents - # else: - # passages = output[0].documents - # # context = deduplicate(context + passages) # all these needs to gradable - # # output_param = Parameter( - # # data=passages, - # # alias=f"qa_context_{id}", - # # role_desc="The context to be used for answering the question", - # # requires_opt=True, - # # ) - # output.data = passages # reset the values to be used in the next - # if not isinstance(output, Parameter): - # raise ValueError(f"Output must be a Parameter, got {output}") - # return output - # # output_param.set_grad_fn( - # # BackwardContext( - # # backward_fn=self.backward, - # # response=output_param, - # # id=id, - # # prededecessors=prededecessors, - # # ) - # # ) - # # return output_param - - def backward(self, response: Parameter, id: Optional[str] = None): - print(f"MultiHopRetriever backward: {response}") - children_params = response.predecessors - # backward score to the demo parameter - for pred in children_params: - if pred.requires_opt: - # pred._score = float(response._score) - pred.set_score(response._score) - print( - f"backpropagate the score {response._score} to {pred.name}, is_teacher: {self.teacher_mode}" - ) - if pred.param_type == ParameterType.DEMOS: - # Accumulate the score to the demo - pred.add_score_to_trace( - trace_id=id, score=response._score, is_teacher=self.teacher_mode - ) - print(f"Pred: {pred.name}, traces: 
{pred._traces}") - - -class HotPotQARAG( - Component -): # use component as not creating a new ops, but assemble existing ops - r"""Same system prompt as text-grad paper, but with our one message prompt template, which has better starting performance""" - - def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): - super().__init__() - - self.passages_per_hop = passages_per_hop - self.max_hops = max_hops - - self.multi_hop_retriever = MultiHopRetriever( - model_client=model_client, - model_kwargs=model_kwargs, - passages_per_hop=passages_per_hop, - max_hops=max_hops, - ) - # TODO: sometimes the cache will collide, so we get different evaluation - self.llm_counter = Generator( - name="QuestionAnswering", - model_client=model_client, - model_kwargs=model_kwargs, - prompt_kwargs={ - "few_shot_demos": Parameter( - name="few_shot_demos", - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=True, - param_type=ParameterType.DEMOS, - ) - }, - template=answer_template, - output_processors=parse_string_answer, - use_cache=True, - demo_data_class=HotPotQADemoData, - demo_data_class_input_mapping={ - "question": "question", - "context": "context", - }, - demo_data_class_output_mapping={"answer": lambda x: x.raw_response}, - ) - - # TODO: the error will be a context - # a component wont handle training, forward or backward, just passing everything through - def call(self, question: str, id: str = None) -> Union[Parameter, str]: - - # normal component, will be called when in inference mode - - question_param = Parameter( - data=question, - name="question", - role_desc="The question to be answered", - requires_opt=False, - ) - context = [] # noqa: F841 - output = None - retrieved_context = self.multi_hop_retriever(question_param, id=id) - - # forming a backpropagation graph - # Make this step traceable too. 
- # for hop in range(self.max_hops): - # # make context a parameter to be able to trace - # query = self.query_generator( - # prompt_kwargs={ - # "context": Parameter( - # data=context, alias=f"query_context_{id}", requires_opt=True - # ), - # "question": question_param, - # }, - # id=id, - # ) - # print(f"query: {query}") - # if isinstance(query, GeneratorOutput): - # query = query.data - # output = self.retrieve(query) - # print(f"output: {output}") - # passages = [] - # if isinstance(output, Parameter): - # passages = output.data[0].documents - # else: - # output[0].documents - # context = deduplicate(context + passages) - # print(f"context: {context}") - - output = self.llm_counter( - prompt_kwargs={ - "context": retrieved_context, - "question": question_param, - }, - id=id, - ) # already support both training (forward + call) - - if ( - not self.training - ): # if users want to customize the output, ensure to use if not self.training - - # convert the generator output to a normal data format - print(f"converting output: {output}") - - if output.data is None: - error_msg = ( - f"Error in processing the question: {question}, output: {output}" - ) - print(error_msg) - output = error_msg - else: - output = output.data - return output - - -from adalflow.optim.trainer.adal import AdalComponent -from adalflow.optim.trainer.trainer import Trainer -from adalflow.optim.few_shot.bootstrap_optimizer import BootstrapFewShot -from adalflow.eval.answer_match_acc import AnswerMatchAcc -from adalflow.optim.text_grad.text_loss_with_eval_fn import EvalFnToTextLoss -from adalflow.core.base_data_class import DynamicDataClassFactory - - -class HotPotQARAGAdal(AdalComponent): - # TODO: move teacher model or config in the base class so users dont feel customize too much - def __init__(self, task: Component, teacher_model_config: dict): - super().__init__() - self.task = task - self.teacher_model_config = teacher_model_config - - self.evaluator = AnswerMatchAcc("fuzzy_match") - 
self.eval_fn = self.evaluator.compute_single_item - # self.eval_fn = eval_fn - - def handle_one_task_sample( - self, sample: HotPotQAData - ) -> Any: # TODO: auto id, with index in call train examples - return self.task, {"question": sample.question, "id": sample.id} - - def handle_one_loss_sample( - self, sample: HotPotQAData, y_pred: Any - ) -> Tuple[Callable, Dict]: - return self.loss_fn.forward, { - "kwargs": { - "y": y_pred, - "y_gt": Parameter( - data=sample.answer, - role_desc="The ground truth(reference correct answer)", - name="y_gt", - requires_opt=False, - ), - } - } - - def configure_optimizers(self, *args, **kwargs): - - # TODO: simplify this, make it accept generator - parameters = [] - for name, param in self.task.named_parameters(): - param.name = name - parameters.append(param) - do = BootstrapFewShot(params=parameters) - return [do] - - def evaluate_one_sample( - self, sample: Any, y_pred: Any, metadata: Dict[str, Any] - ) -> Any: - - # we need "context" be passed as metadata - # print(f"sample: {sample}, y_pred: {y_pred}") - # convert pred to Dspy structure - - # y_obj = convert_y_pred_to_dataclass(y_pred) - # print(f"y_obj: {y_obj}") - # raise ValueError("Stop here") - if metadata: - return self.eval_fn(sample, y_pred, metadata) - return self.eval_fn(sample, y_pred) - - def configure_teacher_generator(self): - super().configure_teacher_generator(**self.teacher_model_config) - - def configure_loss_fn(self): - self.loss_fn = EvalFnToTextLoss( - eval_fn=self.eval_fn, - eval_fn_desc="ObjectCountingEvalFn, Output accuracy score: 1 for correct, 0 for incorrect", - backward_engine=None, - ) - - -def validate_dspy_demos( - demos_file="benchmarks/BHH_object_count/models/dspy/hotpotqa.json", -): - from adalflow.utils.file_io import load_json - - demos_json = load_json(demos_file) - - demos = demos_json["generate_answer"]["demos"] # noqa: F841 - - # task = HotPotQARAG( # noqa: F841 - # **gpt_3_model, - # passages_per_hop=3, - # max_hops=2, - # ) - # 
task.llm_counter.p - - -def test_multi_hop_retriever(): - - from use_cases.config import ( - gpt_3_model, - ) - - multi_hop_retriever = MultiHopRetriever( - **gpt_3_model, - passages_per_hop=3, - max_hops=2, - ) - # 1. use print - # print(multi_hop_retriever.query_generator) - # # 2. run one forward for query generator - question = "How many storeys are in the castle that David Gregory inherited?" - # context = [] - # context_str = multi_hop_retriever.context_to_str(context) - # print( - # multi_hop_retriever.query_generator( - # prompt_kwargs={"question": question, "context": context_str}, id="1" - # ) - # ) - # # verfify the prompt - # multi_hop_retriever.query_generator.print_prompt( - # **{"question": question, "context": context_str} - # ) - - # training mode - multi_hop_retriever.train() - - # 3. run one forward for retriever - print(multi_hop_retriever(question=question, id="1")) - - -def train(): - trainset, valset, testset = load_datasets() - - from use_cases.config import ( - gpt_3_model, - gpt_4o_model, - ) - - task = HotPotQARAG( - **gpt_3_model, - passages_per_hop=3, - max_hops=2, - ) - print(task) - question = "How long is the highway Whitehorse/Cousins Airport was built to support as of 2012?" 
- print(task(question)) - - # for name, param in task.named_parameters(): - # print(f"name: {name}, param: {param}") - - trainset, valset, testset = load_datasets() - - trainer = Trainer( - adaltask=HotPotQARAGAdal(task=task, teacher_model_config=gpt_4o_model), - max_steps=10, - raw_shots=0, - bootstrap_shots=4, - train_batch_size=4, - ckpt_path="hotpot_qa_rag", - strategy="random", - save_traces=True, - debug=True, # make it having debug mode - weighted_sampling=True, - ) - # fit include max steps - trainer.fit( - train_dataset=trainset, val_dataset=valset, test_dataset=testset, debug=True - ) - - -if __name__ == "__main__": - ### Try the minimum effort to test on any task - - # get_logger(level="DEBUG") - test_multi_hop_retriever() - - -# TODO: i forgot that i need demo_data_class -# TODO: i forgot that i need to set id -# Failed to generate demos but no error messages diff --git a/benchmarks/hotpot_qa/adal_exp/build.py b/benchmarks/hotpot_qa/adal_exp/build.py deleted file mode 100644 index 9f1d078c..00000000 --- a/benchmarks/hotpot_qa/adal_exp/build.py +++ /dev/null @@ -1,630 +0,0 @@ -"""We will use dspy's retriever to keep that the same and only use our generator and optimizer""" - -import dspy -import re -from typing import List, Union, Optional, Dict, Callable -from dataclasses import dataclass, field - -import adalflow as adal -from adalflow.optim.parameter import Parameter, ParameterType - -from adalflow.datasets.hotpot_qa import HotPotQA, HotPotQAData -from adalflow.datasets.types import Example - -from adalflow.core.retriever import Retriever -from adalflow.core.component import fun_to_component - - -colbertv2_wiki17_abstracts = dspy.ColBERTv2( - url="http://20.102.90.50:2017/wiki17_abstracts" -) - -dspy.settings.configure(rm=colbertv2_wiki17_abstracts) - - -def load_datasets(): - - trainset = HotPotQA(split="train", size=20) - valset = HotPotQA(split="val", size=50) - testset = HotPotQA(split="test", size=50) - print(f"trainset, valset: {len(trainset)}, 
{len(valset)}, example: {trainset[0]}") - return trainset, valset, testset - - -# task pipeline -from typing import Any, Tuple - -from adalflow.core import Component, Generator - - -# dspy format -# Follow the following format. -# Context: may contain relevant facts -# Question: ${question} -# Reasoning: Let's think step by step in order to ${produce the query}. We ... -# Query: ${query} -@dataclass -class QueryRewritterData(adal.DataClass): - reasoning: str = field( - metadata={"desc": "The reasoning to produce the query"}, - ) - query: str = field( - metadata={"desc": "The query you produced"}, - ) - - __output_fields__ = ["reasoning", "query"] - - -@dataclass -class AnswerData(adal.DataClass): - reasoning: str = field( - metadata={"desc": "The reasoning to produce the answer"}, - ) - answer: str = field( - metadata={"desc": "The answer you produced"}, - ) - - __output_fields__ = ["reasoning", "answer"] - - -query_template = """ -Write a simple search query that will help answer a complex question. - -You will receive a context(may contain relevant facts) and a question. -Think step by step. - -{{output_format_str}} -{# Few shot demos #} -{% if few_shot_demos is not none %} -Here are some examples: -{{few_shot_demos}} -{% endif %} - - -Context: {{context}} -Question: {{question}} - -""" - -# Library gives a standard template for easy prompt -answer_template = """ -Answer questions with short factoid answers. - -You will receive context(may contain relevabt facts) and a question. -Think step by step. 
-{{output_format_str}} -{# Few shot demos #} -{% if few_shot_demos is not none %} -Here are some examples: -{{few_shot_demos}} -{% endif %} - - -Context: {{context}} -Question: {{question}} -""" - - -# @fun_to_component -# def parse_string_query(text: str) -> str: -# return re.search(r"Query: (.*)", text).group(1) - - -@fun_to_component -def parse_string_answer(text: str) -> str: - return re.search(r"Answer: (.*)", text).group(1) - - -from dataclasses import dataclass, field - - -@dataclass -class HotPotQADemoData(Example): - context: List[str] = field( - metadata={"desc": "The context to be used for answering the question"}, - default_factory=list, - ) - score: float = field( - metadata={"desc": "The score of the answer"}, - default=None, - ) - - -from adalflow.core.types import RetrieverOutput, GeneratorOutput - - -# Demonstrating how to wrap other retriever to adalflow retriever and be applied in training pipeline -class DspyRetriever(Retriever): - def __init__(self, k=3): - super().__init__() - self.k = k - self.dspy_retriever = dspy.Retrieve(k=k) - - def call(self, input: str) -> List[RetrieverOutput]: - output = self.dspy_retriever(query_or_queries=input, k=self.k) - print(f"dsy_retriever output: {output}") - final_output: List[RetrieverOutput] = [] - documents = output.passages - - final_output.append( - RetrieverOutput( - query=input, - documents=documents, - doc_indices=[], - ) - ) - print(f"final_output: {final_output}") - return final_output - - -import adalflow as adal - - -# User customize an auto-grad operator -class MultiHopRetriever(adal.Retriever): - def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): - super().__init__() - - self.passages_per_hop = passages_per_hop - self.max_hops = max_hops - - self.data_parser = adal.DataClassParser( - data_class=QueryRewritterData, return_data_class=True, format_type="yaml" - ) - - # Grad Component - self.query_generator = Generator( - name="query_generator", - 
model_client=model_client, - model_kwargs=model_kwargs, - prompt_kwargs={ - "few_shot_demos": Parameter( - name="few_shot_demos_1", - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=True, - param_type=ParameterType.DEMOS, - ), - "output_format_str": self.data_parser.get_output_format_str(), - }, - template=query_template, - # output_processors=parse_string_query, - output_processors=self.data_parser, - use_cache=True, - # demo_data_class=HotPotQADemoData, - # demo_data_class_input_mapping={ - # "question": "question", - # # "context": "context", - # }, - # demo_data_class_output_mapping={"answer": lambda x: x.raw_response}, - ) - self.retrieve = DspyRetriever(k=passages_per_hop) - - @staticmethod - def context_to_str(context: List[str]) -> str: - return "\n".join(context) - - def call(self, *, question: str, id: str = None) -> Any: # Add id for tracing - # inference mode!!! - # output = self.forward(question, id=id) - - context = [] - self.max_hops = 1 - for hop in range(self.max_hops): - gen_out = self.query_generator( - prompt_kwargs={ - "context": self.context_to_str(context), - "question": question, - }, - id=id, - ) - query = None - # TODO: the bridge between the retriever to the generator and generator to the retriever needs to be more smooth - if isinstance(gen_out, GeneratorOutput): - query = ( # noqa: F841 - gen_out.data.query if gen_out.data and gen_out.data.query else None - ) - elif isinstance(gen_out, adal.Parameter): - gen_out.successor_map_fn = lambda x: ( - x.full_response.data.query - if x.full_response and x.full_response.data - else None - ) - print(f"gen_out: {gen_out}") - # query = ( - # gen_out.full_response.data.query - # if gen_out.full_response and gen_out.full_response.data - # else None - # ) - retrieve_out = self.retrieve(input=gen_out) - print(f"retrieve_out: {retrieve_out}") - # passages = [] - # if isinstance(retrieve_out, Parameter): - # passages = retrieve_out.data[0].documents - # else: - # 
passages = retrieve_out[0].documents - - # print(f"passages: {passages}") - - # context = deduplicate(context + passages) - - # # for hop in range(self.max_hops): - # last_context_param = Parameter( - # data=context, - # name=f"query_context_{id}_{0}", - # requires_opt=True, - # ) - # query = self.query_generator( - # prompt_kwargs={ - # "context": last_context_param, - # "question": question, - # }, - # id=id, - # ) - # print(f"query: {query}") - # if isinstance(query, GeneratorOutput): - # query = query.data - # output = self.retrieve(query) - # print(f"output: {output}") - # print(f"output call: {output}") - # return output[0].documents - - # def forward(self, question: str, id: str = None) -> Parameter: - # question_param = question - # if not isinstance(question, Parameter): - # question_param = Parameter( - # data=question, - # name="question", - # role_desc="The question to be answered", - # requires_opt=False, - # ) - # context = [] - # self.max_hops = 1 - # # for hop in range(self.max_hops): - # last_context_param = Parameter( - # data=context, - # name=f"query_context_{id}_{0}", - # requires_opt=True, - # ) - # query = self.query_generator( - # prompt_kwargs={ - # "context": last_context_param, - # "question": question_param, - # }, - # id=id, - # ) - # print(f"query: {query}") - # if isinstance(query, GeneratorOutput): - # query = query.data - # output = self.retrieve(query) - # print(f"output: {output}") - # passages = [] - # if isinstance(output, Parameter): - # passages = output.data[0].documents - # else: - # passages = output[0].documents - # # context = deduplicate(context + passages) # all these needs to gradable - # # output_param = Parameter( - # # data=passages, - # # alias=f"qa_context_{id}", - # # role_desc="The context to be used for answering the question", - # # requires_opt=True, - # # ) - # output.data = passages # reset the values to be used in the next - # if not isinstance(output, Parameter): - # raise ValueError(f"Output must be a 
Parameter, got {output}") - # return output - # # output_param.set_grad_fn( - # # BackwardContext( - # # backward_fn=self.backward, - # # response=output_param, - # # id=id, - # # prededecessors=prededecessors, - # # ) - # # ) - # # return output_param - - def backward(self, response: Parameter, id: Optional[str] = None): - print(f"MultiHopRetriever backward: {response}") - children_params = response.predecessors - # backward score to the demo parameter - for pred in children_params: - if pred.requires_opt: - # pred._score = float(response._score) - pred.set_score(response._score) - print( - f"backpropagate the score {response._score} to {pred.name}, is_teacher: {self.teacher_mode}" - ) - if pred.param_type == ParameterType.DEMOS: - # Accumulate the score to the demo - pred.add_score_to_trace( - trace_id=id, score=response._score, is_teacher=self.teacher_mode - ) - print(f"Pred: {pred.name}, traces: {pred._traces}") - - -class HotPotQARAG( - Component -): # use component as not creating a new ops, but assemble existing ops - r"""Same system prompt as text-grad paper, but with our one message prompt template, which has better starting performance""" - - def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): - super().__init__() - - self.passages_per_hop = passages_per_hop - self.max_hops = max_hops - - self.multi_hop_retriever = MultiHopRetriever( - model_client=model_client, - model_kwargs=model_kwargs, - passages_per_hop=passages_per_hop, - max_hops=max_hops, - ) - # TODO: sometimes the cache will collide, so we get different evaluation - self.llm_counter = Generator( - name="QuestionAnswering", - model_client=model_client, - model_kwargs=model_kwargs, - prompt_kwargs={ - "few_shot_demos": Parameter( - name="few_shot_demos", - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=True, - param_type=ParameterType.DEMOS, - ) - }, - template=answer_template, - output_processors=parse_string_answer, - 
use_cache=True, - demo_data_class=HotPotQADemoData, - demo_data_class_input_mapping={ - "question": "question", - "context": "context", - }, - demo_data_class_output_mapping={"answer": lambda x: x.raw_response}, - ) - - # TODO: the error will be a context - # a component wont handle training, forward or backward, just passing everything through - def call(self, question: str, id: str = None) -> Union[Parameter, str]: - - # normal component, will be called when in inference mode - - question_param = Parameter( - data=question, - name="question", - role_desc="The question to be answered", - requires_opt=False, - ) - context = [] # noqa: F841 - output = None - retrieved_context = self.multi_hop_retriever(question_param, id=id) - - # forming a backpropagation graph - # Make this step traceable too. - # for hop in range(self.max_hops): - # # make context a parameter to be able to trace - # query = self.query_generator( - # prompt_kwargs={ - # "context": Parameter( - # data=context, alias=f"query_context_{id}", requires_opt=True - # ), - # "question": question_param, - # }, - # id=id, - # ) - # print(f"query: {query}") - # if isinstance(query, GeneratorOutput): - # query = query.data - # output = self.retrieve(query) - # print(f"output: {output}") - # passages = [] - # if isinstance(output, Parameter): - # passages = output.data[0].documents - # else: - # output[0].documents - # context = deduplicate(context + passages) - # print(f"context: {context}") - - output = self.llm_counter( - prompt_kwargs={ - "context": retrieved_context, - "question": question_param, - }, - id=id, - ) # already support both training (forward + call) - - if ( - not self.training - ): # if users want to customize the output, ensure to use if not self.training - - # convert the generator output to a normal data format - print(f"converting output: {output}") - - if output.data is None: - error_msg = ( - f"Error in processing the question: {question}, output: {output}" - ) - print(error_msg) - 
output = error_msg - else: - output = output.data - return output - - -from adalflow.optim.trainer.adal import AdalComponent -from adalflow.optim.trainer.trainer import Trainer -from adalflow.optim.few_shot.bootstrap_optimizer import BootstrapFewShot -from adalflow.eval.answer_match_acc import AnswerMatchAcc -from adalflow.optim.text_grad.text_loss_with_eval_fn import EvalFnToTextLoss - - -class HotPotQARAGAdal(AdalComponent): - # TODO: move teacher model or config in the base class so users dont feel customize too much - def __init__(self, task: Component, teacher_model_config: dict): - super().__init__() - self.task = task - self.teacher_model_config = teacher_model_config - - self.evaluator = AnswerMatchAcc("fuzzy_match") - self.eval_fn = self.evaluator.compute_single_item - # self.eval_fn = eval_fn - - def handle_one_task_sample( - self, sample: HotPotQAData - ) -> Any: # TODO: auto id, with index in call train examples - return self.task, {"question": sample.question, "id": sample.id} - - def handle_one_loss_sample( - self, sample: HotPotQAData, y_pred: Any - ) -> Tuple[Callable, Dict]: - return self.loss_fn.forward, { - "kwargs": { - "y": y_pred, - "y_gt": Parameter( - data=sample.answer, - role_desc="The ground truth(reference correct answer)", - name="y_gt", - requires_opt=False, - ), - } - } - - def configure_optimizers(self, *args, **kwargs): - - # TODO: simplify this, make it accept generator - parameters = [] - for name, param in self.task.named_parameters(): - param.name = name - parameters.append(param) - do = BootstrapFewShot(params=parameters) - return [do] - - def evaluate_one_sample( - self, sample: Any, y_pred: Any, metadata: Dict[str, Any] - ) -> Any: - - # we need "context" be passed as metadata - # print(f"sample: {sample}, y_pred: {y_pred}") - # convert pred to Dspy structure - - # y_obj = convert_y_pred_to_dataclass(y_pred) - # print(f"y_obj: {y_obj}") - # raise ValueError("Stop here") - if metadata: - return self.eval_fn(sample, y_pred, 
metadata) - return self.eval_fn(sample, y_pred) - - def configure_teacher_generator(self): - super().configure_teacher_generator(**self.teacher_model_config) - - def configure_loss_fn(self): - self.loss_fn = EvalFnToTextLoss( - eval_fn=self.eval_fn, - eval_fn_desc="ObjectCountingEvalFn, Output accuracy score: 1 for correct, 0 for incorrect", - backward_engine=None, - ) - - -def validate_dspy_demos( - demos_file="benchmarks/BHH_object_count/models/dspy/hotpotqa.json", -): - from adalflow.utils.file_io import load_json - - demos_json = load_json(demos_file) - - demos = demos_json["generate_answer"]["demos"] # noqa: F841 - - # task = HotPotQARAG( # noqa: F841 - # **gpt_3_model, - # passages_per_hop=3, - # max_hops=2, - # ) - # task.llm_counter.p - - -def test_multi_hop_retriever(): - - from use_cases.config import ( - gpt_3_model, - ) - - multi_hop_retriever = MultiHopRetriever( - **gpt_3_model, - passages_per_hop=3, - max_hops=2, - ) - # 1. use print - # print(multi_hop_retriever.query_generator) - # # 2. run one forward for query generator - question = "How many storeys are in the castle that David Gregory inherited?" - # context = [] - # context_str = multi_hop_retriever.context_to_str(context) - # print( - # multi_hop_retriever.query_generator( - # prompt_kwargs={"question": question, "context": context_str}, id="1" - # ) - # ) - # # verfify the prompt - # multi_hop_retriever.query_generator.print_prompt( - # **{"question": question, "context": context_str} - # ) - - # training mode - multi_hop_retriever.train() - - # 3. run one forward for retriever - print(multi_hop_retriever(question=question, id="1")) - - -def train(): - trainset, valset, testset = load_datasets() - - from use_cases.config import ( - gpt_3_model, - gpt_4o_model, - ) - - task = HotPotQARAG( - **gpt_3_model, - passages_per_hop=3, - max_hops=2, - ) - print(task) - question = "How long is the highway Whitehorse/Cousins Airport was built to support as of 2012?" 
- print(task(question)) - - # for name, param in task.named_parameters(): - # print(f"name: {name}, param: {param}") - - trainset, valset, testset = load_datasets() - - trainer = Trainer( - adaltask=HotPotQARAGAdal(task=task, teacher_model_config=gpt_4o_model), - max_steps=10, - raw_shots=0, - bootstrap_shots=4, - train_batch_size=4, - ckpt_path="hotpot_qa_rag", - strategy="random", - save_traces=True, - debug=True, # make it having debug mode - weighted_sampling=True, - ) - # fit include max steps - trainer.fit( - train_dataset=trainset, val_dataset=valset, test_dataset=testset, debug=True - ) - - -if __name__ == "__main__": - ### Try the minimum effort to test on any task - - # get_logger(level="DEBUG") - test_multi_hop_retriever() - - -# TODO: i forgot that i need demo_data_class -# TODO: i forgot that i need to set id -# Failed to generate demos but no error messages diff --git a/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py b/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py index cebcfdf2..3d50e25d 100644 --- a/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py +++ b/benchmarks/hotpot_qa/adal_exp/build_multi_hop_rag.py @@ -1,7 +1,7 @@ """We will use dspy's retriever to keep that the same and only use our generator and optimizer""" import dspy -from typing import List +from typing import List, Optional from dataclasses import dataclass, field import adalflow as adal @@ -12,6 +12,9 @@ from benchmarks.hotpot_qa.adal_exp.build_vanilla_rag import DspyRetriever from adalflow.utils.logger import printc +from adalflow.components.agent.react import ReActAgent + +from adalflow.optim.grad_component import GradComponent2 colbertv2_wiki17_abstracts = dspy.ColBERTv2( url="http://20.102.90.50:2017/wiki17_abstracts" @@ -52,17 +55,34 @@ class QueryRewritterData(adal.DataClass): {% endif %} -Context: {{context}} Question: {{question}} +{% if last_query is not none %} +Last Query: {{last_query}} +{% endif %} +{% if context is not none %} +Context from last search 
query: {{context}} +{% endif %} """ +@dataclass +class QueriesOutput(adal.DataClass): + data: str = field( + metadata={"desc": "The joined queries"}, + ) + id: str = field( + metadata={"desc": "The id of the output"}, + ) + + class DeduplicateList(adal.GradComponent): def __init__(self): super().__init__() - def call(self, exisiting_list: List[str], new_list: List[str]) -> List[str]: + def call( + self, exisiting_list: List[str], new_list: List[str], id: str = None + ) -> List[str]: seen = set() return [x for x in exisiting_list + new_list if not (x in seen or seen.add(x))] @@ -73,12 +93,80 @@ def backward(self, *args, **kwargs): return super().backward(*args, **kwargs) -# User customize an auto-grad operator -# Need this to be a GradComponent +class CombineList(GradComponent2): + def __init__( + self, + name="CombineRetrieverOut", + desc="combines two lists and deduplicate with set", + ): + super().__init__(name=name, desc=desc) + + def call( + self, + context_1: adal.RetrieverOutput, + context_2: adal.RetrieverOutput, + id: str = None, + ) -> List[str]: + + seen = set() + lists_1 = context_1.documents + lists_2 = context_2.documents + combined = [x for x in lists_1 + lists_2 if not (x in seen or seen.add(x))] + + output = adal.RetrieverOutput( + id=id, + # query=f"query 1: {context_1.query}, query 2: {context_2.query}", + query=[context_1.query, context_2.query], + documents=combined, + doc_indices=[], + ) + return output + +class CombineQueries(GradComponent2): + def __init__( + self, + name="CombineTwoQueries using ','", + desc="combines two queries for evaluation", + ): + super().__init__(name=name, desc=desc) -# NOTE: deprecated -class MultiHopRetriever(adal.Retriever): + def call( + self, + q_1: str, + q_2: str, + id: str = None, + ) -> QueriesOutput: + + value = f"{q_1}, {q_2}" + + output = QueriesOutput(data=value, id=id) + + return output + + +query_generator_task_desc = """Write a simple search query that will help answer a complex question. 
+ +You will receive a context(may contain relevant facts) and a question. +Think step by step.""" + + +task_desc_str = """ +You will receive an original question, last search query, and the retrieved context from the last search query. +Write the next search query to help retrieve all relevant context to answer the original question. +Think step by step.""" + +task_desc_str_system_finetuned = """ +Write a search query to identify key information step by step. Begin by extracting names or entities directly referenced in the question. Use retrieved data to iteratively refine subsequent queries, targeting specific attributes such as filmographies, roles, or numerical criteria (e.g., number of movies or TV shows). Adjust the query dynamically based on gaps or ambiguities in retrieved results. +""" + +task_desc_system_finedtuned_separately = [ + "Write a search query that extracts the key entity or fact required to begin answering the question. Focus on identifying specific names, titles, or roles directly referenced in the question. The query should aim to retrieve precise and relevant details (e.g., the name of a person, cast members of a movie, or associated facts) to refine understanding of the question.", + "Based on the retrieved results, refine the search query to target detailed information that resolves the question. Use retrieved entities or partial answers to adjust the query dynamically. 
If gaps or ambiguities remain, incorporate criteria from the original question (e.g., specific numbers, attributes, or context) to improve precision and relevance.", +] + + +class MultiHopRetrieverCycle(adal.Retriever): def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): super().__init__() @@ -89,146 +177,187 @@ def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): data_class=QueryRewritterData, return_data_class=True, format_type="json" ) - # Grad Component - self.query_generators: List[adal.Generator] = [] - for i in range(self.max_hops): - self.query_generators.append( - adal.Generator( - name=f"query_generator_{i}", - model_client=model_client, - model_kwargs=model_kwargs, - prompt_kwargs={ - "few_shot_demos": Parameter( - name="few_shot_demos_1", - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=True, - param_type=ParameterType.DEMOS, - ), - "task_desc_str": Parameter( - name="task_desc_str", - data="""Write a simple search query that will help answer a complex question. + # only one generator which will be used in a loop, called max_hops times + self.query_generator: adal.Generator = adal.Generator( + name="query_generator", + model_client=model_client, + model_kwargs=model_kwargs, + prompt_kwargs={ + # "few_shot_demos": Parameter( + # name="few_shot_demos", + # data=None, + # role_desc="To provide few shot demos to the language model", + # requires_opt=True, + # param_type=ParameterType.DEMOS, + # ), + "task_desc_str": Parameter( + name="task_desc_str", + data=task_desc_str, + # data=task_desc_str_system_finetuned, + # data=task_desc_system_finedtuned_separately[0], + role_desc="Task description for the language model. 
Used together with \ + the following template: \ + Question: {{question}} \ +{% if last_query is not none %} \ +Last Query: {{last_query}}\ +{% endif %}\ +{% if context is not none %}\ +Context from last search query: {{context}}\ +{% endif %}", + requires_opt=True, + param_type=ParameterType.PROMPT, + ), + "output_format_str": self.data_parser.get_output_format_str(), + }, + template=query_template, + output_processors=self.data_parser, + use_cache=True, + ) -You will receive a context(may contain relevant facts) and a question. -Think step by step.""", - role_desc="Task description for the language model", - requires_opt=True, - param_type=ParameterType.PROMPT, - ), - "output_format_str": self.data_parser.get_output_format_str(), - }, - template=query_template, - output_processors=self.data_parser, - use_cache=True, - ) - ) self.retriever = DspyRetriever(top_k=passages_per_hop) self.deduplicater = DeduplicateList() + self.combine_list = CombineList() @staticmethod def context_to_str(context: List[str]) -> str: return "\n".join(context) - @staticmethod - def deduplicate(seq: list[str]) -> list[str]: - """ - Source: https://stackoverflow.com/a/480227/1493011 - """ - - seen = set() - return [x for x in seq if not (x in seen or seen.add(x))] - - def call(self, *, question: str, id: str = None) -> adal.RetrieverOutput: - context = [] - print(f"question: {question}") - for i in range(self.max_hops): - gen_out = self.query_generators[i]( - prompt_kwargs={ - "context": self.context_to_str(context), - "question": question, - }, - id=id, - ) - - query = gen_out.data.query if gen_out.data and gen_out.data.query else None + def call(self, *, input: str, id: str = None) -> List[adal.RetrieverOutput]: + # assemble the foundamental building blocks + out = self.forward(input=input, id=id) - print(f"query {i}: {query}") + if not isinstance(out, adal.Parameter): + raise ValueError("The output should be a parameter") - retrieve_out = self.retriever.call(input=query) - passages = 
retrieve_out[0].documents - context = self.deduplicate(context + passages) - out = [adal.RetrieverOutput(documents=context, query=query, doc_indices=[])] - return out + return out.data # or full response its up to users - def forward(self, *, question: str, id: str = None) -> adal.Parameter: - # assemble the foundamental building blocks + def forward(self, *, input: str, id: str = None) -> adal.Parameter: context = [] - print(f"question: {question}") # 1. make question a parameter as generator does not have it yet # can create the parameter at the leaf, but not the intermediate nodes question_param = adal.Parameter( name="question", - data=question, + data=input, role_desc="The question to be answered", - requires_opt=True, - param_type=ParameterType.INPUT, - ) - context_param = adal.Parameter( - name="context", - data=context, - role_desc="The context to be used for the query", - requires_opt=True, + requires_opt=False, param_type=ParameterType.INPUT, ) - context_param.add_successor_map_fn( - successor=self.query_generators[0], - map_fn=lambda x: self.context_to_str(x.data), - ) + contexts = [] + last_query = None for i in range(self.max_hops): - gen_out = self.query_generators[i].forward( + gen_out = self.query_generator.forward( prompt_kwargs={ - "context": context_param, + "context": context, "question": question_param, + "last_query": last_query, + # "task_desc_str": task_desc_system_finedtuned_separately[ + # i + # ], # replace this at runtime }, id=id, ) - + # prompt_kwargs = { + # "context": context, + # "question": question_param, + # "last_query": last_query, + # } + # prompt = self.query_generator.get_prompt(**prompt_kwargs) + # printc(f"prompt: {prompt}", "yellow") + + # printc(f"query {i}: {gen_out.data.data.query}", "yellow") + # extract the query from the generator output success_map_fn = lambda x: ( # noqa E731 - x.full_response.data.query - if x.full_response - and x.full_response.data - and x.full_response.data.query - else None + 
x.data.data.query + if x.data and x.data.data and x.data.data.query + else (x.data.raw_response if x.data and x.data.raw_response else None) ) - print(f"query {i}: {success_map_fn(gen_out)}") + # print(f"query {i}: {success_map_fn(gen_out)}") gen_out.add_successor_map_fn( successor=self.retriever, map_fn=success_map_fn ) + # printc(f"before retrieve_out: {success_map_fn(gen_out)}", "yellow") - retrieve_out = self.retriever.forward(input=gen_out) + # retrieve the passages + retrieve_out: adal.Parameter = self.retriever.forward(input=gen_out, id=id) + # printc(f"retrieve_out: {retrieve_out}", "yellow") - def retrieve_out_map_fn(x: adal.Parameter): - return x.data[0].documents if x.data and x.data[0].documents else [] + retrieve_out.data_in_prompt = lambda x: { + "query": x.data.query, + "documents": x.data.documents, + } + if i + 1 < self.max_hops: + last_query = gen_out + + last_query.add_successor_map_fn( + successor=self.query_generator, map_fn=success_map_fn + ) - print(f"retrieve_out: {retrieve_out}") + def retrieve_out_map_fn(x: adal.Parameter): + return x.data.documents if x.data and x.data.documents else [] + # add the map function to the retrieve_out retrieve_out.add_successor_map_fn( successor=self.deduplicater, map_fn=retrieve_out_map_fn ) + context = retrieve_out + if i + 1 < self.max_hops: + context.add_successor_map_fn( + successor=self.query_generator, map_fn=retrieve_out_map_fn + ) + + contexts.append(context) + + contexts[0].add_successor_map_fn( + successor=self.combine_list, map_fn=lambda x: x.data + ) + contexts[1].add_successor_map_fn( + successor=self.combine_list, map_fn=lambda x: x.data + ) + + context_sum = self.combine_list.forward(contexts[0], contexts[1]) + return context_sum - context_param = self.deduplicater.forward( - exisiting_list=context_param, new_list=retrieve_out - ) - context_param.param_type = ParameterType.RETRIEVER_OUTPUT +# task_desc_str = """Write a simple search query that will help answer a complex question. 
- return context_param +# You will receive a context(may contain relevant facts) and a question. +# Think step by step.""" -class MultiHopRetriever2(adal.Retriever): +trained_task_desc_strs = [ + "You are tasked with formulating precise search queries using the original question, last search query, and its retrieved context. Prioritize identifying, emphasizing, and explicitly including all crucial entities, relationships, and geographical details mentioned in the question. Ensure comprehensive retrieval by focusing on key elements such as specific individuals (e.g., 'Kyrie Irving'), roles, or contextual details required for accuracy. Demonstrate reasoning by cross-referencing multiple sources and provide clear examples where necessary. Adapt queries to capture all nuances effectively for improved relevance and accuracy. Think step by step.", + "You will receive an original question, the last search query, and the retrieved context from that search. Write the next search query to ensure comprehensive retrieval of all relevant context needed to answer the original question. Emphasize identifying, precisely including, and verifying specific key entities, historical events, and factual names directly linked to the question within the context. Explicitly use the context to confirm and match critical entities to improve recall and ensure consistency with the targeted entities. Avoid irrelevant inclusions or false positives by cross-referencing data and verifying alignment accurately. Think step by step.", +] + +trained_task_desc_strs = [ + "You will receive an original question, last search query, and the retrieved context from the last search query. Identify key entities, explicitly named individuals, and specific versions (e.g., specific film versions) in the original question to ensure comprehensive and focused retrieval. Craft a refined search query to help retrieve relevant context, prioritizing connections and biographical details needed. 
Think step by step.", + "You will receive an original question, last search query, and the retrieved context from the last search query. Analyze both the question and context to craft the next search query. Focus on all pertinent entities, especially notable individuals, mentioned in the question and context to ensure comprehensive coverage. Think step by step.", +] + +few_shot_demos = [ + "reasoning: The question is asking for the individual who defeated Sander Levin in\n a specific election, the Michigan gubernatorial election of 1970. I need to determine\n who his opponent was and who won that election. Hence, I should focus the search\n on the Michigan gubernatorial election of 1970, Sander Levin, and the name of the\n winner.\nquery: Michigan gubernatorial election 1970 winner Sander Levin\n\nquestion: What is the name of this American law firm headquartered in Little Rock,\n Arkansas, which was co-founded by Robert Crittenden?\nanswer: Rose Law Firm", + "reasoning: The context provides information about Kirk Humphreys, the chairman of\n The Humphreys Company, and his birth date as September 13, 1950. It also mentions\n that he lost in a primary to former Congressman Tom Coburn, who is a medical doctor.\n To determine who is older, we need to find the birth date of Tom Coburn.\nquery: Tom Coburn birth date\n\nquestion: In which century was football introduced to this region represented by FC\n Espanya de Barcelona?\nanswer: 19th century", +] + +manual_task_desc_strs = [ + "You will receive an question that requires 2 retrieveal steps to have enough context to answer. \ + You are the first step, write a simple search query to retrieve the first part of the context. \ + Think step by step.", + "You will receive an original question, last search query, and the retrieved context from the last search query. Write the next search query to help retrieve all relevant context to answer the original question. 
Think step by step.", +] + + +# task_desc_str = """ You are a query assistant that helps search all relevant context to answer a multi-hop question. + +# You will a question, and existing context(may contain relevant facts along with its sub-questions). +# Write a new simple search query to help retrieve the relevant context to answer the question. +# Think step by step.""" + + +class MultiHopRetriever(adal.Component): def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): super().__init__() @@ -239,11 +368,10 @@ def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): data_class=QueryRewritterData, return_data_class=True, format_type="json" ) - # Grad Component - # self.query_generators: List[adal.Generator] = [] self.query_generators: adal.ComponentList[adal.Generator] = adal.ComponentList() self.retrievers: List[Retriever] = [] self.deduplicaters: List[adal.GradComponent] = [] + for i in range(self.max_hops): self.query_generators.append( adal.Generator( @@ -251,20 +379,27 @@ def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): model_client=model_client, model_kwargs=model_kwargs, prompt_kwargs={ - "few_shot_demos": Parameter( - name=f"few_shot_demos_{i}", - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=True, - param_type=ParameterType.DEMOS, - ), + # "few_shot_demos": Parameter( + # name=f"few_shot_demos_{i}", + # # data=few_shot_demos[i], + # data=None, + # role_desc="To provide few shot demos to the language model", + # requires_opt=True, + # param_type=ParameterType.DEMOS, + # ), "task_desc_str": Parameter( name="task_desc_str", - data="""Write a simple search query that will help answer a complex question. - -You will receive a context(may contain relevant facts) and a question. 
-Think step by step.""", - role_desc="Task description for the language model", + data=task_desc_str, + # data=manual_task_desc_strs[i], + role_desc=f"""Task description for the {i+1}th language model.""" + + "Used together with the following template: \ +Question: {{question}} \ +{% if last_query is not none %} \ +Last Query: {{last_query}}\ +{% endif %}\ +{% if context is not none %}\ +Context from last search query: {{context}}\ +{% endif %}", requires_opt=True, param_type=ParameterType.PROMPT, ), @@ -278,6 +413,9 @@ def __init__(self, model_client, model_kwargs, passages_per_hop=3, max_hops=2): self.retrievers.append(DspyRetriever(top_k=passages_per_hop)) self.deduplicaters.append(DeduplicateList()) + self.combine_list = CombineList() + self.combine_queries = CombineQueries() + @staticmethod def context_to_str(context: List[str]) -> str: return "\n".join(context) @@ -291,51 +429,75 @@ def deduplicate(seq: list[str]) -> list[str]: seen = set() return [x for x in seq if not (x in seen or seen.add(x))] - # def call(self, *, question: str, id: str = None) -> adal.RetrieverOutput: - # context = [] - # print(f"question: {question}") - # for i in range(self.max_hops): - # gen_out = self.query_generators[i]( - # prompt_kwargs={ - # "context": self.context_to_str(context), - # "question": question, - # }, - # id=id, - # ) - - # query = gen_out.data.query if gen_out.data and gen_out.data.query else None - - # print(f"query {i}: {query}") - - # retrieve_out = self.retrievers[i].call(input=query) - # passages = retrieve_out[0].documents - # context = self.deduplicate(context + passages) - # out = [adal.RetrieverOutput(documents=context, query=query, doc_indices=[])] - # return out - - # TODO: simplify and avoid the need where users need to write two methods (call and forward) - def call(self, *, input: str, id: str = None) -> List[adal.RetrieverOutput]: - # assemble the foundamental building blocks - printc(f"question: {input}", "yellow") - out = self.forward(input=input, 
id=id) + def call(self, *, input: str, id: str = None) -> adal.RetrieverOutput: + context = [] + queries: List[str] = [] + last_query = None + for i in range(self.max_hops): + gen_out = self.query_generators[i]( + prompt_kwargs={ + "context": context, + "question": input, + "last_query": last_query, + }, + id=id, + ) - if not isinstance(out, adal.Parameter): - raise ValueError("The output should be a parameter") + query = gen_out.data.query if gen_out.data and gen_out.data.query else input - return out.data # or full response its up to users + retrieve_out = self.retrievers[i](input=query, id=id) + + passages = retrieve_out.documents + context = self.deduplicate(context + passages) + queries.append(query) + last_query = query + out = adal.RetrieverOutput( + documents=context, query=queries, doc_indices=[], id=id + ) + return out + + def call2(self, *, input: str, id: str = None) -> str: + context = [] + queries: List[str] = [] + last_query = None + for i in range(self.max_hops): + gen_out = self.query_generators[i]( + prompt_kwargs={ + "context": context, + "question": input, + "last_query": last_query, + }, + id=id, + ) + + query = gen_out.data.query if gen_out.data and gen_out.data.query else input + + retrieve_out = self.retrievers[i](input=query, id=id) + + passages = retrieve_out.documents + context = self.deduplicate(context + passages) + queries.append(query) + last_query = query + out = ", ".join(queries) + query_output = QueriesOutput(data=out, id=id) + return query_output def forward(self, *, input: str, id: str = None) -> adal.Parameter: # assemble the foundamental building blocks - printc(f"question: {input}", "yellow") - context = [] + # printc(f"question: {input}", "yellow") + # context = [] queries: List[str] = [] - for i in range(self.max_hops): + context = [] + last_query = None + contexts: List[Parameter] = [] - gen_out = self.query_generators[i].forward( + for i in range(self.max_hops): + gen_out: Parameter = self.query_generators[i].forward( 
prompt_kwargs={ - "context": context, # can be a list or a parameter + "context": context, + "last_query": last_query, "question": adal.Parameter( name="question", data=input, @@ -348,17 +510,11 @@ def forward(self, *, input: str, id: str = None) -> adal.Parameter: ) success_map_fn = lambda x: ( # noqa E731 - x.full_response.data.query - if x.full_response - and x.full_response.data - and x.full_response.data.query - else ( - x.full_response.raw_response - if x.full_response and x.full_response.raw_response - else None - ) + x.data.data.query + if x.data and x.data.data and x.data.data.query + else (x.data.raw_response if x.data and x.data.raw_response else None) ) - print(f"query {i}: {success_map_fn(gen_out)}") + # printc(f"query {i}: {success_map_fn(gen_out)}") queries.append(success_map_fn(gen_out)) @@ -372,41 +528,153 @@ def forward(self, *, input: str, id: str = None) -> adal.Parameter: retrieve_out = self.retrievers[i].forward(input=gen_out, id=id) def retrieve_out_map_fn(x: adal.Parameter): - return x.data[0].documents if x.data and x.data[0].documents else [] + return x.data.documents if x.data and x.data.documents else [] # print(f"retrieve_out: {retrieve_out}") - retrieve_out.add_successor_map_fn( - successor=self.deduplicaters[i], map_fn=retrieve_out_map_fn + # retrieve_out.add_successor_map_fn( + # successor=self.deduplicaters[i], map_fn=retrieve_out_map_fn + # ) + + # context = self.deduplicaters[i].forward( + # exisiting_list=context, new_list=retrieve_out + # ) + retrieve_out.data_in_prompt = lambda x: { + "query": x.data.query, + "documents": x.data.documents, + } + context = retrieve_out + if i + 1 < self.max_hops: + context.add_successor_map_fn( + successor=self.query_generators[i + 1], map_fn=retrieve_out_map_fn + ) + last_query = success_map_fn(gen_out) + contexts.append(retrieve_out) + # if i + 1 < self.max_hops: + # retrieve_out.add_successor_map_fn( + # successor=self.query_generators[i + 1], map_fn=retrieve_out_map_fn + # ) + + # last_query 
= success_map_fn(gen_out) + # printc(f"retrieve_out, last_query: {last_query}") + + contexts[0].add_successor_map_fn( + successor=self.combine_list, map_fn=lambda x: x.data + ) + contexts[1].add_successor_map_fn( + successor=self.combine_list, map_fn=lambda x: x.data + ) + contexts_sum = self.combine_list.forward( + context_1=contexts[0], context_2=contexts[1] + ) + contexts_sum.data_in_prompt = lambda x: { + "query": x.data.query, + "documents": x.data.documents, + } + + return contexts_sum + + # TODO: might need to support multiple output parameters + def forward2(self, *, input: str, id: str = None) -> List[adal.Parameter]: + r"""Experiment multiple output parameters for multiple evaluation.""" + # assemble the foundamental building blocks + printc(f"question: {input}", "yellow") + + queries: List[adal.Parameter] = [] + + context = [] + last_query = None + contexts: List[Parameter] = [] + + for i in range(self.max_hops): + gen_out: Parameter = self.query_generators[i].forward( + prompt_kwargs={ + "context": context, + "last_query": last_query, + "question": adal.Parameter( + name="question", + data=input, + role_desc="The question to be answered", + requires_opt=False, + param_type=ParameterType.INPUT, + ), + }, + id=id, ) - context = self.deduplicaters[i].forward( - exisiting_list=context, new_list=retrieve_out + success_map_fn = lambda x: ( # noqa E731 + x.data.data.query + if x.data and x.data.data and x.data.data.query + else (x.data.raw_response if x.data and x.data.raw_response else None) ) + # printc(f"query {i}: {success_map_fn(gen_out)}") - context.param_type = ParameterType.RETRIEVER_OUTPUT + # queries.append(success_map_fn(gen_out)) + queries.append(gen_out) - def context_to_retrover_output(x): - return [ - adal.RetrieverOutput( - documents=x.data, query=[input] + queries, doc_indices=[] - ) - ] + gen_out.add_successor_map_fn( + successor=self.retrievers[i], map_fn=success_map_fn + ) - context.data = context_to_retrover_output(context) + if 
success_map_fn(gen_out) is None: + raise ValueError(f"The query is None, please check the generator {i}") + + retrieve_out = self.retrievers[i].forward(input=gen_out, id=id) - printc(f"MultiHopRetriever2 grad fn: {context.grad_fn}", "yellow") + def retrieve_out_map_fn(x: adal.Parameter): + return x.data.documents if x.data and x.data.documents else [] - return context + # print(f"retrieve_out: {retrieve_out}") - def backward(self, *args, **kwargs): + # retrieve_out.add_successor_map_fn( + # successor=self.deduplicaters[i], map_fn=retrieve_out_map_fn + # ) + context = retrieve_out + if i + 1 < self.max_hops: + context.add_successor_map_fn( + successor=self.query_generators[i + 1], map_fn=retrieve_out_map_fn + ) - printc(f"MultiHopRetriever2 backward: {args}", "yellow") - super().backward(*args, **kwargs) - return + # context = self.deduplicaters[i].forward( + # exisiting_list=context, new_list=retrieve_out + # ) + contexts.append(retrieve_out) + if i + 1 < self.max_hops: + retrieve_out.add_successor_map_fn( + successor=self.query_generators[i + 1], map_fn=retrieve_out_map_fn + ) + + last_query = success_map_fn(gen_out) + # printc(f"retrieve_out, last_query: {last_query}") + + # contexts[0].add_successor_map_fn( + # successor=self.combine_list, map_fn=lambda x: x.data + # ) + # contexts[1].add_successor_map_fn( + # successor=self.combine_list, map_fn=lambda x: x.data + # ) + # contexts_sum = self.combine_list.forward( + # context_1=contexts[0], context_2=contexts[1] + # ) + # contexts_sum.data_in_prompt = lambda x: { + # "query": x.data.query, + # "documents": x.data.documents, + # } + # setattr(contexts_sum, "queries", [q.data.data.query for q in queries]) + queries[0].add_successor_map_fn( + successor=self.combine_queries, map_fn=lambda x: x.data.data.query + ) + queries[1].add_successor_map_fn( + successor=self.combine_queries, map_fn=lambda x: x.data.data.query + ) + combined_queries = self.combine_queries.forward(q_1=queries[0], q_2=queries[1]) + 
printc(f"queries: {combined_queries.data}", "yellow") + return combined_queries -from benchmarks.hotpot_qa.adal_exp.build_vanilla_rag import VanillaRAG +from benchmarks.hotpot_qa.adal_exp.build_vanilla_rag import ( + VanillaRAG, +) class MultiHopRAG(VanillaRAG): @@ -418,7 +686,28 @@ def __init__( model_client=model_client, model_kwargs=model_kwargs, ) - self.retriever = MultiHopRetriever2( + self.retriever = MultiHopRetriever( + model_client=model_client, + model_kwargs=model_kwargs, + passages_per_hop=passages_per_hop, + max_hops=max_hops, + ) + # update the parameters to untainable + # for name, param in self.llm.named_parameters(): + # param.requires_opt = False + # printc(f"param: {name} requires_opt: {param.requires_opt}", "yellow") + + +class MultiHopRAGCycle(VanillaRAG): + def __init__( + self, passages_per_hop=3, max_hops=2, model_client=None, model_kwargs=None + ): + super().__init__( + passages_per_hop=passages_per_hop, + model_client=model_client, + model_kwargs=model_kwargs, + ) + self.retriever = MultiHopRetrieverCycle( model_client=model_client, model_kwargs=model_kwargs, passages_per_hop=passages_per_hop, @@ -426,6 +715,111 @@ def __init__( ) +# TODO: agent needs storage for the context instead of all in the step history. 
+class AgenticRAG(adal.GradComponent): + def __init__(self, model_client, model_kwargs): + super().__init__() + + self.dspy_retriever = DspyRetriever(top_k=2) + # self.llm_parser = adal.DataClassParser( + # data_class=AnswerData, return_data_class=True, format_type="json" + # ) + # self.llm = adal.Generator( + # model_client=model_client, + # model_kwargs=model_kwargs, + # template=answer_template, + # prompt_kwargs={ + # "task_desc_str": adal.Parameter( + # data=task_desc_str, + # role_desc="Task description for the language model", + # param_type=adal.ParameterType.PROMPT, + # requires_opt=True, + # ), + # "few_shot_demos": adal.Parameter( + # data=None, + # requires_opt=None, + # role_desc="To provide few shot demos to the language model", + # param_type=adal.ParameterType.DEMOS, + # ), + # "output_format_str": self.llm_parser.get_output_format_str(), + # }, + # output_processors=self.llm_parser, + # ) + + # self.context = [] + + def dspy_retriever_as_tool( + input: str, + # context_variables: Dict, + id: Optional[str] = None, + ) -> List[str]: + r"""Retrieves the top 2 passages from using input as the query. + Ensure you get all the context to answer the original question. + """ + output = self.dspy_retriever(input=input, id=id) + parsed_output = output + if isinstance(output, adal.Parameter): + parsed_output = output.data.documents + return parsed_output + documents = parsed_output.documents + # if context_variables: + # context_variables["context"].extend(documents) + return documents + + # def generator_as_tool( + # input: str, + # context_variables: Dict, + # id: Optional[str] = None, + # ) -> str: + # r"""Generates the answer to the question(input) and the context from the context_variables(Dict). + # Example: generator_as_tool(original question, context_variables=context_variables) + + # YOU MUST call generator_as_tool once to produce the final answer. 
+ # """ + # context = context_variables["context"] + # # print(f"context: {context}") + # output = self.llm( + # prompt_kwargs={"question": input, "context": context}, id=id + # ) + # return output + + from adalflow.core.func_tool import FunctionTool + + tools = [ + FunctionTool(self.dspy_retriever.__call__, component=self.dspy_retriever), + # FunctionTool(generator_as_tool, component=self.llm), + ] # NOTE: agent is not doing well to call component methods at this moment + + tools = [ + FunctionTool(dspy_retriever_as_tool, component=self.dspy_retriever), + # FunctionTool(generator_as_tool, component=self.llm), + ] + + self.agent = ReActAgent( + max_steps=3, + add_llm_as_fallback=False, + tools=tools, + model_client=model_client, + model_kwargs=model_kwargs, + context_variables=None, + ) + + def forward(self, *args, **kwargs) -> Parameter: + return self.bicall(*args, **kwargs) + + def call(self, *args, **kwargs): + return self.bicall(*args, **kwargs) + + def bicall(self, input: str, id: str = None) -> str: + out = self.agent(input=input, id=id) + if isinstance(out, adal.Parameter): + return out + return out # .observation ReactOutput + # if isinstance(out, adal.Parameter): + # return out.data[-1].observation + # return out[-1].observation + + def test_multi_hop_retriever(): from use_cases.config import ( @@ -438,17 +832,73 @@ def test_multi_hop_retriever(): max_hops=2, ) + question = "How many storeys are in the castle that David Gregory inherited?" 
+ print(f"multi_hop_retriever: {multi_hop_retriever}") + return + # eval mode + output = multi_hop_retriever.call(input=question, id="1") + print(output) + + # train mode + multi_hop_retriever.train() + output = multi_hop_retriever.forward(input=question, id="1") + print(output) + output.draw_graph() + + +def test_multi_hop_retriever_cycle(): + + from use_cases.config import ( + gpt_3_model, + ) + + multi_hop_retriever = MultiHopRetrieverCycle( + **gpt_3_model, + passages_per_hop=3, + max_hops=2, + ) + question = "How many storeys are in the castle that David Gregory inherited?" # eval mode - output = multi_hop_retriever.call(question=question, id="1") + output = multi_hop_retriever.call(input=question, id="1") print(output) # train mode multi_hop_retriever.train() - output = multi_hop_retriever.forward(question=question, id="1") + output = multi_hop_retriever.forward(input=question, id="1") print(output) output.draw_graph() + output.draw_output_subgraph() + output.draw_component_subgraph() + + +def test_agent_rag(): + + from use_cases.config import ( + gpt_3_model, + ) + + task = AgenticRAG( + **gpt_3_model, + ) + print(task) + + question = "How many storeys are in the castle that David Gregory inherited?" 
+ + task.train() + output = task(input=question, id="1") + print(output.data) + output.draw_graph() + + # output = + # print(output) + # output.draw_graph() + # output.draw_output_subgraph() + # output.draw_component_subgraph() + + # task.eval() + # output = task(input=question, id="1") def test_multi_hop_retriever2(): @@ -457,7 +907,7 @@ def test_multi_hop_retriever2(): gpt_3_model, ) - multi_hop_retriever = MultiHopRetriever2( + multi_hop_retriever = MultiHopRetriever( **gpt_3_model, passages_per_hop=3, max_hops=2, @@ -531,4 +981,7 @@ def test_multi_hop_rag(): # get_logger(level="DEBUG") # test_multi_hop_retriever() # test_multi_hop_retriever2() - test_multi_hop_rag() + + # test_multi_hop_retriever_cycle() + # test_multi_hop_rag() + test_agent_rag() diff --git a/benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py b/benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py index 3eae0598..9f0668a6 100644 --- a/benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py +++ b/benchmarks/hotpot_qa/adal_exp/build_vanilla_rag.py @@ -6,7 +6,7 @@ import adalflow as adal -from adalflow.datasets.hotpot_qa import HotPotQA +from benchmarks.hotpot_qa.config import load_datasets from adalflow.core.retriever import Retriever from adalflow.core.types import RetrieverOutput @@ -20,18 +20,6 @@ dspy.settings.configure(rm=colbertv2_wiki17_abstracts) -def load_datasets(): - - trainset = HotPotQA(split="train", size=20) - valset = HotPotQA(split="val", size=50) - testset = HotPotQA(split="test", size=50) - print(f"trainset, valset: {len(trainset)}, {len(valset)}, example: {trainset[0]}") - return trainset, valset, testset - - -# task pipeline - - # dspy format # Follow the following format. 
# Context: may contain relevant facts @@ -102,6 +90,8 @@ class AnswerData(adal.DataClass): # Demonstrating how to wrap other retriever to adalflow retriever and be applied in training pipeline # as a subclass of retriever which is a subclass of GradComponent, we dont need to do additional implementation # data processing has already done + + class DspyRetriever(Retriever): def __init__(self, top_k: int = 3): super().__init__() @@ -110,7 +100,7 @@ def __init__(self, top_k: int = 3): def call( self, input: str, top_k: Optional[int] = None, id: str = None - ) -> List[RetrieverOutput]: + ) -> RetrieverOutput: k = top_k or self.top_k @@ -119,27 +109,36 @@ def call( output = self.dspy_retriever(query_or_queries=input, k=k) # print(f"dsy_retriever output: {output}") - final_output: List[RetrieverOutput] = [] documents = output.passages - final_output.append( - RetrieverOutput( - query=input, - documents=documents, - doc_indices=[], - ) + return RetrieverOutput( + query=input, + documents=documents, + doc_indices=[], ) - # print(f"final_output: {final_output}") - return final_output task_desc_str = r"""Answer questions with short factoid answers. -You will receive context(may contain relevant facts) and a question. +You will receive context(contain relevant facts). Think step by step.""" +task_desc_str_system_finetuned = "Generate a concise, factually accurate answer by synthesizing information from the provided context. If multiple sources are available, prioritize resolving ambiguities and cross-referencing data for consistency. Ensure the final answer directly addresses the question while considering specific numerical or descriptive criteria mentioned in the input." + +# task_desc_str = r"""Answer questions with verbatim short factoid responses. + +# You will receive context. Extract only the most relevant fact for a precise answer. 
+# """ + +demo_str = r"""reasoning: \"Dragon Data, the producer of Dragon 32/64, was based in Port Talbot, Wales,\\\n \\ while TK82C was a product of a Brazilian company, Microdigital Eletr\\xF4nica Ltda.\"\nanswer: 'No'\n\nreasoning: The context specifies that the live action sequel '102 Dalmatians' was\n directed by Kevin Lima.\nanswer: Kevin Lima\n\nreasoning: The context specifically mentions that in the 1970 Michigan gubernatorial\n election, Republican William Milliken defeated Democrat Sander Levin.\nanswer: William Milliken\n\nreasoning: The context states that 'Lost Songs from the Lost Years' is a compilation\n by Cloud Cult, which is an experimental indie rock band from Duluth, Minnesota.\nanswer: Minnesota +""" + +# task_desc_str = r"""Answer the question with given context. +# The question requires you to answer one subquestion first, and then find the next potential subquestion and until you find the final answer. +# """ + -class VanillaRAG(adal.GradComponent): +class VanillaRAG(adal.Component): def __init__(self, passages_per_hop=3, model_client=None, model_kwargs=None): super().__init__() @@ -154,31 +153,42 @@ def __init__(self, passages_per_hop=3, model_client=None, model_kwargs=None): model_kwargs=model_kwargs, prompt_kwargs={ "task_desc_str": adal.Parameter( + # data=task_desc_str_system_finetuned, data=task_desc_str, - role_desc="Task description for the language model", + role_desc="""Task description for the language model,\ + used with the following template: \ + {{task_desc_str}} \ + {{output_format_str}}\ + +Context: {{context}} +Question: {{question}} +""", param_type=adal.ParameterType.PROMPT, requires_opt=True, + instruction_to_backward_engine="You need find the best way(where does the right answer come from the context) to extract the RIGHT answer from the context.", + instruction_to_optimizer="ou need find the best way(where does the right answer come from the context) to extract the RIGHT answer from the context.", + # + "Given 
existing context, ensure the task instructions can maximize the performance.", ), - "few_shot_demos": adal.Parameter( - data=None, - requires_opt=True, - role_desc="To provide few shot demos to the language model", - param_type=adal.ParameterType.DEMOS, - ), + # "few_shot_demos": adal.Parameter( + # # data=demo_str, + # data=None, + # requires_opt=True, + # role_desc="To provide few shot demos to the language model", + # param_type=adal.ParameterType.DEMOS, + # ), "output_format_str": self.llm_parser.get_output_format_str(), + # "output_format_str": adal.Parameter( + # data=self.llm_parser.get_output_format_str(), + # requires_opt=True, + # param_type=adal.ParameterType.PROMPT, + # role_desc="The output format string to ensure no failed json parsing", + # ), }, template=answer_template, output_processors=self.llm_parser, use_cache=True, ) - # user should just treat it as a call function - # and we will handle the connection between the components - # they should directly pass the retriever_output along with - # each output's successor_map_fn. - # what if it is passed to two different componnents? 
- # we can create a copy - def call(self, question: str, id: str = None) -> adal.GeneratorOutput: if self.training: raise ValueError( @@ -188,7 +198,7 @@ def call(self, question: str, id: str = None) -> adal.GeneratorOutput: retriever_out = self.retriever.call(input=question, id=id) successor_map_fn = lambda x: ( # noqa E731 - "\n\n".join(x[0].documents) if x and x[0] and x[0].documents else "" + "\n\n".join(x.documents) if x and x.documents else "" ) retrieved_context = successor_map_fn(retriever_out) @@ -201,29 +211,16 @@ def call(self, question: str, id: str = None) -> adal.GeneratorOutput: prompt_kwargs=prompt_kwargs, id=id, ) - # self.llm.print_prompt(**prompt_kwargs) - # print(f"retrieved_context: {retrieved_context}") - # print(f"retriever_out: {retriever_out}") - return output - # def call(self, *, question: str, id: str = None) -> adal.GeneratorOutput: - # self.train() - # out = self.forward(question=question, id=id) - # if not isinstance(out, adal.Parameter): - # raise ValueError( - # "This output should be a Parameter, please check the forward function" - # ) - # self.eval() - # return out.data + return output - # TODO: add id in the retriever output def forward(self, question: str, id: str = None) -> adal.Parameter: if not self.training: raise ValueError("This component is not supposed to be called in eval mode") retriever_out = self.retriever.forward(input=question, id=id) successor_map_fn = lambda x: ( # noqa E731 - "\n\n".join(x.data[0].documents) - if x.data and x.data[0] and x.data[0].documents + "\n\n".join(x.data.documents) + if x.data and x.data and x.data.documents else "" ) retriever_out.add_successor_map_fn(successor=self.llm, map_fn=successor_map_fn) @@ -242,8 +239,8 @@ def bicall( retriever_out = self.retriever(input=question) if isinstance(retriever_out, adal.Parameter): successor_map_fn = lambda x: ( # noqa E731 - "\n\n".join(x.data[0].documents) - if x.data and x.data[0] and x.data[0].documents + "\n\n".join(x.data.documents) + if x.data 
and x.data and x.data.documents else "" ) retriever_out.add_successor_map_fn( @@ -251,7 +248,7 @@ def bicall( ) else: successor_map_fn = lambda x: ( # noqa E731 - "\n\n".join(x[0].documents) if x and x[0] and x[0].documents else "" + "\n\n".join(x.documents) if x and x.documents else "" ) retrieved_context = successor_map_fn(retriever_out) prompt_kwargs = { @@ -262,8 +259,77 @@ def bicall( return output +class Vanilla(adal.Component): + def __init__(self, passages_per_hop=3, model_client=None, model_kwargs=None): + super().__init__() + + self.passages_per_hop = passages_per_hop + + # self.retriever = DspyRetriever(top_k=passages_per_hop) + self.llm_parser = adal.DataClassParser( + data_class=AnswerData, return_data_class=True, format_type="json" + ) + self.llm = Generator( + model_client=model_client, + model_kwargs=model_kwargs, + prompt_kwargs={ + "task_desc_str": adal.Parameter( + data=task_desc_str, + role_desc="Task description for the language model", + param_type=adal.ParameterType.PROMPT, + requires_opt=True, + instruction_to_backward_engine="You need find the best way(where does the right answer come from the context) to extract the RIGHT answer from the context.", + instruction_to_optimizer="You need find the best way(where does the right answer come from the context) to extract the RIGHT answer from the context.", + # + "Given existing context, ensure the task instructions can maximize the performance.", + ), + # "few_shot_demos": adal.Parameter( + # data=None, + # requires_opt=True, + # role_desc="To provide few shot demos to the language model", + # param_type=adal.ParameterType.DEMOS, + # ), + "output_format_str": self.llm_parser.get_output_format_str(), + }, + template=answer_template, + output_processors=self.llm_parser, + use_cache=True, + ) + + def call( + self, question: str, context: List[str], id: str = None + ) -> adal.GeneratorOutput: + if self.training: + raise ValueError( + "This component is not supposed to be called in training mode" + ) 
+ + prompt_kwargs = { + "context": context, + "question": question, + } + + output = self.llm.call( + prompt_kwargs=prompt_kwargs, + id=id, + ) + + return output + + # TODO: add id in the retriever output + def forward( + self, question: str, context: List[str], id: str = None + ) -> adal.Parameter: + if not self.training: + raise ValueError("This component is not supposed to be called in eval mode") + + generator_out = self.llm.forward( + prompt_kwargs={"question": question, "context": context}, id=id + ) + return generator_out + + def test_retriever(): - question = "How many storeys are in the castle that David Gregory inherited?" + question = "Were Scott Derrickson and Ed Wood of the same nationality?" retriever = DspyRetriever(top_k=3) retriever_out = retriever(input=question) print(f"retriever_out: {retriever_out}") @@ -271,10 +337,6 @@ def test_retriever(): def test_vailla_rag(): - from use_cases.config import ( - gpt_3_model, - ) - task = VanillaRAG( **gpt_3_model, passages_per_hop=3, @@ -301,6 +363,29 @@ def test_vailla_rag(): # print(f"generator_out: {generator_out}") +from use_cases.config import ( + gpt_3_model, +) + + +def test_vanilla(): + task = Vanilla( + **gpt_3_model, + passages_per_hop=3, + ) + task.eval() + data_train, data_val, data_test = load_datasets() + data = data_train[0] + + output = task.call(question=data.question, context=data.context, id="1") + print(f"output: {output}, answer: {data.answer}") + + task.train() + output = task.forward(question=data.question, context=data.context, id="1") + print(f"output: {output.data}, answer: {data.answer}") + + if __name__ == "__main__": # test_retriever() - test_vailla_rag() + test_vanilla() + # test_vailla_rag() diff --git a/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py b/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py new file mode 100644 index 00000000..1c498f14 --- /dev/null +++ b/benchmarks/hotpot_qa/adal_exp/train_agent_rag.py @@ -0,0 +1,264 @@ +from typing import Any, Callable, Dict, 
Tuple + +import adalflow as adal +from adalflow.eval.answer_match_acc import AnswerMatchAcc +from adalflow.datasets.types import HotPotQAData + +from benchmarks.hotpot_qa.config import load_datasets +from benchmarks.hotpot_qa.adal_exp.build_multi_hop_rag import AgenticRAG +from use_cases.config import gpt_3_model, gpt_4o_model +from adalflow.utils import printc + + +# TODO: look more into the loss function +# TODO: test LLM judge too. + +from adalflow.components.agent.react import ReActOutput + + +class AgenticRAGAdal(adal.AdalComponent): + def __init__( + self, + model_client: adal.ModelClient, + model_kwargs: Dict, + backward_engine_model_config: Dict | None = None, + teacher_model_config: Dict | None = None, + text_optimizer_model_config: Dict | None = None, + ): + task = AgenticRAG( + model_client=model_client, + model_kwargs=model_kwargs, + ) + eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item + loss_eval_fn = AnswerMatchAcc(type="f1_score").compute_single_item + # eval_fn = AnswerMatchAcc(type="f1_score").compute_single_item + loss_fn = adal.EvalFnToTextLoss( + eval_fn=loss_eval_fn, + eval_fn_desc="exact_match: 1 if str(y_gt) == str(y) else 0", + ) + # eval_fn = f1_score # 0.38 (hand crafted the finish, exat match 0.25) + + # loss_fn = adal.EvalFnToTextLoss( + # eval_fn=eval_fn, eval_fn_desc="Computes the overlaps between y and y_gt" + # ) + super().__init__( + task=task, + eval_fn=eval_fn, + loss_eval_fn=loss_eval_fn, + loss_fn=loss_fn, + backward_engine_model_config=backward_engine_model_config, + teacher_model_config=teacher_model_config, + text_optimizer_model_config=text_optimizer_model_config, + ) + + # tell the trainer how to call the task + def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: + if self.task.training: + return self.task.forward, {"input": sample.question, "id": sample.id} + else: + # print("eval mode") + return self.task.call, {"input": sample.question, "id": sample.id} + + # TODO: use two map 
fn to make the cde even simpler + + # eval mode: get the generator output, directly engage with the eval_fn + def prepare_eval(self, sample: HotPotQAData, y_pred: ReActOutput) -> float: + + y_label = y_pred.answer if isinstance(y_pred, ReActOutput) else y_pred + + printc( + f"eval y_label: {y_label}, y_gt: {sample.answer}, self.eval_fn: {self.eval_fn(y_label, sample.answer)}" + ) + + return self.eval_fn, {"y": y_label, "y_gt": sample.answer} + + def prepare_loss_eval(self, sample: HotPotQAData, y_pred: ReActOutput) -> float: + y_label = y_pred.answer if isinstance(y_pred, ReActOutput) else y_pred + printc( + f"loss eval y_label: {y_label}, y_gt: {sample.answer}, self.eval_fn: {self.loss_eval_fn(y_label, sample.answer)}" + ) + return self.loss_eval_fn, {"y": y_label, "y_gt": sample.answer} + + # train mode: get the loss and get the data from the full_response + def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): + # prepare gt parameter + y_gt = adal.Parameter( + name="y_gt", + data=sample.answer, + eval_input=sample.answer, + requires_opt=False, + ) + + # pred's full_response is the output of the task pipeline which is GeneratorOutput + # pred.eval_input = ( + # pred.data[-1].observation if pred.data and pred.data[-1] else "" + # ) + printc(f"pred data: {pred.data}, gt: {sample.answer}") + pred.eval_input = pred.data if pred.data else "" + # pred.eval_input = ( + # pred.data[-1].observation if pred.data and pred.data[-1] else "" + # ) + # printc(f"loss eval_input: {pred.eval_input}") + return self.loss_fn, { + "kwargs": {"y": pred, "y_gt": y_gt}, + "id": sample.id, + "gt": y_gt.eval_input, + "input": {"question": sample.question}, + } + + +# Note: diagnose is quite helpful, it helps you to quickly check if the evalfunction is the right metrics +# i checked the eval which does fuzzy match, and found some yes and Yes are not matched, then converted both strings to lower and +# the performances have gone up from 0.15 to 0.4 +def train_diagnose( + 
model_client: adal.ModelClient, + model_kwargs: Dict, +) -> Dict: + + trainset, valset, testset = load_datasets() + + adal_component = AgenticRAGAdal( + model_client, + model_kwargs, + backward_engine_model_config=gpt_4o_model, + teacher_model_config=gpt_3_model, + text_optimizer_model_config=gpt_3_model, + ) + trainset = trainset[:5] + trainer = adal.Trainer(adaltask=adal_component) + trainer.diagnose(dataset=trainset, split="train") + # trainer.diagnose(dataset=valset, split="val") + # trainer.diagnose(dataset=testset, split="test") + + +from adalflow.core.generator import BackwardPassSetup + + +def train( + train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle + raw_shots: int = 0, + bootstrap_shots: int = 4, + max_steps=1, + num_workers=4, + strategy="constrained", + optimization_order="sequential", + debug=False, + resume_from_ckpt=None, + exclude_input_fields_from_bootstrap_demos=True, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, +): + adal_component = AgenticRAGAdal( + **gpt_3_model, + teacher_model_config=gpt_4o_model, + text_optimizer_model_config=gpt_4o_model, # gpt3.5 is not enough to be used as a good optimizer, it struggles for long contenxt + backward_engine_model_config=gpt_4o_model, + ) + print(adal_component) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + trainer = adal.Trainer( + train_batch_size=train_batch_size, + adaltask=adal_component, + strategy=strategy, + max_steps=max_steps, + num_workers=num_workers, + raw_shots=raw_shots, + bootstrap_shots=bootstrap_shots, + debug=debug, + weighted_sampling=True, + optimization_order=optimization_order, + exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, + sequential_order=["text", "demo"], + max_proposals_per_step=max_proposals_per_step, + backward_pass_setup=backward_pass_setup, + ) + 
trainer.set_random_seed(seed) + print(trainer) + + train_dataset, val_dataset, test_dataset = load_datasets() + # train_dataset = train_dataset[:40] + # val_dataset = val_dataset[:40] + # test_dataset = test_dataset[:40] + + ckpt, _ = trainer.fit( + train_dataset=train_dataset, + val_dataset=val_dataset, + test_dataset=test_dataset, + resume_from_ckpt=resume_from_ckpt, + ) + return ckpt + + +if __name__ == "__main__": + from use_cases.config import gpt_3_model + + log = adal.get_logger(level="DEBUG", enable_console=False) + + adal.setup_env() + import json + + import random + + random.seed(2025) + + adal.setup_env() + + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_false") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + + # task = MultiHopRAGAdal(**gpt_3_model) + # print(task) + + # train_diagnose(**gpt_3_model) + # exit() + + ckpt = train( + debug=False, + max_steps=12, + seed=2025, + tg=use_tg, + strategy=set_strategy, + max_proposals_per_step=max_proposals_per_step, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_387b2_run_1.json", + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_4_dca7e_run_1.json", + ) + print(f"ckpt: {ckpt}") + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") + + # 0.68 on val without training, 0.74on the second step. 
0.84 test + # /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_2_029cb_run_1.json + # 0.7, 0.72 /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_2_b7523_run_1.json + # 208.085706949234s, 2 steps, maximum 4 steps allow for an agent. + # 0.72->0.74, 4 steps, 366s, /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_4_dca7e_run_1.json [Already faster, still lots to optimize] + + # 1246s, 12 steps, 0.8 val, /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_defe7_run_1.json + # 2149s, both gradients, 0.68 -> 0.78 /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_8a24a_run_1.json + # /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_cdcb5_run_1.json 1728 s, 0.8 + # /Users/liyin/.adalflow/ckpt/AgenticRAGAdal/constrained_max_steps_12_735a7_run_1.json 0.58 -> 0.68 (separate gradients) "pass": 17, + # "fail": 35 diff --git a/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py index d80e6336..0f890263 100644 --- a/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py +++ b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py @@ -3,8 +3,9 @@ import adalflow as adal from adalflow.eval.answer_match_acc import AnswerMatchAcc from adalflow.datasets.types import HotPotQAData +from benchmarks.hotpot_qa.config import load_datasets -from benchmarks.hotpot_qa._adal_train import load_datasets +# from benchmarks.hotpot_qa._adal_train import load_datasets from benchmarks.hotpot_qa.adal_exp.build_multi_hop_rag import MultiHopRAG from use_cases.config import gpt_3_model, gpt_4o_model @@ -23,41 +24,45 @@ def __init__( task = MultiHopRAG( model_client=model_client, model_kwargs=model_kwargs, - passages_per_hop=3, + passages_per_hop=2, # better with only two passages, ablation study 0.49 vs 0.52 max_hops=2, ) - eval_fn = AnswerMatchAcc(type="fuzzy_match").compute_single_item + eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item + 
+ loss_eval_fn = AnswerMatchAcc(type="f1_score").compute_single_item loss_fn = adal.EvalFnToTextLoss( - eval_fn=eval_fn, eval_fn_desc="fuzzy_match: 1 if str(y) in str(y_gt) else 0" + eval_fn=loss_eval_fn, + eval_fn_desc="exact_match: 1 if str(y_gt) == str(y) else 0", ) super().__init__( task=task, eval_fn=eval_fn, + loss_eval_fn=loss_eval_fn, loss_fn=loss_fn, backward_engine_model_config=backward_engine_model_config, teacher_model_config=teacher_model_config, text_optimizer_model_config=text_optimizer_model_config, ) - # tell the trainer how to call the task def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: if self.task.training: return self.task.forward, {"question": sample.question, "id": sample.id} else: return self.task.call, {"question": sample.question, "id": sample.id} - # TODO: use two map fn to make the cde even simpler - - # eval mode: get the generator output, directly engage with the eval_fn def prepare_eval(self, sample: HotPotQAData, y_pred: adal.GeneratorOutput) -> float: y_label = "" if y_pred and y_pred.data and y_pred.data.answer: y_label = y_pred.data.answer return self.eval_fn, {"y": y_label, "y_gt": sample.answer} - # train mode: get the loss and get the data from the full_response + def prepare_loss_eval(self, sample: Any, y_pred: Any, *args, **kwargs) -> float: + y_label = "" + if y_pred and y_pred.data and y_pred.data.answer: + y_label = y_pred.data.answer + return self.loss_eval_fn, {"y": y_label, "y_gt": sample.answer} + def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): - # prepare gt parameter y_gt = adal.Parameter( name="y_gt", data=sample.answer, @@ -67,13 +72,19 @@ def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): # pred's full_response is the output of the task pipeline which is GeneratorOutput pred.eval_input = ( - pred.full_response.data.answer - if pred.full_response - and pred.full_response.data - and pred.full_response.data.answer + pred.data.data.answer + if 
pred.data and pred.data.data and pred.data.data.answer else "" ) - return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}} + return self.loss_fn, { + "kwargs": {"y": pred, "y_gt": y_gt}, + "input": {"question": sample.question}, + "id": sample.id, + "gt": sample.answer, + } + + +from adalflow.core.generator import BackwardPassSetup # Note: diagnose is quite helpful, it helps you to quickly check if the evalfunction is the right metrics @@ -101,23 +112,32 @@ def train_diagnose( def train( train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle - raw_shots: int = 0, - bootstrap_shots: int = 4, + raw_shots: int = 2, + bootstrap_shots: int = 2, max_steps=1, - num_workers=4, + num_workers=10, strategy="constrained", optimization_order="sequential", debug=False, resume_from_ckpt=None, exclude_input_fields_from_bootstrap_demos=True, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, ): adal_component = MultiHopRAGAdal( **gpt_3_model, - teacher_model_config=gpt_3_model, + teacher_model_config=gpt_4o_model, text_optimizer_model_config=gpt_4o_model, # gpt3.5 is not enough to be used as a good optimizer, it struggles for long contenxt backward_engine_model_config=gpt_4o_model, ) - print(adal_component) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + # print(adal_component) trainer = adal.Trainer( train_batch_size=train_batch_size, adaltask=adal_component, @@ -127,57 +147,78 @@ def train( raw_shots=raw_shots, bootstrap_shots=bootstrap_shots, debug=debug, - weighted_sampling=True, + weighted_sampling=False, optimization_order=optimization_order, exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, sequential_order=["text", "demo"], + max_proposals_per_step=max_proposals_per_step, + backward_pass_setup=backward_pass_setup, ) + trainer.set_random_seed(seed) print(trainer) 
train_dataset, val_dataset, test_dataset = load_datasets() - trainer.fit( + ckpt, _ = trainer.fit( train_dataset=train_dataset, val_dataset=val_dataset, test_dataset=test_dataset, resume_from_ckpt=resume_from_ckpt, ) + return ckpt if __name__ == "__main__": from use_cases.config import gpt_3_model - log = adal.get_logger(level="DEBUG", enable_console=False) + # log = adal.get_logger(level="DEBUG", enable_console=False) + + adal.setup_env() + + import json + + import random + + random.seed(2025) adal.setup_env() + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_false") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + # task = MultiHopRAGAdal(**gpt_3_model) # print(task) # train_diagnose(**gpt_3_model) - # train: 0.15 before the evaluator converted to lower and 0.4 after the conversion - train( - debug=False, - max_steps=12, + ckpt = train( + debug=True, + max_steps=24, + seed=2025, # pass the numpy seed + tg=use_tg, + strategy=set_strategy, + max_proposals_per_step=max_proposals_per_step, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/MultiHopRAGAdal/constrained_max_steps_12_fde51_run_1.json", # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/ValinaRAGAdal/random_max_steps_12_7c091_run_1.json", ) - - # notes for debug: if have nontype, delete all model cache and try again - # raise ValueError(ValueError: score must be provided for each demo, - - # 12/11/2024 - # demo only: /Users/liyin/Documents/test/LightRAG/.adalflow/ckpt/MultiHopRAGAdal/constrained_max_steps_12_8cdfc_run_9.json - - # why text grad did not improve in the rag case? 
Do we need to improve the meta prompt? - # /Users/liyin/.adalflow/ckpt/MultiHopRAGAdal/constrained_max_steps_12_2686e_run_1.json - # 0.58 -> 0.68 on the test split - # 0.72 text grad /Users/liyin/.adalflow/ckpt/MultiHopRAGAdal/constrained_max_steps_12_c1660_run_1.json - # try cycle next - # 0.66 /Users/liyin/.adalflow/ckpt/MultiHopRAGAdal/constrained_max_steps_12_1d189_run_1.json - # no gradients 1021s (/Users/liyin/.adalflow/ckpt/MultiHopRAGAdal/constrained_max_steps_12_68e7e_run_1.json) -> 0.64 -> 0.68, pass 10/10+28 - # no gradient but scores (positive & negative) /Users/liyin/.adalflow/ckpt/MultiHopRAGAdal/constrained_max_steps_12_83871_run_1.json 0.64->0.66, test 0.64 -> 0.66 - # no gradient but only negative score - # no gradient but score + teacher demonstration. - # feedback while seeing the gt + y - # only negative feedback /Users/liyin/.adalflow/ckpt/MultiHopRAGAdal/constrained_max_steps_12_f5506_run_1.json 0.62 -> 0.7 - # /Users/liyin/.adalflow/ckpt/MultiHopRAGAdal/constrained_max_steps_12_b4aa5_run_1.json 0.74 pass rate 8 32 + print(f"ckpt: {ckpt}") + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") diff --git a/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag_cycle.py b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag_cycle.py new file mode 100644 index 00000000..8fae1c3b --- /dev/null +++ b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag_cycle.py @@ -0,0 +1,225 @@ +from typing import Any, Callable, Dict, Tuple + +import adalflow as adal +from adalflow.eval.answer_match_acc import AnswerMatchAcc +from adalflow.datasets.types import HotPotQAData + +from benchmarks.hotpot_qa.config import load_datasets +from benchmarks.hotpot_qa.adal_exp.build_multi_hop_rag import MultiHopRAGCycle +from use_cases.config import gpt_3_model, gpt_4o_model + + +class MultiHopRAGCycleAdal(adal.AdalComponent): + def 
__init__( + self, + model_client: adal.ModelClient, + model_kwargs: Dict, + backward_engine_model_config: Dict | None = None, + teacher_model_config: Dict | None = None, + text_optimizer_model_config: Dict | None = None, + ): + task = MultiHopRAGCycle( + model_client=model_client, + model_kwargs=model_kwargs, + passages_per_hop=2, + max_hops=2, + ) + eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item + loss_eval_fn = AnswerMatchAcc(type="f1_score").compute_single_item + loss_fn = adal.EvalFnToTextLoss( + eval_fn=loss_eval_fn, + eval_fn_desc="exact_match: 1 if str(y_gt) == str(y) else 0", + ) + + super().__init__( + task=task, + eval_fn=eval_fn, + loss_eval_fn=loss_eval_fn, + loss_fn=loss_fn, + backward_engine_model_config=backward_engine_model_config, + teacher_model_config=teacher_model_config, + text_optimizer_model_config=text_optimizer_model_config, + ) + + def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: + if self.task.training: + return self.task.forward, {"question": sample.question, "id": sample.id} + else: + return self.task.call, {"question": sample.question, "id": sample.id} + + def prepare_eval(self, sample: HotPotQAData, y_pred: adal.GeneratorOutput) -> float: + y_label = "" + if y_pred and y_pred.data and y_pred.data.answer: + y_label = y_pred.data.answer + return self.eval_fn, {"y": y_label, "y_gt": sample.answer} + + def prepare_loss_eval(self, sample: Any, y_pred: Any, *args, **kwargs) -> float: + y_label = "" + if y_pred and y_pred.data and y_pred.data.answer: + y_label = y_pred.data.answer + return self.loss_eval_fn, {"y": y_label, "y_gt": sample.answer} + + def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): + # prepare gt parameter + y_gt = adal.Parameter( + name="y_gt", + data=sample.answer, + eval_input=sample.answer, + requires_opt=False, + ) + + # pred's full_response is the output of the task pipeline which is GeneratorOutput + pred.eval_input = ( + pred.data.data.answer + if 
pred.data and pred.data.data and pred.data.data.answer + else "" + ) + return self.loss_fn, { + "kwargs": {"y": pred, "y_gt": y_gt}, + "id": sample.id, + "input": {"question": sample.question}, + } + + +def train_diagnose( + model_client: adal.ModelClient, + model_kwargs: Dict, +) -> Dict: + + trainset, valset, testset = load_datasets() + + adal_component = MultiHopRAGCycleAdal( + model_client, + model_kwargs, + backward_engine_model_config=gpt_4o_model, + teacher_model_config=gpt_3_model, + text_optimizer_model_config=gpt_3_model, + ) + trainer = adal.Trainer(adaltask=adal_component) + trainer.diagnose(dataset=trainset, split="train") + trainer.diagnose(dataset=valset, split="val") + trainer.diagnose(dataset=testset, split="test") + + +from adalflow.core.generator import BackwardPassSetup + + +def train( + train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle + raw_shots: int = 0, + bootstrap_shots: int = 4, + max_steps=1, + num_workers=4, + strategy="random", + optimization_order="sequential", + debug=False, + resume_from_ckpt=None, + exclude_input_fields_from_bootstrap_demos=True, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, +): + adal_component = MultiHopRAGCycleAdal( + **gpt_3_model, + teacher_model_config=gpt_3_model, + text_optimizer_model_config=gpt_4o_model, # gpt3.5 is not enough to be used as a good optimizer, it struggles for long contenxt + backward_engine_model_config=gpt_4o_model, + ) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + print(adal_component) + trainer = adal.Trainer( + train_batch_size=train_batch_size, + adaltask=adal_component, + strategy=strategy, + max_steps=max_steps, + num_workers=num_workers, + raw_shots=raw_shots, + bootstrap_shots=bootstrap_shots, + debug=debug, + weighted_sampling=True, + optimization_order=optimization_order, + 
exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, + sequential_order=["text", "demo"], + backward_pass_setup=backward_pass_setup, + ) + print(trainer) + trainer.set_random_seed(seed) + + train_dataset, val_dataset, test_dataset = load_datasets() + + # replace the train dataset for debug + # if debug: + # train_dataset = train_dataset[:2] + # data: HotPotQAData = train_dataset[0] + # data.question = "Brown State Fishing Lake is in a country that has a population of how many inhabitants?" + # data.answer = "9,984" + # print(f"train_dataset: {train_dataset}") + + ckpt, _ = trainer.fit( + train_dataset=train_dataset, + val_dataset=val_dataset, + test_dataset=test_dataset, + resume_from_ckpt=resume_from_ckpt, + ) + return ckpt + + +if __name__ == "__main__": + from use_cases.config import gpt_3_model + import json + + import random + + random.seed(2025) + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_false") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + + # log = adal.get_logger( + # level="DEBUG", enable_console=False, filename="multi_hop_rag_cycle.log" + # ) + + adal.setup_env() + + # task = MultiHopRAGAdal(**gpt_3_model) + # print(task) + + # train_diagnose(**gpt_3_model) + # exit() + + # train: 0.15 before the evaluator converted to lower and 0.4 after the conversion + ckpt = train( + debug=False, + max_steps=24, + seed=2025, # pass the numpy seed + tg=use_tg, + strategy=set_strategy, + max_proposals_per_step=max_proposals_per_step, + # 
resume_from_ckpt="/Users/liyin/Documents/test/LightRAG/.adalflow/ckpt/MultiHopRAGCycleAdal/constrained_max_steps_12_69e07_run_1.json", + ) + print(f"ckpt: {ckpt}") + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") diff --git a/benchmarks/hotpot_qa/adal_exp/train_multi_hop_retriever.py b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_retriever.py new file mode 100644 index 00000000..c0edded3 --- /dev/null +++ b/benchmarks/hotpot_qa/adal_exp/train_multi_hop_retriever.py @@ -0,0 +1,314 @@ +from typing import Any, Callable, Dict, Tuple, List + +import adalflow as adal +from adalflow.eval.retriever_recall import RetrieverEvaluator +from adalflow.eval.answer_match_acc import AnswerMatchAcc +from adalflow.datasets.types import HotPotQAData +from benchmarks.hotpot_qa.config import load_datasets + +from benchmarks.hotpot_qa.adal_exp.build_multi_hop_rag import ( + MultiHopRetriever, +) +from use_cases.config import gpt_3_model, gpt_4o_model +from adalflow.utils import printc + + +def retriever_recall(y: List[str], y_gt: List[str]) -> float: + return RetrieverEvaluator().compute_single_item(y, y_gt)["recall"] + + +def retriever_precision(y: List[str], y_gt: List[str]) -> float: + return RetrieverEvaluator().compute_single_item(y, y_gt)["precision"] + + +def retriever_query_f1(y: str, y_gt: str) -> float: + evaluator = AnswerMatchAcc(type="f1_score") + score = evaluator.compute_single_item(y, y_gt) + + return score + + +class MultiHopRetrieverAdal(adal.AdalComponent): + def __init__( + self, + model_client: adal.ModelClient, + model_kwargs: Dict, + backward_engine_model_config: Dict | None = None, + teacher_model_config: Dict | None = None, + text_optimizer_model_config: Dict | None = None, + ): + task = MultiHopRetriever( + model_client=model_client, + model_kwargs=model_kwargs, + passages_per_hop=2, + max_hops=2, + ) 
+ eval_fn = retriever_recall + loss_fn = adal.EvalFnToTextLoss( + eval_fn=eval_fn, + eval_fn_desc="recall: len(y_gt.intersection(y)) / len(y_gt)", + ) + super().__init__( + task=task, + eval_fn=eval_fn, + loss_fn=loss_fn, + backward_engine_model_config=backward_engine_model_config, + teacher_model_config=teacher_model_config, + text_optimizer_model_config=text_optimizer_model_config, + ) + + def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: + if self.task.training: + return self.task.forward, {"input": sample.question, "id": sample.id} + else: + return self.task.call, {"input": sample.question, "id": sample.id} + + def prepare_eval(self, sample: HotPotQAData, y_pred: adal.RetrieverOutput) -> float: + if isinstance(y_pred, adal.Parameter): + raise ValueError("y_pred is not a RetrieverOutput") + documents = y_pred.documents + y_pred_titles = [] + for doc in documents: + title, content = doc.split("|") + y_pred_titles.append(title) + + return self.eval_fn, { + "y": y_pred_titles, + "y_gt": list(sample.gold_titles), + } + + def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): + y_gt = adal.Parameter( + name="y_gt", + data=sample.gold_titles, + eval_input=list(sample.gold_titles), + requires_opt=False, + ) + + pred_titles = [] + for doc in pred.data.documents: + title, content = doc.split("|") + pred_titles.append(title) + + pred.eval_input = pred_titles + return self.loss_fn, { + "kwargs": {"y": pred, "y_gt": y_gt}, + "id": sample.id, + "gt": y_gt.data, + } + + +# 1. 
test the eval and the loss use different metrics +class MultiHopRetriever2Adal(adal.AdalComponent): + def __init__( + self, + model_client: adal.ModelClient, + model_kwargs: Dict, + backward_engine_model_config: Dict | None = None, + teacher_model_config: Dict | None = None, + text_optimizer_model_config: Dict | None = None, + ): + task = MultiHopRetriever( + model_client=model_client, + model_kwargs=model_kwargs, + passages_per_hop=2, + max_hops=2, + ) + eval_fn = retriever_query_f1 + loss_fn = adal.EvalFnToTextLoss( + eval_fn=eval_fn, + eval_fn_desc="precision: overlap of words between gt and prediction (queries). Only evaluate the generated queries from the generator. The multiple queries are joiend together by ',' to evaluate over the overlap on words.", + ) + super().__init__( + task=task, + eval_fn=eval_fn, + loss_fn=loss_fn, + backward_engine_model_config=backward_engine_model_config, + teacher_model_config=teacher_model_config, + text_optimizer_model_config=text_optimizer_model_config, + ) + self.eval_retriever_recall = retriever_recall + + def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: + if self.task.training: + return self.task.forward2, {"input": sample.question, "id": sample.id} + else: + return self.task.call2, {"input": sample.question, "id": sample.id} + + def prepare_eval(self, sample: HotPotQAData, y_pred: any) -> float: + if isinstance(y_pred, adal.Parameter): + raise ValueError("y_pred is not a RetrieverOutput") + + y_gt = ", ".join(sample.gold_titles) + # for doc in documents: + # title, content = doc.split("|") + # y_pred_titles.append(title) + + printc(f"y_gt: {y_gt}, pred: {y_pred}") + + return self.eval_fn, { + "y": y_pred.data, + "y_gt": y_gt, + } + + def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): + + y_gt = adal.Parameter( + name="y_gt", + data=sample.gold_titles, + eval_input=", ".join(sample.gold_titles), + requires_opt=False, + ) + + pred.eval_input = pred.data.data + + 
printc(f"y_gt 1: {sample.gold_titles}, pred 1: {pred.eval_input}") + + return self.loss_fn, { + "kwargs": {"y": pred, "y_gt": y_gt}, + "id": sample.id, + "gt": y_gt.data, + } + + +from adalflow.core.generator import BackwardPassSetup + + +def train_diagnose( + model_client: adal.ModelClient, + model_kwargs: Dict, +) -> Dict: + + trainset, valset, testset = load_datasets() + + adal_component = MultiHopRetrieverAdal( + model_client, + model_kwargs, + backward_engine_model_config=gpt_4o_model, + teacher_model_config=gpt_3_model, + text_optimizer_model_config=gpt_3_model, + ) + trainer = adal.Trainer(adaltask=adal_component) + # trainer.diagnose(dataset=trainset, split="train") # 0.69 recall + # trainer.diagnose(dataset=valset, split="val") # 0.675 recall + trainer.diagnose(dataset=testset, split="test") # 0.71 (0.665) + + +def train( + train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle + raw_shots: int = 1, + bootstrap_shots: int = 1, + max_steps=1, + num_workers=10, + strategy="constrained", + optimization_order="sequential", + debug=False, + resume_from_ckpt=None, + exclude_input_fields_from_bootstrap_demos=True, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, +): + adal_component = MultiHopRetrieverAdal( + **gpt_3_model, + teacher_model_config=gpt_4o_model, + text_optimizer_model_config=gpt_4o_model, # gpt3.5 is not enough to be used as a good optimizer, it struggles for long contenxt + backward_engine_model_config=gpt_4o_model, + ) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + # print(adal_component) + trainer = adal.Trainer( + train_batch_size=train_batch_size, + adaltask=adal_component, + strategy=strategy, + max_steps=max_steps, + num_workers=num_workers, + raw_shots=raw_shots, + bootstrap_shots=bootstrap_shots, + debug=debug, + weighted_sampling=False, + 
optimization_order=optimization_order, + exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, + sequential_order=["text", "demo"], + max_proposals_per_step=max_proposals_per_step, + backward_pass_setup=backward_pass_setup, + ) + trainer.set_random_seed(seed) + print(trainer) + + train_dataset, val_dataset, test_dataset = load_datasets() + # val_dataset = val_dataset[:20] + ckpt, _ = trainer.fit( + train_dataset=train_dataset, + val_dataset=val_dataset, + test_dataset=test_dataset, + resume_from_ckpt=resume_from_ckpt, + ) + return ckpt + + +if __name__ == "__main__": + from use_cases.config import gpt_3_model + + # log = adal.get_logger(level="DEBUG", enable_console=False) + + adal.setup_env() + + import json + + import random + + random.seed(2025) + + adal.setup_env() + + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_false") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + + # task = MultiHopRAGAdal(**gpt_3_model) + # print(task) + + # train_diagnose(**gpt_3_model) + # exit() + + # train: 0.15 before the evaluator converted to lower and 0.4 after the conversion + ckpt = train( + debug=True, + max_steps=12, + seed=2025, # pass the numpy seed + tg=use_tg, + strategy=set_strategy, + max_proposals_per_step=max_proposals_per_step, + exclude_input_fields_from_bootstrap_demos=True, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/MultiHopRetrieverAdal/constrained_max_steps_12_945bd_run_1.json", + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/MultiHopRetrieverAdal/constrained_max_steps_12_d7043_run_1.json", + ) + 
print(f"ckpt: {ckpt}") + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") + + # diff --git a/benchmarks/hotpot_qa/adal_exp/train_vanilla.py b/benchmarks/hotpot_qa/adal_exp/train_vanilla.py index fc14e161..a264afac 100644 --- a/benchmarks/hotpot_qa/adal_exp/train_vanilla.py +++ b/benchmarks/hotpot_qa/adal_exp/train_vanilla.py @@ -4,14 +4,16 @@ from adalflow.eval.answer_match_acc import AnswerMatchAcc from adalflow.datasets.types import HotPotQAData -from benchmarks.hotpot_qa._adal_train import load_datasets -from benchmarks.hotpot_qa.adal_exp.build_vanilla_rag import VanillaRAG -from use_cases.config import gpt_3_model, gpt_4o_model +from benchmarks.hotpot_qa.config import load_datasets +from benchmarks.hotpot_qa.adal_exp.build_vanilla_rag import Vanilla +from use_cases.config import gpt_3_model, gpt_4o_model, gpt_3_1106_model + +from adalflow.utils import printc # TODO: look more into the loss function # TODO: test LLM judge too. 
-class VallinaRAGAdal(adal.AdalComponent): +class VallinaAdal(adal.AdalComponent): def __init__( self, model_client: adal.ModelClient, @@ -20,18 +22,22 @@ def __init__( teacher_model_config: Dict | None = None, text_optimizer_model_config: Dict | None = None, ): - task = VanillaRAG( + task = Vanilla( model_client=model_client, model_kwargs=model_kwargs, - passages_per_hop=3, + passages_per_hop=2, ) - eval_fn = AnswerMatchAcc(type="fuzzy_match").compute_single_item + eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item + loss_eval_fn = AnswerMatchAcc(type="f1_score").compute_single_item + loss_fn = adal.EvalFnToTextLoss( - eval_fn=eval_fn, eval_fn_desc="fuzzy_match: 1 if str(y) in str(y_gt) else 0" + eval_fn=loss_eval_fn, + eval_fn_desc="exact_match: 1 if str(y_gt) == str(y) else 0", ) super().__init__( task=task, eval_fn=eval_fn, + loss_eval_fn=loss_eval_fn, loss_fn=loss_fn, backward_engine_model_config=backward_engine_model_config, teacher_model_config=teacher_model_config, @@ -41,22 +47,32 @@ def __init__( # tell the trainer how to call the task def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: if self.task.training: - return self.task.forward, {"question": sample.question, "id": sample.id} + return self.task.forward, { + "question": sample.question, + "context": sample.context, + "id": sample.id, + } else: - return self.task.call, {"question": sample.question, "id": sample.id} - - # TODO: use two map fn to make the cde even simpler + return self.task.call, { + "question": sample.question, + "context": sample.context, + "id": sample.id, + } - # eval mode: get the generator output, directly engage with the eval_fn def prepare_eval(self, sample: HotPotQAData, y_pred: adal.GeneratorOutput) -> float: y_label = "" if y_pred and y_pred.data and y_pred.data.answer: - y_label = y_pred.data.answer + y_label = y_pred.data.answer # .lower() + printc(f"y_label: {y_label}, y_gt: {sample.answer}") return self.eval_fn, {"y": y_label, 
"y_gt": sample.answer} - # train mode: get the loss and get the data from the full_response + def prepare_loss_eval(self, sample: Any, y_pred: Any, *args, **kwargs) -> float: + y_label = "" + if y_pred and y_pred.data and y_pred.data.answer: + y_label = y_pred.data.answer + return self.loss_eval_fn, {"y": y_label, "y_gt": sample.answer} + def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): - # prepare gt parameter y_gt = adal.Parameter( name="y_gt", data=sample.answer, @@ -64,15 +80,12 @@ def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): requires_opt=False, ) - # pred's full_response is the output of the task pipeline which is GeneratorOutput pred.eval_input = ( - pred.full_response.data.answer - if pred.full_response - and pred.full_response.data - and pred.full_response.data.answer + pred.data.data.answer + if pred.data and pred.data.data and pred.data.data.answer else "" ) - return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}} + return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}, "id": sample.id} # Note: diagnose is quite helpful, it helps you to quickly check if the evalfunction is the right metrics @@ -85,7 +98,7 @@ def train_diagnose( trainset, valset, testset = load_datasets() - adal_component = VallinaRAGAdal( + adal_component = VallinaAdal( model_client, model_kwargs, backward_engine_model_config=gpt_4o_model, @@ -98,6 +111,9 @@ def train_diagnose( # trainer.diagnose(dataset=testset, split="test") +from adalflow.core.generator import BackwardPassSetup + + def train( train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle raw_shots: int = 0, @@ -109,14 +125,23 @@ def train( debug=False, resume_from_ckpt=None, exclude_input_fields_from_bootstrap_demos=True, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, ): - adal_component = VallinaRAGAdal( - **gpt_3_model, + adal_component = VallinaAdal( + **gpt_3_1106_model, 
teacher_model_config=gpt_4o_model, text_optimizer_model_config=gpt_4o_model, backward_engine_model_config=gpt_4o_model, ) print(adal_component) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) trainer = adal.Trainer( train_batch_size=train_batch_size, adaltask=adal_component, @@ -129,23 +154,54 @@ def train( weighted_sampling=True, optimization_order=optimization_order, exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, + max_proposals_per_step=max_proposals_per_step, + backward_pass_setup=backward_pass_setup, ) + trainer.set_random_seed(seed) print(trainer) train_dataset, val_dataset, test_dataset = load_datasets() - trainer.fit( + ckpt, _ = trainer.fit( train_dataset=train_dataset, val_dataset=val_dataset, + # test_dataset=val_dataset[0:4], test_dataset=test_dataset, resume_from_ckpt=resume_from_ckpt, ) + # diagnose the test set + # trainer.diagnose(dataset=test_dataset, split="test", resume_from_ckpt=ckpt) + return ckpt if __name__ == "__main__": from use_cases.config import gpt_3_model + import json + + import random + + random.seed(2025) + adal.setup_env() + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_false") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + # task = VallinaRAGAdal(**gpt_3_model) # print(task) @@ -153,11 +209,22 @@ def train( # train: 0.15 before the evaluator converted to lower and 0.4 after the conversion # TODO: test debug mode - train( + ckpt = train( debug=False, - max_steps=12, - # 
resume_from_ckpt="/Users/liyin/.adalflow/ckpt/ValinaRAGAdal/random_max_steps_12_7c091_run_1.json", + max_steps=1, + seed=2025, # pass the numpy seed + tg=use_tg, + strategy=set_strategy, + max_proposals_per_step=max_proposals_per_step, + resume_from_ckpt="/Users/liyin/.adalflow/ckpt/VallinaAdal/random_max_steps_24_1511c_run_1.json", ) + print(f"ckpt: {ckpt}") + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") # random_max_steps_12_ecf16_run_9.json, demo only, val 0.6 to 0.68, test: 0.58-0.61 # random_max_steps_12_7c091_run_1.json, prompt + demo, 0.58 -0.62, test: 0.55 - 0.58 # resume from random_max_steps_12_7c091_run_1.json diff --git a/benchmarks/hotpot_qa/adal_exp/train_vanilla_rag.py b/benchmarks/hotpot_qa/adal_exp/train_vanilla_rag.py new file mode 100644 index 00000000..ddb6c1fd --- /dev/null +++ b/benchmarks/hotpot_qa/adal_exp/train_vanilla_rag.py @@ -0,0 +1,212 @@ +from typing import Any, Callable, Dict, Tuple + +import adalflow as adal +from adalflow.eval.answer_match_acc import AnswerMatchAcc +from adalflow.datasets.types import HotPotQAData + +from benchmarks.hotpot_qa.config import load_datasets +from benchmarks.hotpot_qa.adal_exp.build_vanilla_rag import VanillaRAG +from use_cases.config import gpt_3_model, gpt_4o_model, gpt_3_1106_model + + +class VallinaRAGAdal(adal.AdalComponent): + def __init__( + self, + model_client: adal.ModelClient, + model_kwargs: Dict, + backward_engine_model_config: Dict | None = None, + teacher_model_config: Dict | None = None, + text_optimizer_model_config: Dict | None = None, + ): + task = VanillaRAG( + model_client=model_client, + model_kwargs=model_kwargs, + passages_per_hop=3, + ) + eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item + loss_eval_fn = AnswerMatchAcc(type="f1_score").compute_single_item + + loss_fn = adal.EvalFnToTextLoss( + 
eval_fn=loss_eval_fn, + eval_fn_desc="exact_match: 1 if str(y_gt) == str(y) else 0", + ) + super().__init__( + task=task, + eval_fn=eval_fn, + loss_eval_fn=loss_eval_fn, + loss_fn=loss_fn, + backward_engine_model_config=backward_engine_model_config, + teacher_model_config=teacher_model_config, + text_optimizer_model_config=text_optimizer_model_config, + ) + + def prepare_task(self, sample: HotPotQAData) -> Tuple[Callable[..., Any], Dict]: + if self.task.training: + return self.task.forward, {"question": sample.question, "id": sample.id} + else: + return self.task.call, {"question": sample.question, "id": sample.id} + + def prepare_eval(self, sample: HotPotQAData, y_pred: adal.GeneratorOutput) -> float: + y_label = "" + if y_pred and y_pred.data and y_pred.data.answer: + y_label = y_pred.data.answer # .lower() + # printc(f"y_label: {y_label}, y_gt: {sample.answer}") + return self.eval_fn, {"y": y_label, "y_gt": sample.answer} + + def prepare_loss_eval(self, sample: Any, y_pred: Any, *args, **kwargs) -> float: + y_label = "" + if y_pred and y_pred.data and y_pred.data.answer: + y_label = y_pred.data.answer + return self.loss_eval_fn, {"y": y_label, "y_gt": sample.answer} + + def prepare_loss(self, sample: HotPotQAData, pred: adal.Parameter): + y_gt = adal.Parameter( + name="y_gt", + data=sample.answer, + eval_input=sample.answer, + requires_opt=False, + ) + + pred.eval_input = ( + pred.data.data.answer + if pred.data and pred.data.data and pred.data.data.answer + else "" + ) + return self.loss_fn, { + "kwargs": {"y": pred, "y_gt": y_gt}, + "id": sample.id, + # "gt": sample.answer, + } + + +def train_diagnose( + model_client: adal.ModelClient, + model_kwargs: Dict, +) -> Dict: + + trainset, valset, testset = load_datasets() + + adal_component = VallinaRAGAdal( + model_client, + model_kwargs, + backward_engine_model_config=gpt_4o_model, + teacher_model_config=gpt_3_model, + text_optimizer_model_config=gpt_3_model, + ) + trainer = adal.Trainer(adaltask=adal_component) + 
trainer.diagnose(dataset=trainset, split="train") + # trainer.diagnose(dataset=valset, split="val") + # trainer.diagnose(dataset=testset, split="test") + + +from adalflow.core.generator import BackwardPassSetup + + +def train( + train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle + raw_shots: int = 0, + bootstrap_shots: int = 4, + max_steps=1, + num_workers=4, + strategy="constrained", + optimization_order="sequential", + debug=False, + resume_from_ckpt=None, + exclude_input_fields_from_bootstrap_demos=True, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, +): + adal_component = VallinaRAGAdal( + **gpt_3_1106_model, + teacher_model_config=gpt_4o_model, + text_optimizer_model_config=gpt_4o_model, + backward_engine_model_config=gpt_4o_model, + ) + print(adal_component) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + trainer = adal.Trainer( + train_batch_size=train_batch_size, + adaltask=adal_component, + strategy=strategy, + max_steps=max_steps, + num_workers=num_workers, + raw_shots=raw_shots, + bootstrap_shots=bootstrap_shots, + debug=debug, + weighted_sampling=False, + optimization_order=optimization_order, + exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, + max_proposals_per_step=max_proposals_per_step, + backward_pass_setup=backward_pass_setup, + ) + trainer.set_random_seed(seed) + print(trainer) + + train_dataset, val_dataset, test_dataset = load_datasets() + ckpt, _ = trainer.fit( + train_dataset=train_dataset, + val_dataset=val_dataset, + test_dataset=test_dataset, + resume_from_ckpt=resume_from_ckpt, + ) + return ckpt + + +if __name__ == "__main__": + from use_cases.config import gpt_3_model + + import json + + import random + + random.seed(2025) + + adal.setup_env() + + import argparse + + parser = argparse.ArgumentParser() + + 
parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_false") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + + # task = VallinaRAGAdal(**gpt_3_model) + # print(task) + + # train_diagnose(**gpt_3_model) + + ckpt = train( + debug=True, + max_steps=12, + seed=2025, # pass the numpy seed + tg=use_tg, + strategy=set_strategy, + max_proposals_per_step=max_proposals_per_step, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/VallinaRAGAdal/constrained_max_steps_12_8cdad_run_1.json", + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/VallinaRAGAdal/constrained_max_steps_12_5a4b4_run_1.json", + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/ValinaRAGAdal/random_max_steps_12_7c091_run_1.json", + ) + print(f"ckpt: {ckpt}") + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") diff --git a/benchmarks/hotpot_qa/config.py b/benchmarks/hotpot_qa/config.py index ebdfbb01..a7de7e86 100644 --- a/benchmarks/hotpot_qa/config.py +++ b/benchmarks/hotpot_qa/config.py @@ -1,2 +1,13 @@ dspy_save_path = "benchmarks/BHH_object_count/models/dspy" adal_save_path = "benchmarks/BHH_object_count/models/adal" + +from adalflow.datasets.hotpot_qa import HotPotQA + + +def load_datasets(): + + trainset = HotPotQA(split="train", size=100) # 20 + valset = HotPotQA(split="val", size=100) # 50 + testset = HotPotQA(split="test", size=200) # to keep the same as the dspy #50 + print(f"trainset, valset: {len(trainset)}, {len(valset)}, example: {trainset[0]}") + return trainset, valset, testset diff 
--git a/benchmarks/trec_classification/dspy_train_few_shot_boostrap.py b/benchmarks/trec_classification/dspy_train_few_shot_boostrap.py index 28c7c484..0e93aeb3 100644 --- a/benchmarks/trec_classification/dspy_train_few_shot_boostrap.py +++ b/benchmarks/trec_classification/dspy_train_few_shot_boostrap.py @@ -1,15 +1,18 @@ import dspy import dspy.evaluate from dspy import Example +from adalflow.eval.answer_match_acc import AnswerMatchAcc + # DSPY cache:~/cachedir_joblib/joblib/dsp/modules -turbo = dspy.OpenAI(model="gpt-3.5-turbo") +turbo = dspy.OpenAI(model="gpt-3.5-turbo-0125") gpt_4 = dspy.OpenAI(model="gpt-4o") colbertv2_wiki17_abstracts = dspy.ColBERTv2( url="http://20.102.90.50:2017/wiki17_abstracts" ) -dspy.settings.configure(lm=turbo, rm=colbertv2_wiki17_abstracts) +# dspy.settings.configure(lm=turbo, rm=colbertv2_wiki17_abstracts) +dspy.configure(lm=turbo) class GenerateAnswer(dspy.Signature): @@ -43,7 +46,11 @@ def forward(self, question): def exact_match(example, pred, trace=None): - if str(pred.answer.strip()) == str(example.answer.strip()): + # if str(pred.answer.strip()) == str(example.answer.strip()): + # return True + + eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item + if eval_fn(pred.answer, example.answer): return True return False @@ -86,29 +93,52 @@ def train_signature(trainset, valset, save_path, filename): def train(trainset, valset, save_path, filename): - from dspy.teleprompt import BootstrapFewShotWithRandomSearch + r""" + Use the MIPROv2 teleprompter to train the model. + """ import os + from dspy.teleprompt import MIPROv2 if not os.path.exists(save_path): os.makedirs(save_path) # I dont know how to config teacher_config, cant find their documentation on this. 
- teleprompter = BootstrapFewShotWithRandomSearch( - metric=dspy.evaluate.answer_exact_match, - teacher_settings=dict(lm=gpt_4), - max_rounds=1, - max_bootstrapped_demos=4, - max_labeled_demos=40, + # teleprompter = BootstrapFewShotWithRandomSearch( + # metric=dspy.evaluate.answer_exact_match, + # teacher_settings=dict(lm=gpt_4), + # max_rounds=1, + # max_bootstrapped_demos=4, + # max_labeled_demos=40, + # ) + # compiled_baleen = teleprompter.compile( + # TrecClassifier(), + # # teacher=TrecClassifier(), + # trainset=trainset, + # valset=valset, + # ) + # turbo.inspect_history(n=3) + # compiled_baleen.save(os.path.join(save_path, filename)) + # return compiled_baleen + + tp = MIPROv2( + metric=exact_match, + prompt_model=gpt_4, + task_model=turbo, + num_candidates=30, + init_temperature=1.0, ) - compiled_baleen = teleprompter.compile( + compiled_task = tp.compile( TrecClassifier(), - # teacher=TrecClassifier(), trainset=trainset, valset=valset, + max_bootstrapped_demos=5, + max_labeled_demos=2, + num_batches=12, # MINIBATCH_SIZE = 25, + seed=2025, + requires_permission_to_run=False, ) - turbo.inspect_history(n=3) - compiled_baleen.save(os.path.join(save_path, filename)) - return compiled_baleen + compiled_task.save(os.path.join(save_path, filename)) + return compiled_task def evaluate(devset, compiled_task): @@ -134,14 +164,11 @@ def evaluate(devset, compiled_task): task = TrecClassifier() trainset, valset, testset = load_dspy_datasets() - for data in trainset: - response = task(data.question) - turbo.inspect_history(n=3) - - print(response) - print(data) - break + start_val_acc = evaluate(valset, task) + print("Val start: ", start_val_acc) # 72.89% + start_test_acc = evaluate(testset, task) + print("Test start: ", start_test_acc) # 76.35% dspy_save_path = "benchmarks/trec_classification/dspy_models" import os @@ -153,14 +180,34 @@ def evaluate(devset, compiled_task): # output = evaluate(testset, task) # val start: 61.11, train: 57.5%, # test: 60.42% # 
print(output) + import time + + start = time.time() + # train the model compiled_baleen = train( - trainset, valset, dspy_save_path, "trec_classifier_class_name_2.json" + trainset, valset, dspy_save_path, "trec_classifier_MIPROv2_50_name_2.json" ) - # select class: optimizeed: test: 83.3%, val: 83.3% - evaluate(testset, compiled_baleen) - evaluate(valset, compiled_baleen) - # 80.6 on the test set, 79.9, 86.11 on val set, 81.2 - # 40 raw, 4 bootstrapped, 80.5 val, 86.1 on test, - # with class name: 86.1 val, 82.6 test on 4 bootstrapped, 36 raw + # # select class: optimized: test: 83.3%, val: 83.3% + test_score = evaluate(testset, compiled_baleen) + val_score = evaluate(valset, compiled_baleen) + print("Time taken: ", time.time() - start) + print("Test score: ", test_score) + print("Val score: ", val_score) + + # MIPROv2: 85.6 on test, 81.9 on val, 86.2, 83.1, total trial 12 + # 87.4, 80.1 val + test_scores = [85.6, 86.2, 87.4] + val_scores = [81.9, 83.1, 80.1] + + # compute mean and std + import numpy as np + + mean_test_score = np.mean(test_scores) + std_test_score = np.std(test_scores) + mean_val_score = np.mean(val_scores) + std_val_score = np.std(val_scores) + print( + mean_test_score, std_test_score, mean_val_score, std_val_score + ) # 86.4%, 0.74, 81.7%, 1.23 diff --git a/docs/CHANGLOG.md b/docs/CHANGLOG.md new file mode 100644 index 00000000..585596ab --- /dev/null +++ b/docs/CHANGLOG.md @@ -0,0 +1,3 @@ +## [0.2.7] - 2025-01-16 + +- Added multimodal support in Generator tutorial and more explanation by Filip. 
diff --git a/notebooks/tutorials/adalflow_classification_optimization.ipynb b/notebooks/tutorials/adalflow_classification_optimization.ipynb index 36332387..0d3034f7 100644 --- a/notebooks/tutorials/adalflow_classification_optimization.ipynb +++ b/notebooks/tutorials/adalflow_classification_optimization.ipynb @@ -1,967 +1,968 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# 🤗 Welcome to AdalFlow!\n", + "## The PyTorch library to auto-optimize any LLM task pipelines\n", + "\n", + "Thanks for trying us out, we're here to provide you with the best LLM application development experience you can dream of 😊 any questions or concerns you may have, [come talk to us on discord,](https://discord.gg/ezzszrRZvT) we're always here to help! ⭐ Star us on Github ⭐\n", + "\n", + "\n", + "# Quick Links\n", + "\n", + "Github repo: https://github.com/SylphAI-Inc/AdalFlow\n", + "\n", + "Full Tutorials: https://adalflow.sylph.ai/index.html#.\n", + "\n", + "Deep dive on each API: check out the [developer notes](https://adalflow.sylph.ai/tutorials/index.html).\n", + "\n", + "Common use cases along with the auto-optimization: check out [Use cases](https://adalflow.sylph.ai/use_cases/index.html).\n", + "\n", + "## 📖 Outline\n", + "\n", + "This is the code for a classification optimization tutorial ![image.png]()\n" + ], + "metadata": { + "id": "xHF95Kr4CzGq" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "# Installation\n", + "\n", + "1. Use `pip` to install the `adalflow` Python package. We will need `openai`, `groq` from the extra packages.\n", + "\n", + " ```bash\n", + " pip install adalflow[openai,groq]\n", + " ```\n", + "2. 
Setup `openai` and `groq` API key in the environment variables\n", + "\n", + "You can choose to use different client. You can import the model client you prefer. We support `Anthropic`, `Cohere`, `Google`, `GROQ`, `OpenAI`, `Transformer` and more in development. We will use OpenAI here as an example. Please refer to our [full installation guide](https://adalflow.sylph.ai/get_started/installation.html)" + ], + "metadata": { + "id": "Kof5M6DRaKhh" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tAp3eDjOCma1" + }, + "outputs": [], + "source": [ + "from IPython.display import clear_output\n", + "\n", + "!pip install -U adalflow[openai] # also install the package for the model client you'll use\n", + "!pip install datasets\n", + "clear_output()" + ] + }, + { + "cell_type": "code", + "source": [ + "!pip uninstall httpx anyio -y\n", + "!pip install \"anyio>=3.1.0,<4.0\"\n", + "!pip install httpx==0.24.1" + ], + "metadata": { "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" + "base_uri": "https://localhost:8080/" }, - "language_info": { - "name": "python" + "id": "CU672Gt4bY7b", + "outputId": "532c84d2-c7bd-40ac-c050-e2c5dddc8946" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Found existing installation: httpx 0.28.1\n", + "Uninstalling httpx-0.28.1:\n", + " Successfully uninstalled httpx-0.28.1\n", + "Found existing installation: anyio 3.7.1\n", + "Uninstalling anyio-3.7.1:\n", + " Successfully uninstalled anyio-3.7.1\n", + "/bin/bash: line 1: 4.0”: No such file or directory\n", + "Collecting httpx==0.24.1\n", + " Downloading httpx-0.24.1-py3-none-any.whl.metadata (7.4 kB)\n", + "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx==0.24.1) (2024.8.30)\n", + "Collecting httpcore<0.18.0,>=0.15.0 (from httpx==0.24.1)\n", + " Downloading httpcore-0.17.3-py3-none-any.whl.metadata 
(18 kB)\n", + "Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from httpx==0.24.1) (3.10)\n", + "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx==0.24.1) (1.3.1)\n", + "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore<0.18.0,>=0.15.0->httpx==0.24.1) (0.14.0)\n", + "Collecting anyio<5.0,>=3.0 (from httpcore<0.18.0,>=0.15.0->httpx==0.24.1)\n", + " Downloading anyio-4.7.0-py3-none-any.whl.metadata (4.7 kB)\n", + "Requirement already satisfied: exceptiongroup>=1.0.2 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->httpcore<0.18.0,>=0.15.0->httpx==0.24.1) (1.2.2)\n", + "Requirement already satisfied: typing_extensions>=4.5 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->httpcore<0.18.0,>=0.15.0->httpx==0.24.1) (4.12.2)\n", + "Downloading httpx-0.24.1-py3-none-any.whl (75 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.4/75.4 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading httpcore-0.17.3-py3-none-any.whl (74 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.5/74.5 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading anyio-4.7.0-py3-none-any.whl (93 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.1/93.1 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: anyio, httpcore, httpx\n", + " Attempting uninstall: httpcore\n", + " Found existing installation: httpcore 1.0.7\n", + " Uninstalling httpcore-1.0.7:\n", + " Successfully uninstalled httpcore-1.0.7\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "jupyter-server 1.24.0 requires anyio<4,>=3.1.0, but you have anyio 4.7.0 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed anyio-4.7.0 httpcore-0.17.3 httpx-0.24.1\n" + ] } + ] }, - "cells": [ - { - "cell_type": "markdown", - "source": [ - "# 🤗 Welcome to AdalFlow!\n", - "## The PyTorch library to auto-optimize any LLM task pipelines\n", - "\n", - "Thanks for trying us out, we're here to provide you with the best LLM application development experience you can dream of 😊 any questions or concerns you may have, [come talk to us on discord,](https://discord.gg/ezzszrRZvT) we're always here to help! ⭐ Star us on Github ⭐\n", - "\n", - "\n", - "# Quick Links\n", - "\n", - "Github repo: https://github.com/SylphAI-Inc/AdalFlow\n", - "\n", - "Full Tutorials: https://adalflow.sylph.ai/index.html#.\n", - "\n", - "Deep dive on each API: check out the [developer notes](https://adalflow.sylph.ai/tutorials/index.html).\n", - "\n", - "Common use cases along with the auto-optimization: check out [Use cases](https://adalflow.sylph.ai/use_cases/index.html).\n", - "\n", - "## 📖 Outline\n", - "\n", - "This is the code for a classification optimization tutorial ![image.png]()\n" - ], - "metadata": { - "id": "xHF95Kr4CzGq" - } - }, - { - "cell_type": "markdown", - "source": [ - "\n", - "# Installation\n", - "\n", - "1. Use `pip` to install the `adalflow` Python package. We will need `openai`, `groq` from the extra packages.\n", - "\n", - " ```bash\n", - " pip install adalflow[openai,groq]\n", - " ```\n", - "2. Setup `openai` and `groq` API key in the environment variables\n", - "\n", - "You can choose to use different client. You can import the model client you prefer. We support `Anthropic`, `Cohere`, `Google`, `GROQ`, `OpenAI`, `Transformer` and more in development. 
We will use OpenAI here as an example.Please refer to our [full installation guide](https://adalflow.sylph.ai/get_started/installation.html)" - ], - "metadata": { - "id": "Kof5M6DRaKhh" - } + { + "cell_type": "markdown", + "source": [ + "## Set Environment Variables\n", + "\n", + "Run the following code and pass your API key.\n", + "\n", + "Note: for normal `.py` projects, follow our [official installation guide](https://adalflow.sylph.ai/get_started/installation.html).\n", + "\n", + "*Go to [OpenAI](https://platform.openai.com/docs/introduction) to get API keys if you don't already have one.*" + ], + "metadata": { + "id": "KapUyHMM07pJ" + } + }, + { + "cell_type": "code", + "source": [ + "import os\n", + "\n", + "from getpass import getpass\n", + "\n", + "# Prompt user to enter their API keys securely\n", + "openai_api_key = getpass(\"Please enter your OpenAI API key: \")\n", + "\n", + "\n", + "# Set environment variables\n", + "os.environ[\"OPENAI_API_KEY\"] = openai_api_key\n", + "\n", + "print(\"API keys have been set.\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "ONfzF9Puzdd_", + "outputId": "a8ca0388-be6e-4b7a-cd05-d4ec52f64e95" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tAp3eDjOCma1" - }, - "outputs": [], - "source": [ - "from IPython.display import clear_output\n", - "\n", - "!pip install -U adalflow[openai] # also install the package for the model client you'll use\n", - "!pip install datasets\n", - "clear_output()" - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Please enter your OpenAI API key: ··········\n", + "API keys have been set.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Prepare data structures and prompt template" + ], + "metadata": { + "id": "4W3yEpRpepNK" + } + }, + { + "cell_type": "code", + "source": [ + "from dataclasses import dataclass, field\n", + "from typing import List, 
Dict, Union, Optional, Tuple, Any, Callable\n", + "from datasets import load_dataset\n", + "from adalflow.components.model_client import OpenAIClient\n", + "import adalflow as adal\n", + "from adalflow.core.component import Component\n", + "from adalflow.datasets.types import TrecData\n", + "from adalflow.datasets.trec import TrecDataset\n", + "\n", + "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n", + "\n", + "\n", + "_COARSE_LABELS = [\"ABBR\", \"DESC\", \"ENTY\", \"HUM\", \"LOC\", \"NUM\"]\n", + "\n", + "_COARSE_LABELS_DESC = [\n", + " \"Abbreviation: Questions about abbreviations and their meanings\",\n", + " \"Description: Questions seeking descriptions of people, things, or concepts\",\n", + " \"Entity: Questions about entities (e.g., animals, colors, inventions)\",\n", + " \"Human: Questions about people or organizations\",\n", + " \"Location: Questions about places, cities, countries\",\n", + " \"Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\",\n", + "]\n", + "\n", + "\n", + "template = r\"\"\"\n", + " {{system_prompt}}\n", + " {% if output_format_str is not none %}\n", + " {{output_format_str}}\n", + " {% endif %}\n", + " {% if few_shot_demos is not none %}\n", + " Here are some examples:\n", + " {{few_shot_demos}}\n", + " {% endif %}\n", + " \n", + " \n", + " {{input_str}}\n", + " \n", + " \"\"\"\n", + "\n", + "task_desc_template = r\"\"\"You are a classifier. Given a question, you need to classify it into one of the following classes:\n", + " Format: class_index. class_name, class_description\n", + " {% if classes %}\n", + " {% for class in classes %}\n", + " {{loop.index-1}}. 
{{class.label}}, {{class.desc}}\n", + " {% endfor %}\n", + " {% endif %}\n", + " - Do not try to answer the question:\n", + " \"\"\"\n", + "\n", + "\n", + "@dataclass\n", + "class TRECExtendedData(TrecData):\n", + " rationale: str = field(\n", + " metadata={\n", + " \"desc\": \"Your step-by-step reasoning to classify the question to class_name\"\n", + " },\n", + " default=None,\n", + " )\n", + " __input_fields__ = [\"question\"]\n", + " __output_fields__ = [\n", + " \"rationale\",\n", + " \"class_name\",\n", + " ] # it is important to have the rationale before the class_name\n", + "\n", + "\n", + "def load_datasets():\n", + " \"\"\"Load the dataset\"\"\"\n", + " train_data = TrecDataset(split=\"train\")\n", + " val_data = TrecDataset(split=\"val\")\n", + " test_data = TrecDataset(split=\"test\")\n", + " return train_data, val_data, test_data # 0.694, 0.847" + ], + "metadata": { + "id": "ZZIEtZYHNVjo" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# prepare models\n", + "\n", + "from adalflow.components.model_client.openai_client import OpenAIClient\n", + "\n", + "# used as the target model\n", + "gpt_3_model = {\n", + " \"model_client\": OpenAIClient(),\n", + " \"model_kwargs\": {\n", + " \"model\": \"gpt-3.5-turbo\",\n", + " \"max_tokens\": 2000,\n", + " \"temperature\": 0.0,\n", + " \"top_p\": 0.99,\n", + " \"frequency_penalty\": 0,\n", + " \"presence_penalty\": 0,\n", + " \"stop\": None,\n", + " },\n", + "}\n", + "\n", + "# used as optimizer and backward engine\n", + "gpt_4o_mini_model = {\n", + " \"model_client\": OpenAIClient(),\n", + " \"model_kwargs\": {\n", + " \"model\": \"gpt-4o-mini\",\n", + " \"temperature\": 1,\n", + " \"top_p\": 0.99,\n", + " \"max_tokens\": 1000,\n", + " # \"frequency_penalty\": 1, # high for nto repeating prompt\n", + " },\n", + "}" + ], + "metadata": { + "id": "yAvzn7DZeUX-" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Create 
the task pipeline" + ], + "metadata": { + "id": "G664uy9MgDdC" + } + }, + { + "cell_type": "code", + "source": [ + "class TRECClassifierStructuredOutput(adal.Component):\n", + "\n", + " def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):\n", + " super().__init__()\n", + "\n", + " label_desc = [\n", + " {\"label\": label, \"desc\": desc}\n", + " for label, desc in zip(_COARSE_LABELS, _COARSE_LABELS_DESC)\n", + " ]\n", + "\n", + " task_desc_str = adal.Prompt(\n", + " template=task_desc_template, prompt_kwargs={\"classes\": label_desc}\n", + " )()\n", + "\n", + " self.data_class = TRECExtendedData\n", + " self.data_class.set_task_desc(task_desc_str)\n", + "\n", + " self.parser = adal.DataClassParser(\n", + " data_class=self.data_class, return_data_class=True, format_type=\"yaml\"\n", + " )\n", + "\n", + " prompt_kwargs = {\n", + " \"system_prompt\": adal.Parameter(\n", + " data=self.parser.get_task_desc_str(),\n", + " role_desc=\"Task description\",\n", + " requires_opt=True,\n", + " param_type=adal.ParameterType.PROMPT,\n", + " ),\n", + " \"output_format_str\": adal.Parameter(\n", + " data=self.parser.get_output_format_str(),\n", + " role_desc=\"Output format requirements\",\n", + " requires_opt=False,\n", + " param_type=adal.ParameterType.PROMPT,\n", + " ),\n", + " \"few_shot_demos\": adal.Parameter(\n", + " data=None,\n", + " requires_opt=True,\n", + " role_desc=\"Few shot examples to help the model\",\n", + " param_type=adal.ParameterType.DEMOS,\n", + " ),\n", + " }\n", + "\n", + " self.llm = adal.Generator(\n", + " model_client=model_client,\n", + " model_kwargs=model_kwargs,\n", + " prompt_kwargs=prompt_kwargs,\n", + " template=template,\n", + " output_processors=self.parser,\n", + " use_cache=True,\n", + " )\n", + "\n", + " def _prepare_input(self, question: str):\n", + " input_data = self.data_class(question=question)\n", + " input_str = self.parser.get_input_str(input_data)\n", + " prompt_kwargs = {\n", + " \"input_str\": 
adal.Parameter(\n", + " data=input_str, requires_opt=False, role_desc=\"input to the LLM\"\n", + " )\n", + " }\n", + " return prompt_kwargs\n", + "\n", + " def call(\n", + " self, question: str, id: Optional[str] = None\n", + " ) -> Union[adal.GeneratorOutput, adal.Parameter]:\n", + " prompt_kwargs = self._prepare_input(question)\n", + " output = self.llm(prompt_kwargs=prompt_kwargs, id=id)\n", + " return output" + ], + "metadata": { + "id": "3Q3H9XC4Ncfi" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Inference the task pipeline and draw the computation graph" + ], + "metadata": { + "id": "gj08oOqqgGyr" + } + }, + { + "cell_type": "code", + "source": [ + "# load dataset to get one example\n", + "\n", + "train_dataset, val_dataset, test_dataset = load_datasets()\n", + "example = train_dataset[0]\n", + "print(example)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "qtvLN8zOgnSg", + "outputId": "9996f8c3-371d-4b5c-ec48-e8cf6d6c396b" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "!pip uninstall httpx anyio -y\n", - "!pip install “anyio>=3.1.0,<4.0”\n", - "!pip install httpx==0.24.1" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "CU672Gt4bY7b", - "outputId": "532c84d2-c7bd-40ac-c050-e2c5dddc8946" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Found existing installation: httpx 0.28.1\n", - "Uninstalling httpx-0.28.1:\n", - " Successfully uninstalled httpx-0.28.1\n", - "Found existing installation: anyio 3.7.1\n", - "Uninstalling anyio-3.7.1:\n", - " Successfully uninstalled anyio-3.7.1\n", - "/bin/bash: line 1: 4.0”: No such file or directory\n", - "Collecting httpx==0.24.1\n", - " Downloading httpx-0.24.1-py3-none-any.whl.metadata (7.4 kB)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages 
(from httpx==0.24.1) (2024.8.30)\n", - "Collecting httpcore<0.18.0,>=0.15.0 (from httpx==0.24.1)\n", - " Downloading httpcore-0.17.3-py3-none-any.whl.metadata (18 kB)\n", - "Requirement already satisfied: idna in /usr/local/lib/python3.10/dist-packages (from httpx==0.24.1) (3.10)\n", - "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from httpx==0.24.1) (1.3.1)\n", - "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore<0.18.0,>=0.15.0->httpx==0.24.1) (0.14.0)\n", - "Collecting anyio<5.0,>=3.0 (from httpcore<0.18.0,>=0.15.0->httpx==0.24.1)\n", - " Downloading anyio-4.7.0-py3-none-any.whl.metadata (4.7 kB)\n", - "Requirement already satisfied: exceptiongroup>=1.0.2 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->httpcore<0.18.0,>=0.15.0->httpx==0.24.1) (1.2.2)\n", - "Requirement already satisfied: typing_extensions>=4.5 in /usr/local/lib/python3.10/dist-packages (from anyio<5.0,>=3.0->httpcore<0.18.0,>=0.15.0->httpx==0.24.1) (4.12.2)\n", - "Downloading httpx-0.24.1-py3-none-any.whl (75 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.4/75.4 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading httpcore-0.17.3-py3-none-any.whl (74 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.5/74.5 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hDownloading anyio-4.7.0-py3-none-any.whl (93 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m93.1/93.1 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: anyio, httpcore, httpx\n", - " Attempting uninstall: httpcore\n", - " Found existing installation: httpcore 1.0.7\n", - " Uninstalling httpcore-1.0.7:\n", - " Successfully uninstalled 
httpcore-1.0.7\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "jupyter-server 1.24.0 requires anyio<4,>=3.1.0, but you have anyio 4.7.0 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed anyio-4.7.0 httpcore-0.17.3 httpx-0.24.1\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "TrecData(id='e73a82a7-6a3d-4947-90f5-03739e169db0', question='When reading classified ads , what does EENTY : other stand for ?', class_name='ABBR', class_index=0)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "task = TRECClassifierStructuredOutput(\n", + " model_client=gpt_3_model[\"model_client\"],\n", + " model_kwargs=gpt_3_model[\"model_kwargs\"],\n", + ")\n", + "task.train()\n", + "\n", + "output = task(question=example.question, id=example.id)\n", + "print(output)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "cKuW3QlhgLTG", + "outputId": "7f1f9cd6-9615-4b41-ecc5-5901626d57ae" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "markdown", - "source": [ - "## Set Environment Variables\n", - "\n", - "Run the following code and pass your api key.\n", - "\n", - "Note: for normal `.py` projects, follow our [official installation guide](https://lightrag.sylph.ai/get_started/installation.html).\n", - "\n", - "*Go to [OpenAI](https://platform.openai.com/docs/introduction) to get API keys if you don't already have.*" - ], - "metadata": { - "id": "KapUyHMM07pJ" - } + "output_type": "stream", + "name": "stdout", + "text": [ + "Parameter(name=Generator_output, requires_opt=True, param_type=generator_output (The output of the generator.), role_desc=Output from (llm) Generator, data=```\n", + "rationale: The question is asking for the meaning of the abbreviation \"EENTY\" in classified ads, which falls under the ABBR class.\n", 
+ "class_name: ABBR\n", + "```, predecessors={Parameter(name=Output_for, requires_opt=False, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=Output format requirements, data=Your output should be formatted as a standard YAML instance with the following schema:\n", + "```\n", + "rationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\n", + "class_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)\n", + "```\n", + "-Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\n", + "-Follow the YAML formatting conventions with an indent of 2 spaces.\n", + "-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\n", + "-Quote the string values properly., predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), Parameter(name=Few_shot_e, requires_opt=True, param_type=demos (A few examples to guide the language model.), role_desc=Few shot examples to help the model, data=None, predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), Parameter(name=Input_to_t, requires_opt=False, param_type=none (), role_desc=input to the LLM, data=question: 'When reading classified ads , what does EENTY : other stand for ?', predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), Parameter(name=Task_descr, requires_opt=True, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=Task description, data=You are a classifier. Given a question, you need to classify it into one of the following classes:\n", + " Format: class_index. class_name, class_description\n", + " 0. ABBR, Abbreviation: Questions about abbreviations and their meanings\n", + " 1. DESC, Description: Questions seeking descriptions of people, things, or concepts\n", + " 2. 
ENTY, Entity: Questions about entities (e.g., animals, colors, inventions)\n", + " 3. HUM, Human: Questions about people or organizations\n", + " 4. LOC, Location: Questions about places, cities, countries\n", + " 5. NUM, Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\n", + " - Do not try to answer the question:\n", + " , predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={})}, gradients=[], raw_response=None, input_args={'prompt_kwargs': {'system_prompt': Parameter(name=Task_descr, requires_opt=True, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=Task description, data=You are a classifier. Given a question, you need to classify it into one of the following classes:\n", + " Format: class_index. class_name, class_description\n", + " 0. ABBR, Abbreviation: Questions about abbreviations and their meanings\n", + " 1. DESC, Description: Questions seeking descriptions of people, things, or concepts\n", + " 2. ENTY, Entity: Questions about entities (e.g., animals, colors, inventions)\n", + " 3. HUM, Human: Questions about people or organizations\n", + " 4. LOC, Location: Questions about places, cities, countries\n", + " 5. 
NUM, Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\n", + " - Do not try to answer the question:\n", + " , predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), 'output_format_str': Parameter(name=Output_for, requires_opt=False, param_type=prompt (Instruction to the language model on task, data, and format.), role_desc=Output format requirements, data=Your output should be formatted as a standard YAML instance with the following schema:\n", + "```\n", + "rationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\n", + "class_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)\n", + "```\n", + "-Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\n", + "-Follow the YAML formatting conventions with an indent of 2 spaces.\n", + "-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\n", + "-Quote the string values properly., predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), 'few_shot_demos': Parameter(name=Few_shot_e, requires_opt=True, param_type=demos (A few examples to guide the language model.), role_desc=Few shot examples to help the model, data=None, predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={}), 'input_str': Parameter(name=Input_to_t, requires_opt=False, param_type=none (), role_desc=input to the LLM, data=question: 'When reading classified ads , what does EENTY : other stand for ?', predecessors=set(), gradients=[], raw_response=None, input_args=None, traces={})}, 'model_kwargs': {'model': 'gpt-3.5-turbo', 'max_tokens': 2000, 'temperature': 0.0, 'top_p': 0.99, 'frequency_penalty': 0, 'presence_penalty': 0, 'stop': None}}, traces={})\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "class TrecClassifierAdal(adal.AdalComponent):\n", + " def __init__(\n", + 
" self,\n", + " model_client: adal.ModelClient,\n", + " model_kwargs: Dict,\n", + " teacher_model_config: Dict,\n", + " backward_engine_model_config: Dict,\n", + " text_optimizer_model_config: Dict,\n", + " ):\n", + " task = TRECClassifierStructuredOutput(model_client, model_kwargs)\n", + " eval_fn = AnswerMatchAcc(type=\"exact_match\").compute_single_item\n", + " loss_fn = adal.EvalFnToTextLoss(\n", + " eval_fn=eval_fn,\n", + " eval_fn_desc=\"exact_match: 1 if str(y) == str(y_gt) else 0\",\n", + " )\n", + " super().__init__(\n", + " task=task,\n", + " eval_fn=eval_fn,\n", + " loss_fn=loss_fn,\n", + " backward_engine_model_config=backward_engine_model_config,\n", + " text_optimizer_model_config=text_optimizer_model_config,\n", + " teacher_model_config=teacher_model_config,\n", + " )\n", + "\n", + " def prepare_task(self, sample: TRECExtendedData):\n", + " return self.task.call, {\"question\": sample.question, \"id\": sample.id}\n", + "\n", + " def prepare_eval(\n", + " self, sample: TRECExtendedData, y_pred: adal.GeneratorOutput\n", + " ) -> float:\n", + " y_label = -1\n", + " if y_pred and y_pred.data is not None and y_pred.data.class_name is not None:\n", + " y_label = y_pred.data.class_name\n", + " return self.eval_fn, {\"y\": y_label, \"y_gt\": sample.class_name}\n", + "\n", + " def prepare_loss(\n", + " self, sample: TRECExtendedData, y_pred: adal.Parameter, *args, **kwargs\n", + " ) -> Tuple[Callable[..., Any], Dict]:\n", + " full_response = y_pred.full_response\n", + " y_label = -1\n", + " if (\n", + " full_response\n", + " and full_response.data is not None\n", + " and full_response.data.class_name is not None\n", + " ):\n", + " y_label = full_response.data.class_name\n", + "\n", + " y_pred.eval_input = y_label\n", + " y_gt = adal.Parameter(\n", + " name=\"y_gt\",\n", + " data=sample.class_name,\n", + " eval_input=sample.class_name,\n", + " requires_opt=False,\n", + " )\n", + " return self.loss_fn, {\"kwargs\": {\"y\": y_pred, \"y_gt\": y_gt}}" + ], + 
"metadata": { + "id": "HpkQYsh2NevT" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def train(\n", + " model_client: adal.ModelClient,\n", + " model_kwargs: Dict,\n", + " train_batch_size=4,\n", + " raw_shots: int = 0,\n", + " bootstrap_shots: int = 1,\n", + " max_steps=12,\n", + " num_workers=4,\n", + " strategy=\"constrained\",\n", + " optimization_order=\"sequential\",\n", + " debug=False,\n", + "):\n", + " print(\"Starting training process...\")\n", + "\n", + " # Define the model configuration for all components\n", + " gpt_4o_model = {\n", + " \"model_client\": OpenAIClient(),\n", + " \"model_kwargs\": {\n", + " \"model\": \"gpt-4o-mini\",\n", + " \"temperature\": 1,\n", + " \"top_p\": 0.99,\n", + " \"max_tokens\": 1000,\n", + " # \"frequency_penalty\": 1, # high for nto repeating prompt\n", + " },\n", + " }\n", + "\n", + " print(f\"Component model configuration: {gpt_4o_model}\")\n", + "\n", + " try:\n", + " print(\"Initializing ADAL component...\")\n", + " adal_component = TrecClassifierAdal(\n", + " model_client=model_client,\n", + " model_kwargs=model_kwargs,\n", + " text_optimizer_model_config=gpt_4o_model,\n", + " backward_engine_model_config=gpt_4o_model,\n", + " teacher_model_config=gpt_4o_model,\n", + " )\n", + " print(\"ADAL component initialized successfully\")\n", + "\n", + " print(\"Initializing trainer...\")\n", + " trainer = adal.Trainer(\n", + " train_batch_size=train_batch_size,\n", + " adaltask=adal_component,\n", + " strategy=strategy,\n", + " max_steps=max_steps,\n", + " num_workers=num_workers,\n", + " raw_shots=raw_shots,\n", + " bootstrap_shots=bootstrap_shots,\n", + " debug=debug,\n", + " weighted_sampling=True,\n", + " optimization_order=optimization_order,\n", + " exclude_input_fields_from_bootstrap_demos=True,\n", + " )\n", + " print(\"Trainer initialized successfully\")\n", + "\n", + " print(\"Loading datasets...\")\n", + " train_dataset, val_dataset, test_dataset = 
load_datasets()\n", + " print(\n", + " f\"Datasets loaded - Train size: {len(train_dataset)}, Val size: {len(val_dataset)}, Test size: {len(test_dataset)}\"\n", + " )\n", + "\n", + " print(\"Starting model training...\")\n", + " trainer.fit(\n", + " train_dataset=train_dataset,\n", + " val_dataset=test_dataset,\n", + " debug=debug,\n", + " )\n", + " print(\"Training completed successfully\")\n", + "\n", + " except Exception as e:\n", + " print(f\"Error occurred: {str(e)}\")\n", + " raise" + ], + "metadata": { + "id": "PEj6xiZ5dVaj" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "train(**gpt_3_model)" + ], + "metadata": { + "id": "GnlZBQOMEj6E", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, + "outputId": "055a95c4-ccae-4028-d904-86b839bc1c14" + }, + "execution_count": null, + "outputs": [ { - "cell_type": "code", - "source": [ - "import os\n", - "\n", - "from getpass import getpass\n", - "\n", - "# Prompt user to enter their API keys securely\n", - "openai_api_key = getpass(\"Please enter your OpenAI API key: \")\n", - "\n", - "\n", - "# Set environment variables\n", - "os.environ[\"OPENAI_API_KEY\"] = openai_api_key\n", - "\n", - "print(\"API keys have been set.\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "ONfzF9Puzdd_", - "outputId": "a8ca0388-be6e-4b7a-cd05-d4ec52f64e95" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Please enter your OpenAI API key: ··········\n", - "API keys have been set.\n" - ] - } - ] + "output_type": "stream", + "name": "stdout", + "text": [ + "Starting training process...\n", + "Component model configuration: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o-mini', 'temperature': 1, 'top_p': 0.99, 'max_tokens': 1000}}\n", + "Initializing ADAL component...\n", + "ADAL component initialized successfully\n", + "Initializing trainer...\n", 
+ "Trainer initialized successfully\n", + "Loading datasets...\n", + "Datasets loaded - Train size: 120, Val size: 36, Test size: 144\n", + "Starting model training...\n", + "raw_shots: 0, bootstrap_shots: 1\n", + "Configuring teacher generator.\n", + "Configuring teacher generator for Generator(\n", + " model_kwargs={'model': 'gpt-4o-mini', 'temperature': 1, 'top_p': 0.99, 'max_tokens': 1000}, trainable_prompt_kwargs=[]\n", + " (prompt): Prompt(\n", + " template: \n", + " {{system_prompt}}\n", + " {% if output_format_str is not none %}\n", + " {{output_format_str}}\n", + " {% endif %}\n", + " {% if few_shot_demos is not none %}\n", + " Here are some examples:\n", + " {{few_shot_demos}}\n", + " {% endif %}\n", + " \n", + " \n", + " {{input_str}}\n", + " \n", + " , prompt_kwargs: {'system_prompt': 'You are a classifier. Given a question, you need to classify it into one of the following classes:\\n Format: class_index. class_name, class_description\\n 0. ABBR, Abbreviation: Questions about abbreviations and their meanings\\n 1. DESC, Description: Questions seeking descriptions of people, things, or concepts\\n 2. ENTY, Entity: Questions about entities (e.g., animals, colors, inventions)\\n 3. HUM, Human: Questions about people or organizations\\n 4. LOC, Location: Questions about places, cities, countries\\n 5. NUM, Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\\n - Do not try to answer the question:\\n ', 'output_format_str': 'Your output should be formatted as a standard YAML instance with the following schema:\\n```\\nrationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\\nclass_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)\\n```\\n-Make sure to always enclose the YAML output in triple backticks (```). 
Please do not add anything other than valid YAML output!\\n-Follow the YAML formatting conventions with an indent of 2 spaces.\\n-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\\n-Quote the string values properly.', 'few_shot_demos': 'None'}, prompt_variables: ['output_format_str', 'system_prompt', 'input_str', 'few_shot_demos']\n", + " )\n", + " (model_client): OpenAIClient()\n", + " (output_processors): DataClassParser(\n", + " data_class=TRECExtendedData, format_type=yaml, return_data_class=True, input_fields=['question'], output_fields=['rationale', 'class_name']\n", + " (_output_processor): YamlParser()\n", + " (output_format_prompt): Prompt(\n", + " template: Your output should be formatted as a standard YAML instance with the following schema:\n", + " ```\n", + " {{schema}}\n", + " ```\n", + " -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\n", + " -Follow the YAML formatting conventions with an indent of 2 spaces.\n", + " -DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\n", + " -Quote the string values properly., prompt_variables: ['schema']\n", + " )\n", + " )\n", + ")\n", + "Teacher generator set: Generator(\n", + " model_kwargs={'model': 'gpt-4o-mini', 'temperature': 1, 'top_p': 0.99, 'max_tokens': 1000}, trainable_prompt_kwargs=[]\n", + " (prompt): Prompt(\n", + " template: \n", + " {{system_prompt}}\n", + " {% if output_format_str is not none %}\n", + " {{output_format_str}}\n", + " {% endif %}\n", + " {% if few_shot_demos is not none %}\n", + " Here are some examples:\n", + " {{few_shot_demos}}\n", + " {% endif %}\n", + " \n", + " \n", + " {{input_str}}\n", + " \n", + " , prompt_kwargs: {'system_prompt': 'You are a classifier. Given a question, you need to classify it into one of the following classes:\\n Format: class_index. class_name, class_description\\n 0. 
ABBR, Abbreviation: Questions about abbreviations and their meanings\\n 1. DESC, Description: Questions seeking descriptions of people, things, or concepts\\n 2. ENTY, Entity: Questions about entities (e.g., animals, colors, inventions)\\n 3. HUM, Human: Questions about people or organizations\\n 4. LOC, Location: Questions about places, cities, countries\\n 5. NUM, Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\\n - Do not try to answer the question:\\n ', 'output_format_str': 'Your output should be formatted as a standard YAML instance with the following schema:\\n```\\nrationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\\nclass_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)\\n```\\n-Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\\n-Follow the YAML formatting conventions with an indent of 2 spaces.\\n-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\\n-Quote the string values properly.', 'few_shot_demos': 'None'}, prompt_variables: ['output_format_str', 'system_prompt', 'input_str', 'few_shot_demos']\n", + " )\n", + " (model_client): OpenAIClient()\n", + " (output_processors): DataClassParser(\n", + " data_class=TRECExtendedData, format_type=yaml, return_data_class=True, input_fields=['question'], output_fields=['rationale', 'class_name']\n", + " (_output_processor): YamlParser()\n", + " (output_format_prompt): Prompt(\n", + " template: Your output should be formatted as a standard YAML instance with the following schema:\n", + " ```\n", + " {{schema}}\n", + " ```\n", + " -Make sure to always enclose the YAML output in triple backticks (```). 
Please do not add anything other than valid YAML output!\n", + " -Follow the YAML formatting conventions with an indent of 2 spaces.\n", + " -DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\n", + " -Quote the string values properly., prompt_variables: ['schema']\n", + " )\n", + " )\n", + "), teacher Generator(\n", + " model_kwargs={'model': 'gpt-4o-mini', 'temperature': 1, 'top_p': 0.99, 'max_tokens': 1000}, trainable_prompt_kwargs=[]\n", + " (prompt): Prompt(\n", + " template: \n", + " {{system_prompt}}\n", + " {% if output_format_str is not none %}\n", + " {{output_format_str}}\n", + " {% endif %}\n", + " {% if few_shot_demos is not none %}\n", + " Here are some examples:\n", + " {{few_shot_demos}}\n", + " {% endif %}\n", + " \n", + " \n", + " {{input_str}}\n", + " \n", + " , prompt_kwargs: {'system_prompt': 'You are a classifier. Given a question, you need to classify it into one of the following classes:\\n Format: class_index. class_name, class_description\\n 0. ABBR, Abbreviation: Questions about abbreviations and their meanings\\n 1. DESC, Description: Questions seeking descriptions of people, things, or concepts\\n 2. ENTY, Entity: Questions about entities (e.g., animals, colors, inventions)\\n 3. HUM, Human: Questions about people or organizations\\n 4. LOC, Location: Questions about places, cities, countries\\n 5. NUM, Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\\n - Do not try to answer the question:\\n ', 'output_format_str': 'Your output should be formatted as a standard YAML instance with the following schema:\\n```\\nrationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\\nclass_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)\\n```\\n-Make sure to always enclose the YAML output in triple backticks (```). 
Please do not add anything other than valid YAML output!\\n-Follow the YAML formatting conventions with an indent of 2 spaces.\\n-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\\n-Quote the string values properly.', 'few_shot_demos': 'None'}, prompt_variables: ['output_format_str', 'system_prompt', 'input_str', 'few_shot_demos']\n", + " )\n", + " (model_client): OpenAIClient()\n", + " (output_processors): DataClassParser(\n", + " data_class=TRECExtendedData, format_type=yaml, return_data_class=True, input_fields=['question'], output_fields=['rationale', 'class_name']\n", + " (_output_processor): YamlParser()\n", + " (output_format_prompt): Prompt(\n", + " template: Your output should be formatted as a standard YAML instance with the following schema:\n", + " ```\n", + " {{schema}}\n", + " ```\n", + " -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\n", + " -Follow the YAML formatting conventions with an indent of 2 spaces.\n", + " -DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\n", + " -Quote the string values properly., prompt_variables: ['schema']\n", + " )\n", + " )\n", + ")\n", + "Teacher generator configured.\n", + "Configured demo optimizers\n", + "Backward engine configured for all generators.\n" + ] }, { - "cell_type": "markdown", - "source": [ - "Prepare data structures and prompt template" - ], - "metadata": { - "id": "4W3yEpRpepNK" - } + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "Loading Data: 100%|██████████| 144/144 [00:00<00:00, 9161.62it/s]\n", + "Predicting: step(0): 0.8264 across 144 samples, Max potential: 0.8264: 100%|██████████| 144/144 [00:19<00:00, 7.39it/s]\n" + ] }, { - "cell_type": "code", - "source": [ - "from dataclasses import dataclass, field\n", - "from typing import List, Dict, Union, Optional, Tuple, Any, Callable\n", - 
"from datasets import load_dataset\n", - "from adalflow.components.model_client import OpenAIClient\n", - "import adalflow as adal\n", - "from adalflow.core.component import Component\n", - "from adalflow.datasets.types import TrecData\n", - "from adalflow.datasets.trec import TrecDataset\n", - "\n", - "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n", - "\n", - "\n", - "_COARSE_LABELS = [\"ABBR\", \"DESC\", \"ENTY\", \"HUM\", \"LOC\", \"NUM\"]\n", - "\n", - "_COARSE_LABELS_DESC = [\n", - " \"Abbreviation: Questions about abbreviations and their meanings\",\n", - " \"Description: Questions seeking descriptions of people, things, or concepts\",\n", - " \"Entity: Questions about entities (e.g., animals, colors, inventions)\",\n", - " \"Human: Questions about people or organizations\",\n", - " \"Location: Questions about places, cities, countries\",\n", - " \"Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\",\n", - "]\n", - "\n", - "\n", - "template = r\"\"\"\n", - " {{system_prompt}}\n", - " {% if output_format_str is not none %}\n", - " {{output_format_str}}\n", - " {% endif %}\n", - " {% if few_shot_demos is not none %}\n", - " Here are some examples:\n", - " {{few_shot_demos}}\n", - " {% endif %}\n", - " \n", - " \n", - " {{input_str}}\n", - " \n", - " \"\"\"\n", - "\n", - "task_desc_template = r\"\"\"You are a classifier. Given a question, you need to classify it into one of the following classes:\n", - " Format: class_index. class_name, class_description\n", - " {% if classes %}\n", - " {% for class in classes %}\n", - " {{loop.index-1}}. 
{{class.label}}, {{class.desc}}\n", - " {% endfor %}\n", - " {% endif %}\n", - " - Do not try to answer the question:\n", - " \"\"\"\n", - "\n", - "\n", - "@dataclass\n", - "class TRECExtendedData(TrecData):\n", - " rationale: str = field(\n", - " metadata={\n", - " \"desc\": \"Your step-by-step reasoning to classify the question to class_name\"\n", - " },\n", - " default=None,\n", - " )\n", - " __input_fields__ = [\"question\"]\n", - " __output_fields__ = [\n", - " \"rationale\",\n", - " \"class_name\",\n", - " ] # it is important to have the rationale before the class_name\n", - "\n", - "def load_datasets():\n", - " \"\"\"Load the dataset\"\"\"\n", - " train_data = TrecDataset(split=\"train\")\n", - " val_data = TrecDataset(split=\"val\")\n", - " test_data = TrecDataset(split=\"test\")\n", - " return train_data, val_data, test_data # 0.694, 0.847" - ], - "metadata": { - "id": "ZZIEtZYHNVjo" - }, - "execution_count": null, - "outputs": [] + "output_type": "stream", + "name": "stdout", + "text": [ + "completed_samples: 144, len: 144\n", + "Initial validation score: 0.8263888888888888\n", + "Initial test score: None\n", + "Checkpoint path: /root/.adalflow/ckpt/TrecClassifierAdal\n", + "save to /root/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_a6e76_run_1.json\n" + ] }, { - "cell_type": "code", - "source": [ - "# prepare models\n", - "\n", - "from adalflow.components.model_client.openai_client import OpenAIClient\n", - "\n", - "# used as the target model\n", - "gpt_3_model = {\n", - " \"model_client\": OpenAIClient(),\n", - " \"model_kwargs\": {\n", - " \"model\": \"gpt-3.5-turbo\",\n", - " \"max_tokens\": 2000,\n", - " \"temperature\": 0.0,\n", - " \"top_p\": 0.99,\n", - " \"frequency_penalty\": 0,\n", - " \"presence_penalty\": 0,\n", - " \"stop\": None,\n", - " },\n", - "}\n", - "\n", - "# used as optimizer and backward engine\n", - "gpt_4o_mini_model = {\n", - " \"model_client\": OpenAIClient(),\n", - " \"model_kwargs\": {\n", - " \"model\": 
\"gpt-4o-mini\",\n", - " \"temperature\": 1,\n", - " \"top_p\": 0.99,\n", - " \"max_tokens\": 1000,\n", - " # \"frequency_penalty\": 1, # high for nto repeating prompt\n", - " },\n", - "}" - ], - "metadata": { - "id": "yAvzn7DZeUX-" - }, - "execution_count": null, - "outputs": [] + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "Training Step: 1: 0%| | 0/30 [00:00 Union[adal.GeneratorOutput, adal.Parameter]:\n", - " prompt_kwargs = self._prepare_input(question)\n", - " output = self.llm(prompt_kwargs=prompt_kwargs, id=id)\n", - " return output" - ], - "metadata": { - "id": "3Q3H9XC4Ncfi" - }, - "execution_count": null, - "outputs": [] + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "\n", + "Proposing: 0%| | 0/5 [00:00 float:\n", - " y_label = -1\n", - " if y_pred and y_pred.data is not None and y_pred.data.class_name is not None:\n", - " y_label = y_pred.data.class_name\n", - " return self.eval_fn, {\"y\": y_label, \"y_gt\": sample.class_name}\n", - "\n", - " def prepare_loss(\n", - " self, sample: TRECExtendedData, y_pred: adal.Parameter, *args, **kwargs\n", - " ) -> Tuple[Callable[..., Any], Dict]:\n", - " full_response = y_pred.full_response\n", - " y_label = -1\n", - " if (\n", - " full_response\n", - " and full_response.data is not None\n", - " and full_response.data.class_name is not None\n", - " ):\n", - " y_label = full_response.data.class_name\n", - "\n", - " y_pred.eval_input = y_label\n", - " y_gt = adal.Parameter(\n", - " name=\"y_gt\",\n", - " data=sample.class_name,\n", - " eval_input=sample.class_name,\n", - " requires_opt=False,\n", - " )\n", - " return self.loss_fn, {\"kwargs\": {\"y\": y_pred, \"y_gt\": y_gt}}" - ], - "metadata": { - "id": "HpkQYsh2NevT" - }, - "execution_count": null, - "outputs": [] + "output_type": "stream", + "name": "stderr", + "text": [ + "\n", + "\n", + "\n", + "Loading Data: 100%|██████████| 3/3 [00:00<00:00, 302.95it/s]\n", + "Predicting: step(0): 0.6667 across 3 samples, Max 
potential: 0.6667: 100%|██████████| 3/3 [00:00<00:00, 4.25it/s]\n", + "\n", + "\n", + "Proposing: 40%|████ | 2/5 [00:06<00:09, 3.24s/it]\u001b[A\u001b[A" + ] }, { - "cell_type": "code", - "source": [ - "def train(\n", - " model_client: adal.ModelClient,\n", - " model_kwargs: Dict,\n", - " train_batch_size=4,\n", - " raw_shots: int = 0,\n", - " bootstrap_shots: int = 1,\n", - " max_steps=12,\n", - " num_workers=4,\n", - " strategy=\"constrained\",\n", - " optimization_order=\"sequential\",\n", - " debug=False,\n", - "):\n", - " print(\"Starting training process...\")\n", - "\n", - " # Define the model configuration for all components\n", - " gpt_4o_model = {\n", - " \"model_client\": OpenAIClient(),\n", - " \"model_kwargs\": {\n", - " \"model\": \"gpt-4o-mini\",\n", - " \"temperature\": 1,\n", - " \"top_p\": 0.99,\n", - " \"max_tokens\": 1000,\n", - " # \"frequency_penalty\": 1, # high for nto repeating prompt\n", - " },\n", - " }\n", - "\n", - " print(f\"Component model configuration: {gpt_4o_model}\")\n", - "\n", - " try:\n", - " print(\"Initializing ADAL component...\")\n", - " adal_component = TrecClassifierAdal(\n", - " model_client=model_client,\n", - " model_kwargs=model_kwargs,\n", - " text_optimizer_model_config=gpt_4o_model,\n", - " backward_engine_model_config=gpt_4o_model,\n", - " teacher_model_config=gpt_4o_model,\n", - " )\n", - " print(\"ADAL component initialized successfully\")\n", - "\n", - " print(\"Initializing trainer...\")\n", - " trainer = adal.Trainer(\n", - " train_batch_size=train_batch_size,\n", - " adaltask=adal_component,\n", - " strategy=strategy,\n", - " max_steps=max_steps,\n", - " num_workers=num_workers,\n", - " raw_shots=raw_shots,\n", - " bootstrap_shots=bootstrap_shots,\n", - " debug=debug,\n", - " weighted_sampling=True,\n", - " optimization_order=optimization_order,\n", - " exclude_input_fields_from_bootstrap_demos=True,\n", - " )\n", - " print(\"Trainer initialized successfully\")\n", - "\n", - " print(\"Loading 
datasets...\")\n", - " train_dataset, val_dataset, test_dataset = load_datasets()\n", - " print(\n", - " f\"Datasets loaded - Train size: {len(train_dataset)}, Val size: {len(val_dataset)}, Test size: {len(test_dataset)}\"\n", - " )\n", - "\n", - " print(\"Starting model training...\")\n", - " trainer.fit(\n", - " train_dataset=train_dataset,\n", - " val_dataset=test_dataset,\n", - " debug=debug,\n", - " )\n", - " print(\"Training completed successfully\")\n", - "\n", - " except Exception as e:\n", - " print(f\"Error occurred: {str(e)}\")\n", - " raise" - ], - "metadata": { - "id": "PEj6xiZ5dVaj" - }, - "execution_count": null, - "outputs": [] + "output_type": "stream", + "name": "stdout", + "text": [ + "completed_samples: 3, len: 3\n", + "Fail subset check, try next proposal: 0.6666666666666666 <= 0.6666666666666666\n" + ] }, { - "cell_type": "code", - "source": [ - "train(**gpt_3_model)" - ], - "metadata": { - "id": "GnlZBQOMEj6E", - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "outputId": "055a95c4-ccae-4028-d904-86b839bc1c14" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Starting training process...\n", - "Component model configuration: {'model_client': OpenAIClient(), 'model_kwargs': {'model': 'gpt-4o-mini', 'temperature': 1, 'top_p': 0.99, 'max_tokens': 1000}}\n", - "Initializing ADAL component...\n", - "ADAL component initialized successfully\n", - "Initializing trainer...\n", - "Trainer initialized successfully\n", - "Loading datasets...\n", - "Datasets loaded - Train size: 120, Val size: 36, Test size: 144\n", - "Starting model training...\n", - "raw_shots: 0, bootstrap_shots: 1\n", - "Configuring teacher generator.\n", - "Configuring teacher generator for Generator(\n", - " model_kwargs={'model': 'gpt-4o-mini', 'temperature': 1, 'top_p': 0.99, 'max_tokens': 1000}, trainable_prompt_kwargs=[]\n", - " (prompt): Prompt(\n", - " template: \n", - " 
{{system_prompt}}\n", - " {% if output_format_str is not none %}\n", - " {{output_format_str}}\n", - " {% endif %}\n", - " {% if few_shot_demos is not none %}\n", - " Here are some examples:\n", - " {{few_shot_demos}}\n", - " {% endif %}\n", - " \n", - " \n", - " {{input_str}}\n", - " \n", - " , prompt_kwargs: {'system_prompt': 'You are a classifier. Given a question, you need to classify it into one of the following classes:\\n Format: class_index. class_name, class_description\\n 0. ABBR, Abbreviation: Questions about abbreviations and their meanings\\n 1. DESC, Description: Questions seeking descriptions of people, things, or concepts\\n 2. ENTY, Entity: Questions about entities (e.g., animals, colors, inventions)\\n 3. HUM, Human: Questions about people or organizations\\n 4. LOC, Location: Questions about places, cities, countries\\n 5. NUM, Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\\n - Do not try to answer the question:\\n ', 'output_format_str': 'Your output should be formatted as a standard YAML instance with the following schema:\\n```\\nrationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\\nclass_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)\\n```\\n-Make sure to always enclose the YAML output in triple backticks (```). 
Please do not add anything other than valid YAML output!\\n-Follow the YAML formatting conventions with an indent of 2 spaces.\\n-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\\n-Quote the string values properly.', 'few_shot_demos': 'None'}, prompt_variables: ['output_format_str', 'system_prompt', 'input_str', 'few_shot_demos']\n", - " )\n", - " (model_client): OpenAIClient()\n", - " (output_processors): DataClassParser(\n", - " data_class=TRECExtendedData, format_type=yaml, return_data_class=True, input_fields=['question'], output_fields=['rationale', 'class_name']\n", - " (_output_processor): YamlParser()\n", - " (output_format_prompt): Prompt(\n", - " template: Your output should be formatted as a standard YAML instance with the following schema:\n", - " ```\n", - " {{schema}}\n", - " ```\n", - " -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\n", - " -Follow the YAML formatting conventions with an indent of 2 spaces.\n", - " -DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\n", - " -Quote the string values properly., prompt_variables: ['schema']\n", - " )\n", - " )\n", - ")\n", - "Teacher generator set: Generator(\n", - " model_kwargs={'model': 'gpt-4o-mini', 'temperature': 1, 'top_p': 0.99, 'max_tokens': 1000}, trainable_prompt_kwargs=[]\n", - " (prompt): Prompt(\n", - " template: \n", - " {{system_prompt}}\n", - " {% if output_format_str is not none %}\n", - " {{output_format_str}}\n", - " {% endif %}\n", - " {% if few_shot_demos is not none %}\n", - " Here are some examples:\n", - " {{few_shot_demos}}\n", - " {% endif %}\n", - " \n", - " \n", - " {{input_str}}\n", - " \n", - " , prompt_kwargs: {'system_prompt': 'You are a classifier. Given a question, you need to classify it into one of the following classes:\\n Format: class_index. class_name, class_description\\n 0. 
ABBR, Abbreviation: Questions about abbreviations and their meanings\\n 1. DESC, Description: Questions seeking descriptions of people, things, or concepts\\n 2. ENTY, Entity: Questions about entities (e.g., animals, colors, inventions)\\n 3. HUM, Human: Questions about people or organizations\\n 4. LOC, Location: Questions about places, cities, countries\\n 5. NUM, Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\\n - Do not try to answer the question:\\n ', 'output_format_str': 'Your output should be formatted as a standard YAML instance with the following schema:\\n```\\nrationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\\nclass_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)\\n```\\n-Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\\n-Follow the YAML formatting conventions with an indent of 2 spaces.\\n-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\\n-Quote the string values properly.', 'few_shot_demos': 'None'}, prompt_variables: ['output_format_str', 'system_prompt', 'input_str', 'few_shot_demos']\n", - " )\n", - " (model_client): OpenAIClient()\n", - " (output_processors): DataClassParser(\n", - " data_class=TRECExtendedData, format_type=yaml, return_data_class=True, input_fields=['question'], output_fields=['rationale', 'class_name']\n", - " (_output_processor): YamlParser()\n", - " (output_format_prompt): Prompt(\n", - " template: Your output should be formatted as a standard YAML instance with the following schema:\n", - " ```\n", - " {{schema}}\n", - " ```\n", - " -Make sure to always enclose the YAML output in triple backticks (```). 
Please do not add anything other than valid YAML output!\n", - " -Follow the YAML formatting conventions with an indent of 2 spaces.\n", - " -DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\n", - " -Quote the string values properly., prompt_variables: ['schema']\n", - " )\n", - " )\n", - "), teacher Generator(\n", - " model_kwargs={'model': 'gpt-4o-mini', 'temperature': 1, 'top_p': 0.99, 'max_tokens': 1000}, trainable_prompt_kwargs=[]\n", - " (prompt): Prompt(\n", - " template: \n", - " {{system_prompt}}\n", - " {% if output_format_str is not none %}\n", - " {{output_format_str}}\n", - " {% endif %}\n", - " {% if few_shot_demos is not none %}\n", - " Here are some examples:\n", - " {{few_shot_demos}}\n", - " {% endif %}\n", - " \n", - " \n", - " {{input_str}}\n", - " \n", - " , prompt_kwargs: {'system_prompt': 'You are a classifier. Given a question, you need to classify it into one of the following classes:\\n Format: class_index. class_name, class_description\\n 0. ABBR, Abbreviation: Questions about abbreviations and their meanings\\n 1. DESC, Description: Questions seeking descriptions of people, things, or concepts\\n 2. ENTY, Entity: Questions about entities (e.g., animals, colors, inventions)\\n 3. HUM, Human: Questions about people or organizations\\n 4. LOC, Location: Questions about places, cities, countries\\n 5. NUM, Numeric: Questions seeking numeric answers (e.g., dates, amounts, distances)\\n - Do not try to answer the question:\\n ', 'output_format_str': 'Your output should be formatted as a standard YAML instance with the following schema:\\n```\\nrationale: Your step-by-step reasoning to classify the question to class_name (str) (optional)\\nclass_name: One of {ABBR, ENTY, DESC, HUM, LOC, NUM} (str) (optional)\\n```\\n-Make sure to always enclose the YAML output in triple backticks (```). 
Please do not add anything other than valid YAML output!\\n-Follow the YAML formatting conventions with an indent of 2 spaces.\\n-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\\n-Quote the string values properly.', 'few_shot_demos': 'None'}, prompt_variables: ['output_format_str', 'system_prompt', 'input_str', 'few_shot_demos']\n", - " )\n", - " (model_client): OpenAIClient()\n", - " (output_processors): DataClassParser(\n", - " data_class=TRECExtendedData, format_type=yaml, return_data_class=True, input_fields=['question'], output_fields=['rationale', 'class_name']\n", - " (_output_processor): YamlParser()\n", - " (output_format_prompt): Prompt(\n", - " template: Your output should be formatted as a standard YAML instance with the following schema:\n", - " ```\n", - " {{schema}}\n", - " ```\n", - " -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!\n", - " -Follow the YAML formatting conventions with an indent of 2 spaces.\n", - " -DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the YAML output.\n", - " -Quote the string values properly., prompt_variables: ['schema']\n", - " )\n", - " )\n", - ")\n", - "Teacher generator configured.\n", - "Configured demo optimizers\n", - "Backward engine configured for all generators.\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "\n", - "Loading Data: 100%|██████████| 144/144 [00:00<00:00, 9161.62it/s]\n", - "Predicting: step(0): 0.8264 across 144 samples, Max potential: 0.8264: 100%|██████████| 144/144 [00:19<00:00, 7.39it/s]\n" - ] - }, - { - "output_type": "stream", - "name": "stdout", - "text": [ - "completed_samples: 144, len: 144\n", - "Initial validation score: 0.8263888888888888\n", - "Initial test score: None\n", - "Checkpoint path: /root/.adalflow/ckpt/TrecClassifierAdal\n", - "save to 
/root/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_a6e76_run_1.json\n" - ] - }, - { - "output_type": "stream", - "name": "stderr", - "text": [ - "\n", - "Training Step: 1: 0%| | 0/30 [00:00\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mgpt_3_model\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m\u001b[0m in \u001b[0;36mtrain\u001b[0;34m(model_client, model_kwargs, train_batch_size, raw_shots, bootstrap_shots, max_steps, num_workers, strategy, optimization_order, debug)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Starting model training...\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m trainer.fit(\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0mtrain_dataset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrain_dataset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0mval_dataset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtest_dataset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/optim/trainer/trainer.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, adaltask, train_loader, train_dataset, val_dataset, test_dataset, debug, save_traces, raw_shots, bootstrap_shots, resume_from_ckpt)\u001b[0m\n\u001b[1;32m 477\u001b[0m \u001b[0mstarting_step\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_steps\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 478\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrategy\u001b[0m \u001b[0;34m==\u001b[0m 
\u001b[0;34m\"constrained\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 479\u001b[0;31m trainer_results = self._fit_text_grad_constraint(\n\u001b[0m\u001b[1;32m 480\u001b[0m \u001b[0mtrain_loader\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 481\u001b[0m \u001b[0mval_dataset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/optim/trainer/trainer.py\u001b[0m in \u001b[0;36m_fit_text_grad_constraint\u001b[0;34m(self, train_loader, val_dataset, test_dataset, trainer_results, starting_step)\u001b[0m\n\u001b[1;32m 1779\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1780\u001b[0m all_samples, all_losses, all_y_preds = (\n\u001b[0;32m-> 1781\u001b[0;31m self._text_grad_constraint_propose_step(\n\u001b[0m\u001b[1;32m 1782\u001b[0m \u001b[0msteps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1783\u001b[0m \u001b[0mall_samples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mall_samples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/optim/trainer/trainer.py\u001b[0m in \u001b[0;36m_text_grad_constraint_propose_step\u001b[0;34m(self, steps, all_samples, all_losses, all_y_preds, include_demo_optimizers)\u001b[0m\n\u001b[1;32m 1657\u001b[0m \u001b[0;31m# print(f\"Proposing step: {i}\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1658\u001b[0m \u001b[0;31m# self.optimizer.propose()\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1659\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_propose_text_optimizers\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# new 
prompts\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1660\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minclude_demo_optimizers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1661\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_demo_optimizers_propose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/optim/trainer/trainer.py\u001b[0m in \u001b[0;36m_propose_text_optimizers\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 857\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_propose_text_optimizers\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 858\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtext_optimizer\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext_optimizers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 859\u001b[0;31m \u001b[0mtext_optimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpropose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 860\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 861\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_get_trainable_text_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/optim/text_grad/tgd_optimizer.py\u001b[0m in \u001b[0;36mpropose\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 323\u001b[0m }\n\u001b[1;32m 324\u001b[0m \u001b[0;31m# turn off cache\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 325\u001b[0;31m response = 
self.llm_optimizer.call(\n\u001b[0m\u001b[1;32m 326\u001b[0m \u001b[0mprompt_kwargs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprompt_kwargs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cache\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mnot\u001b[0m \u001b[0mno_cache\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 327\u001b[0m )\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/core/generator.py\u001b[0m in \u001b[0;36mcall\u001b[0;34m(self, prompt_kwargs, model_kwargs, use_cache, id)\u001b[0m\n\u001b[1;32m 771\u001b[0m \u001b[0;31m# call the model client\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 772\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 773\u001b[0;31m \u001b[0mcompletion\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 774\u001b[0m \u001b[0muse_cache\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0muse_cache\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0muse_cache\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_use_cache\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 775\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/core/generator.py\u001b[0m in \u001b[0;36m_model_client_call\u001b[0;34m(self, api_kwargs, use_cache)\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0mcached_completion\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_cache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex_content\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 347\u001b[0;31m \u001b[0;32mif\u001b[0m 
\u001b[0mcached_completion\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 348\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcached_completion\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/backoff/_sync.py\u001b[0m in \u001b[0;36mretry\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 106\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mexception\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0mmax_tries_exceeded\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mtries\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mmax_tries_value\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/components/model_client/openai_client.py\u001b[0m in \u001b[0;36mcall\u001b[0;34m(self, api_kwargs, model_type)\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchat_completion_parser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhandle_streaming_response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msync_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompletions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mapi_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 287\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msync_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompletions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mapi_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 288\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"model_type {model_type} is not supported\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/openai/_utils/_utils.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf\"Missing required argument: {quote(missing[0])}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 275\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 276\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 277\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m \u001b[0;31m# type: ignore\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/openai/resources/chat/completions.py\u001b[0m in \u001b[0;36mcreate\u001b[0;34m(self, messages, model, audio, frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, extra_headers, extra_query, extra_body, timeout)\u001b[0m\n\u001b[1;32m 827\u001b[0m ) -> ChatCompletion | Stream[ChatCompletionChunk]:\n\u001b[1;32m 828\u001b[0m \u001b[0mvalidate_response_format\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse_format\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 829\u001b[0;31m return self._post(\n\u001b[0m\u001b[1;32m 830\u001b[0m \u001b[0;34m\"/chat/completions\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 831\u001b[0m body=maybe_transform(\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/openai/_base_client.py\u001b[0m in \u001b[0;36mpost\u001b[0;34m(self, path, cast_to, body, options, files, stream, stream_cls)\u001b[0m\n\u001b[1;32m 1276\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"post\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjson_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mfiles\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mto_httpx_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1277\u001b[0m )\n\u001b[0;32m-> 1278\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mResponseT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcast_to\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mopts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstream_cls\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstream_cls\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1279\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1280\u001b[0m def patch(\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/openai/_base_client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, cast_to, options, remaining_retries, stream, stream_cls)\u001b[0m\n\u001b[1;32m 953\u001b[0m \u001b[0mretries_taken\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 954\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 955\u001b[0;31m return self._request(\n\u001b[0m\u001b[1;32m 956\u001b[0m \u001b[0mcast_to\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcast_to\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 957\u001b[0m \u001b[0moptions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/openai/_base_client.py\u001b[0m in \u001b[0;36m_request\u001b[0;34m(self, cast_to, options, retries_taken, 
stream, stream_cls)\u001b[0m\n\u001b[1;32m 989\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 990\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 991\u001b[0;31m response = self._client.send(\n\u001b[0m\u001b[1;32m 992\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 993\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstream\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_should_stream_response_body\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpx/_client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, auth, follow_redirects)\u001b[0m\n\u001b[1;32m 899\u001b[0m \u001b[0mauth\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_build_request_auth\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mauth\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 900\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 901\u001b[0;31m response = self._send_handling_auth(\n\u001b[0m\u001b[1;32m 902\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 903\u001b[0m \u001b[0mauth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mauth\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpx/_client.py\u001b[0m in \u001b[0;36m_send_handling_auth\u001b[0;34m(self, request, auth, follow_redirects, history)\u001b[0m\n\u001b[1;32m 927\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
928\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 929\u001b[0;31m response = self._send_handling_redirects(\n\u001b[0m\u001b[1;32m 930\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 931\u001b[0m \u001b[0mfollow_redirects\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfollow_redirects\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpx/_client.py\u001b[0m in \u001b[0;36m_send_handling_redirects\u001b[0;34m(self, request, follow_redirects, history)\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 965\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 966\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_single_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 967\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 968\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mhook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_event_hooks\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"response\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpx/_client.py\u001b[0m in \u001b[0;36m_send_single_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 1000\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1001\u001b[0m \u001b[0;32mwith\u001b[0m 
\u001b[0mrequest_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1002\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtransport\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1003\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1004\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSyncByteStream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpx/_transports/default.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 216\u001b[0m )\n\u001b[1;32m 217\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mmap_httpcore_exceptions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 218\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pool\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 219\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mtyping\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIterable\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/connection_pool.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 260\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mShieldCancellation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresponse_closed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 262\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 263\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 264\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/connection_pool.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mConnectionNotAvailable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;31m# The 
ConnectionNotAvailable exception is a special case, that\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/connection.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mConnectionNotAvailable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_connection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_connect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mRequest\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mNetworkStream\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/http11.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mTrace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"response_closed\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtrace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_response_closed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 121\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 122\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;31m# Sending the request...\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/http11.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0mreason_phrase\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 99\u001b[0;31m ) = self._receive_response_headers(**kwargs)\n\u001b[0m\u001b[1;32m 100\u001b[0m trace.return_value = (\n\u001b[1;32m 101\u001b[0m \u001b[0mhttp_version\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/http11.py\u001b[0m in \u001b[0;36m_receive_response_headers\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 164\u001b[0;31m \u001b[0mevent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_receive_event\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 165\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mh11\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mResponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/http11.py\u001b[0m in \u001b[0;36m_receive_event\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 199\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mevent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mh11\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNEED_DATA\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 200\u001b[0;31m data = self._network_stream.read(\n\u001b[0m\u001b[1;32m 201\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mREAD_NUM_BYTES\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 202\u001b[0m )\n", - "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_backends/sync.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, max_bytes, timeout)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mmap_exceptions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc_map\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msettimeout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmax_bytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbytes\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtyping\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/lib/python3.10/ssl.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self, buflen, flags)\u001b[0m\n\u001b[1;32m 1286\u001b[0m \u001b[0;34m\"non-zero flags not allowed in calls to recv() on %s\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1287\u001b[0m self.__class__)\n\u001b[0;32m-> 1288\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuflen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1289\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1290\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuflen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m/usr/lib/python3.10/ssl.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1159\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1160\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1161\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1162\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mSSLError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1163\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mSSL_ERROR_EOF\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msuppress_ragged_eofs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ] + "output_type": "stream", + "name": "stderr", + "text": [ + "Proposing: 40%|████ | 2/5 [00:07<00:10, 3.55s/it]\n", + "Training Step: 1: 0%| | 0/30 [00:18\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtrain\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mgpt_3_model\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in 
\u001b[0;36mtrain\u001b[0;34m(model_client, model_kwargs, train_batch_size, raw_shots, bootstrap_shots, max_steps, num_workers, strategy, optimization_order, debug)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Starting model training...\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m trainer.fit(\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0mtrain_dataset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrain_dataset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0mval_dataset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtest_dataset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/optim/trainer/trainer.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, adaltask, train_loader, train_dataset, val_dataset, test_dataset, debug, save_traces, raw_shots, bootstrap_shots, resume_from_ckpt)\u001b[0m\n\u001b[1;32m 477\u001b[0m \u001b[0mstarting_step\u001b[0m \u001b[0;34m+=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_steps\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 478\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrategy\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"constrained\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 479\u001b[0;31m trainer_results = self._fit_text_grad_constraint(\n\u001b[0m\u001b[1;32m 480\u001b[0m \u001b[0mtrain_loader\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 481\u001b[0m \u001b[0mval_dataset\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/optim/trainer/trainer.py\u001b[0m in \u001b[0;36m_fit_text_grad_constraint\u001b[0;34m(self, train_loader, val_dataset, test_dataset, trainer_results, starting_step)\u001b[0m\n\u001b[1;32m 1779\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1780\u001b[0m all_samples, all_losses, all_y_preds = (\n\u001b[0;32m-> 1781\u001b[0;31m self._text_grad_constraint_propose_step(\n\u001b[0m\u001b[1;32m 1782\u001b[0m \u001b[0msteps\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msteps\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1783\u001b[0m \u001b[0mall_samples\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mall_samples\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/optim/trainer/trainer.py\u001b[0m in \u001b[0;36m_text_grad_constraint_propose_step\u001b[0;34m(self, steps, all_samples, all_losses, all_y_preds, include_demo_optimizers)\u001b[0m\n\u001b[1;32m 1657\u001b[0m \u001b[0;31m# print(f\"Proposing step: {i}\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1658\u001b[0m \u001b[0;31m# self.optimizer.propose()\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1659\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_propose_text_optimizers\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# new prompts\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1660\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minclude_demo_optimizers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1661\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_demo_optimizers_propose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + 
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/optim/trainer/trainer.py\u001b[0m in \u001b[0;36m_propose_text_optimizers\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 857\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_propose_text_optimizers\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 858\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mtext_optimizer\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtext_optimizers\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 859\u001b[0;31m \u001b[0mtext_optimizer\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpropose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 860\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 861\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_get_trainable_text_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/optim/text_grad/tgd_optimizer.py\u001b[0m in \u001b[0;36mpropose\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 323\u001b[0m }\n\u001b[1;32m 324\u001b[0m \u001b[0;31m# turn off cache\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 325\u001b[0;31m response = self.llm_optimizer.call(\n\u001b[0m\u001b[1;32m 326\u001b[0m \u001b[0mprompt_kwargs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mprompt_kwargs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0muse_cache\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mnot\u001b[0m \u001b[0mno_cache\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 327\u001b[0m )\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/core/generator.py\u001b[0m in \u001b[0;36mcall\u001b[0;34m(self, 
prompt_kwargs, model_kwargs, use_cache, id)\u001b[0m\n\u001b[1;32m 771\u001b[0m \u001b[0;31m# call the model client\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 772\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 773\u001b[0;31m \u001b[0mcompletion\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 774\u001b[0m \u001b[0muse_cache\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0muse_cache\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0muse_cache\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_use_cache\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 775\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/core/generator.py\u001b[0m in \u001b[0;36m_model_client_call\u001b[0;34m(self, api_kwargs, use_cache)\u001b[0m\n\u001b[1;32m 345\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 346\u001b[0m \u001b[0mcached_completion\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_cache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex_content\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 347\u001b[0;31m \u001b[0;32mif\u001b[0m \u001b[0mcached_completion\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 348\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mcached_completion\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 349\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/backoff/_sync.py\u001b[0m in 
\u001b[0;36mretry\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 103\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 104\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 105\u001b[0;31m \u001b[0mret\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtarget\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 106\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mexception\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0mmax_tries_exceeded\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mtries\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mmax_tries_value\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/adalflow/components/model_client/openai_client.py\u001b[0m in \u001b[0;36mcall\u001b[0;34m(self, api_kwargs, model_type)\u001b[0m\n\u001b[1;32m 285\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchat_completion_parser\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mhandle_streaming_response\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 286\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msync_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompletions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mapi_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 287\u001b[0;31m \u001b[0;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msync_client\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchat\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcompletions\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcreate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mapi_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 288\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"model_type {model_type} is not supported\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/openai/_utils/_utils.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0mmsg\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34mf\"Missing required argument: {quote(missing[0])}\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmsg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 275\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 276\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 277\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m \u001b[0;31m# type: ignore\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/openai/resources/chat/completions.py\u001b[0m in \u001b[0;36mcreate\u001b[0;34m(self, messages, model, audio, 
frequency_penalty, function_call, functions, logit_bias, logprobs, max_completion_tokens, max_tokens, metadata, modalities, n, parallel_tool_calls, prediction, presence_penalty, response_format, seed, service_tier, stop, store, stream, stream_options, temperature, tool_choice, tools, top_logprobs, top_p, user, extra_headers, extra_query, extra_body, timeout)\u001b[0m\n\u001b[1;32m 827\u001b[0m ) -> ChatCompletion | Stream[ChatCompletionChunk]:\n\u001b[1;32m 828\u001b[0m \u001b[0mvalidate_response_format\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse_format\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 829\u001b[0;31m return self._post(\n\u001b[0m\u001b[1;32m 830\u001b[0m \u001b[0;34m\"/chat/completions\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 831\u001b[0m body=maybe_transform(\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/openai/_base_client.py\u001b[0m in \u001b[0;36mpost\u001b[0;34m(self, path, cast_to, body, options, files, stream, stream_cls)\u001b[0m\n\u001b[1;32m 1276\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"post\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0murl\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mjson_data\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbody\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfiles\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mto_httpx_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiles\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1277\u001b[0m )\n\u001b[0;32m-> 1278\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mResponseT\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcast_to\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mopts\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstream_cls\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstream_cls\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1279\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1280\u001b[0m def patch(\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/openai/_base_client.py\u001b[0m in \u001b[0;36mrequest\u001b[0;34m(self, cast_to, options, remaining_retries, stream, stream_cls)\u001b[0m\n\u001b[1;32m 953\u001b[0m \u001b[0mretries_taken\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 954\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 955\u001b[0;31m return self._request(\n\u001b[0m\u001b[1;32m 956\u001b[0m \u001b[0mcast_to\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcast_to\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 957\u001b[0m \u001b[0moptions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moptions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/openai/_base_client.py\u001b[0m in \u001b[0;36m_request\u001b[0;34m(self, cast_to, options, retries_taken, stream, stream_cls)\u001b[0m\n\u001b[1;32m 989\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 990\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 991\u001b[0;31m response = self._client.send(\n\u001b[0m\u001b[1;32m 992\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 993\u001b[0m \u001b[0mstream\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstream\u001b[0m \u001b[0;32mor\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_should_stream_response_body\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpx/_client.py\u001b[0m in \u001b[0;36msend\u001b[0;34m(self, request, stream, auth, follow_redirects)\u001b[0m\n\u001b[1;32m 899\u001b[0m \u001b[0mauth\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_build_request_auth\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mauth\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 900\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 901\u001b[0;31m response = self._send_handling_auth(\n\u001b[0m\u001b[1;32m 902\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 903\u001b[0m \u001b[0mauth\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mauth\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpx/_client.py\u001b[0m in \u001b[0;36m_send_handling_auth\u001b[0;34m(self, request, auth, follow_redirects, history)\u001b[0m\n\u001b[1;32m 927\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 928\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 929\u001b[0;31m response = self._send_handling_redirects(\n\u001b[0m\u001b[1;32m 930\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 931\u001b[0m 
\u001b[0mfollow_redirects\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfollow_redirects\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpx/_client.py\u001b[0m in \u001b[0;36m_send_handling_redirects\u001b[0;34m(self, request, follow_redirects, history)\u001b[0m\n\u001b[1;32m 964\u001b[0m \u001b[0mhook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 965\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 966\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_send_single_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 967\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 968\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mhook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_event_hooks\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"response\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpx/_client.py\u001b[0m in \u001b[0;36m_send_single_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 1000\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1001\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mrequest_context\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1002\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mtransport\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1003\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1004\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mSyncByteStream\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpx/_transports/default.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 216\u001b[0m )\n\u001b[1;32m 217\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mmap_httpcore_exceptions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 218\u001b[0;31m \u001b[0mresp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_pool\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreq\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 219\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 220\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstream\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtyping\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mIterable\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/connection_pool.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 260\u001b[0m \u001b[0;32mwith\u001b[0m 
\u001b[0mShieldCancellation\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresponse_closed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 262\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 263\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 264\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/connection_pool.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0mresponse\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconnection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mConnectionNotAvailable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;31m# The ConnectionNotAvailable exception is a special case, that\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/connection.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 94\u001b[0m \u001b[0;32mraise\u001b[0m 
\u001b[0mConnectionNotAvailable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 95\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_connection\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mhandle_request\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 97\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_connect\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mRequest\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mNetworkStream\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/http11.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 119\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mTrace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"response_closed\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mtrace\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_response_closed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 121\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 122\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 123\u001b[0m \u001b[0;31m# Sending the 
request...\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/http11.py\u001b[0m in \u001b[0;36mhandle_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[0mreason_phrase\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[0mheaders\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 99\u001b[0;31m ) = self._receive_response_headers(**kwargs)\n\u001b[0m\u001b[1;32m 100\u001b[0m trace.return_value = (\n\u001b[1;32m 101\u001b[0m \u001b[0mhttp_version\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/http11.py\u001b[0m in \u001b[0;36m_receive_response_headers\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 163\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 164\u001b[0;31m \u001b[0mevent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_receive_event\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 165\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mevent\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mh11\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mResponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 166\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_sync/http11.py\u001b[0m in 
\u001b[0;36m_receive_event\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 199\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mevent\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mh11\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNEED_DATA\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 200\u001b[0;31m data = self._network_stream.read(\n\u001b[0m\u001b[1;32m 201\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mREAD_NUM_BYTES\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 202\u001b[0m )\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/httpcore/_backends/sync.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, max_bytes, timeout)\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mmap_exceptions\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mexc_map\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 27\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msettimeout\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 28\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sock\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmax_bytes\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 29\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbytes\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mtimeout\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtyping\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mOptional\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib/python3.10/ssl.py\u001b[0m in \u001b[0;36mrecv\u001b[0;34m(self, buflen, flags)\u001b[0m\n\u001b[1;32m 1286\u001b[0m \u001b[0;34m\"non-zero flags not allowed in calls to recv() on %s\"\u001b[0m \u001b[0;34m%\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1287\u001b[0m self.__class__)\n\u001b[0;32m-> 1288\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuflen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1289\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1290\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrecv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbuflen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mflags\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/lib/python3.10/ssl.py\u001b[0m in \u001b[0;36mread\u001b[0;34m(self, len, buffer)\u001b[0m\n\u001b[1;32m 1159\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffer\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1160\u001b[0m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1161\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1162\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mSSLError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1163\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mSSL_ERROR_EOF\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msuppress_ragged_eofs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] } - ] + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Issues and feedback\n", + "\n", + "If you encounter any issues, please report them here: [GitHub Issues](https://github.com/SylphAI-Inc/LightRAG/issues).\n", + "\n", + "For feedback, you can use either the [GitHub discussions](https://github.com/SylphAI-Inc/LightRAG/discussions) or [Discord](https://discord.gg/ezzszrRZvT)." 
+ ], + "metadata": { + "id": "AmkbyxmuruUu" + } + } + ] } diff --git a/notebooks/tutorials/adalflow_embedder.ipynb b/notebooks/tutorials/adalflow_embedder.ipynb index 8150bf07..bdd20ba3 100644 --- a/notebooks/tutorials/adalflow_embedder.ipynb +++ b/notebooks/tutorials/adalflow_embedder.ipynb @@ -228,8 +228,9 @@ "from adalflow.core.component import Component\n", "from copy import deepcopy\n", "\n", + "\n", "class DecreaseEmbeddingDim(Component):\n", - " def __init__(self, old_dim: int, new_dim: int, normalize: bool = True):\n", + " def __init__(self, old_dim: int, new_dim: int, normalize: bool = True):\n", " super().__init__()\n", " self.old_dim = old_dim\n", " self.new_dim = new_dim\n", @@ -258,10 +259,10 @@ "outputs": [], "source": [ "local_embedder_256 = Embedder(\n", - " model_client=TransformersClient(),\n", - " model_kwargs=model_kwargs,\n", - " output_processors=DecreaseEmbeddingDim(768, 256),\n", - " )\n", + " model_client=TransformersClient(),\n", + " model_kwargs=model_kwargs,\n", + " output_processors=DecreaseEmbeddingDim(768, 256),\n", + ")\n", "print(local_embedder_256)" ] }, diff --git a/notebooks/tutorials/adalflow_modelclient.ipynb b/notebooks/tutorials/adalflow_modelclient.ipynb index f1c89bee..05ff5f3d 100644 --- a/notebooks/tutorials/adalflow_modelclient.ipynb +++ b/notebooks/tutorials/adalflow_modelclient.ipynb @@ -2059,85 +2059,93 @@ "def analyze_single_image():\n", " \"\"\"Example of analyzing a single image with GPT-4 Vision\"\"\"\n", " client = OpenAIClient()\n", - " \n", + "\n", " gen = Generator(\n", " model_client=client,\n", " model_kwargs={\n", " \"model\": \"gpt-4o-mini\",\n", " \"images\": \"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/images/happy_cat.jpg\",\n", - " \"max_tokens\": 300\n", - " }\n", + " \"max_tokens\": 300,\n", + " },\n", + " )\n", + "\n", + " response = gen(\n", + " {\"input_str\": \"What do you see in this image? 
Be detailed but concise.\"}\n", " )\n", - " \n", - " response = gen({\"input_str\": \"What do you see in this image? Be detailed but concise.\"})\n", " print(\"\\n=== Single Image Analysis ===\")\n", " print(f\"Description: {response.raw_response}\")\n", "\n", + "\n", "def analyze_multiple_images():\n", " \"\"\"Example of analyzing multiple images in one prompt\"\"\"\n", " client = OpenAIClient()\n", - " \n", + "\n", " # List of images to analyze together\n", " images = [\n", " \"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/images/happy_cat.jpg\",\n", - " \"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/images/sad_cat.jpg\"\n", + " \"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/images/sad_cat.jpg\",\n", " ]\n", - " \n", + "\n", " gen = Generator(\n", " model_client=client,\n", - " model_kwargs={\n", - " \"model\": \"gpt-4o-mini\",\n", - " \"images\": images,\n", - " \"max_tokens\": 300\n", + " model_kwargs={\"model\": \"gpt-4o-mini\", \"images\": images, \"max_tokens\": 300},\n", + " )\n", + "\n", + " response = gen(\n", + " {\n", + " \"input_str\": \"Compare and contrast these two images. What are the main differences?\"\n", " }\n", " )\n", - " \n", - " response = gen({\"input_str\": \"Compare and contrast these two images. 
What are the main differences?\"})\n", " print(\"\\n=== Multiple Images Analysis ===\")\n", " print(f\"Comparison: {response.raw_response}\")\n", "\n", + "\n", "def generate_art_with_dalle():\n", " \"\"\"Example of generating art using DALL-E 3\"\"\"\n", " client = OpenAIClient()\n", - " \n", + "\n", " gen = Generator(\n", " model_client=client,\n", " model_kwargs={\n", " \"model\": \"dall-e-3\",\n", " \"size\": \"1024x1024\",\n", " \"quality\": \"standard\",\n", - " \"n\": 1\n", + " \"n\": 1,\n", + " },\n", + " )\n", + "\n", + " response = gen(\n", + " {\n", + " \"input_str\": \"A serene Japanese garden with a small bridge over a koi pond, cherry blossoms falling gently in the breeze\"\n", " }\n", " )\n", - " \n", - " response = gen({\n", - " \"input_str\": \"A serene Japanese garden with a small bridge over a koi pond, cherry blossoms falling gently in the breeze\"\n", - " })\n", " print(\"\\n=== Art Generation with DALL-E 3 ===\")\n", " print(f\"Generated Image URL: {response.data}\")\n", "\n", + "\n", "def create_image_variations(image_path=\"path/to/your/image.jpg\"):\n", " \"\"\"Example of creating variations of an existing image\"\"\"\n", " client = OpenAIClient()\n", - " \n", + "\n", " gen = Generator(\n", " model_client=client,\n", " model_kwargs={\n", " \"model\": \"dall-e-2\",\n", " \"image\": image_path,\n", " \"n\": 2, # Generate 2 variations\n", - " \"size\": \"1024x1024\"\n", - " }\n", + " \"size\": \"1024x1024\",\n", + " },\n", " )\n", - " \n", + "\n", " response = gen({\"input_str\": \"\"})\n", " print(\"\\n=== Image Variations ===\")\n", " print(f\"Variation URLs: {response.data}\")\n", "\n", + "\n", "def edit_image_with_mask(image_path=\"path/to/image.jpg\", mask_path=\"path/to/mask.jpg\"):\n", " \"\"\"Example of editing specific parts of an image using a mask\"\"\"\n", " client = OpenAIClient()\n", - " \n", + "\n", " gen = Generator(\n", " model_client=client,\n", " model_kwargs={\n", @@ -2145,40 +2153,37 @@ " \"image\": image_path,\n", " 
\"mask\": mask_path,\n", " \"n\": 1,\n", - " \"size\": \"1024x1024\"\n", - " }\n", + " \"size\": \"1024x1024\",\n", + " },\n", " )\n", - " \n", - " response = gen({\n", - " \"input_str\": \"Replace the masked area with a beautiful sunset\"\n", - " })\n", + "\n", + " response = gen({\"input_str\": \"Replace the masked area with a beautiful sunset\"})\n", " print(\"\\n=== Image Editing ===\")\n", " print(f\"Edited Image URL: {response.data}\")\n", "\n", + "\n", "def mixed_image_text_conversation():\n", " \"\"\"Example of having a conversation that includes both images and text\"\"\"\n", " client = OpenAIClient()\n", - " \n", + "\n", " gen = Generator(\n", " model_client=client,\n", " model_kwargs={\n", " \"model\": \"gpt-4o-mini\",\n", " \"images\": [\n", " \"https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/images/happy_cat.jpg\",\n", - " \"https://path/to/local/image.jpg\" # Replace with your local image path\n", + " \"https://path/to/local/image.jpg\", # Replace with your local image path\n", " ],\n", - " \"max_tokens\": 300\n", - " }\n", + " \"max_tokens\": 300,\n", + " },\n", " )\n", - " \n", + "\n", " conversation = \"\"\"You are a helpful assistant skilled in analyzing images and providing detailed descriptions.\n", " I'm showing you two images. 
Please analyze them and tell me what emotions they convey.\"\"\"\n", - " \n", + "\n", " response = gen({\"input_str\": conversation})\n", " print(\"\\n=== Mixed Image-Text Conversation ===\")\n", - " print(f\"Assistant's Analysis: {response.raw_response}\")\n", - "\n", - "\n" + " print(f\"Assistant's Analysis: {response.raw_response}\")" ] }, { @@ -2189,16 +2194,16 @@ "source": [ "if __name__ == \"__main__\":\n", " print(\"OpenAI Image Processing Examples\\n\")\n", - " \n", + "\n", " # Basic image analysis\n", " analyze_single_image()\n", - " \n", + "\n", " # Multiple image analysis\n", " analyze_multiple_images()\n", - " \n", + "\n", " # Image generation\n", " generate_art_with_dalle()\n", - " \n", + "\n", " # create_image_variations()\n", " # edit_image_with_mask(, )\n", " # mixed_image_text_conversation()" @@ -2230,23 +2235,27 @@ "source": [ "class ImageGenerator(Generator):\n", " \"\"\"Generator subclass for image generation.\"\"\"\n", + "\n", " model_type = ModelType.IMAGE_GENERATION\n", "\n", + "\n", "def test_vision_and_generation():\n", " \"\"\"Test both vision analysis and image generation\"\"\"\n", " client = OpenAIClient()\n", - " \n", + "\n", " # 1. Test Vision Analysis\n", " vision_gen = Generator(\n", " model_client=client,\n", " model_kwargs={\n", " \"model\": \"gpt-4o-mini\",\n", " \"images\": \"https://upload.wikimedia.org/wikipedia/en/7/7d/Lenna_%28test_image%29.png\",\n", - " \"max_tokens\": 300\n", - " }\n", + " \"max_tokens\": 300,\n", + " },\n", + " )\n", + "\n", + " vision_response = vision_gen(\n", + " {\"input_str\": \"What do you see in this image? Be detailed but concise.\"}\n", " )\n", - " \n", - " vision_response = vision_gen({\"input_str\": \"What do you see in this image? 
Be detailed but concise.\"})\n", " print(\"\\n=== Vision Analysis ===\")\n", " print(f\"Description: {vision_response.raw_response}\")\n", "\n", @@ -2257,12 +2266,14 @@ " \"model\": \"dall-e-3\",\n", " \"size\": \"1024x1024\",\n", " \"quality\": \"standard\",\n", - " \"n\": 1\n", - " }\n", + " \"n\": 1,\n", + " },\n", " )\n", - " \n", + "\n", " # For image generation, input_str becomes the prompt\n", - " response = dalle_gen({\"input_str\": \"A happy siamese cat playing with a red ball of yarn\"})\n", + " response = dalle_gen(\n", + " {\"input_str\": \"A happy siamese cat playing with a red ball of yarn\"}\n", + " )\n", " print(\"\\n=== DALL-E Generation ===\")\n", " print(f\"Generated Image URL: {response.data}\")" ] @@ -2288,10 +2299,10 @@ " model_kwargs={\n", " \"model\": \"gpt-4o-mini\",\n", " \"images\": \"https://invalid.url/nonexistent.jpg\",\n", - " \"max_tokens\": 300\n", - " }\n", + " \"max_tokens\": 300,\n", + " },\n", " )\n", - " \n", + "\n", " print(\"\\n=== Testing Invalid Image URL ===\")\n", " response = gen({\"input_str\": \"What do you see in this image?\"})\n", " print(f\"Response with invalid image URL: {response}\")" @@ -2306,7 +2317,7 @@ "if __name__ == \"__main__\":\n", " print(\"Starting OpenAI Vision and DALL-E test...\\n\")\n", " test_invalid_image_url()\n", - " test_vision_and_generation() " + " test_vision_and_generation()" ] }, { diff --git a/test_graph.py b/test_graph.py new file mode 100644 index 00000000..e1577916 --- /dev/null +++ b/test_graph.py @@ -0,0 +1,272 @@ +# node_graph_visualizer.py + +import os +from pyvis.network import Network +import streamlit as st +import networkx as nx +from jinja2 import Template + + +# Node class definition +class Node: + def __init__( + self, + id, + name, + role_desc, + data, + data_id, + previous_data, + requires_opt, + param_type, + gradients, + ): + self.id = id + self.name = name + self.role_desc = role_desc + self.data = data + self.data_id = data_id + self.previous_data = previous_data + 
self.requires_opt = requires_opt + self.param_type = param_type + self.gradients = gradients + + def get_gradients_names(self): + return self.gradients.split(", ") if self.gradients else [] + + +# Function to generate individual HTML pages for each node +def generate_node_html(node, output_dir="node_pages"): + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + filename = f"{output_dir}/{node.name}.html" + # from_param = Parameter("from", "From Parameter") + # dummy_gradients = Gradient("dummy", "Dummy Gradient") + with open(filename, "w") as file: + file.write( + f""" + + + + + + {node.name} + + +

Details for Node: {node.name}

+

ID: {node.id}

+

Role: {node.role_desc}

+

Data: {node.data}

+

Data ID: {node.data_id}

+

Previous Value: {node.previous_data}

+

Requires Optimization: {node.requires_opt}

+

Type: {node.param_type}

+

Gradients: {', '.join(node.get_gradients_names())}

+ + + """ + ) + print(f"Generated HTML for node: {node.name} at {filename}") + + +# Function to create the main graph with clickable links to individual node pages +def create_graph_with_links( + nodes, edges, output_dir="node_pages", main_file="graph.html" +): + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + net = Network(height="750px", width="100%", directed=True) + net.toggle_physics(True) + net.template = Template( + """ + + + + + + + + +
+
+ +
+
+ + + + """ + ) + + for node in nodes: + # Generate individual HTML pages for each node + generate_node_html(node, output_dir) + + # Add node to the main graph with link to its HTML page + net.add_node( + node.id, + label=node.name, + title=f"Open Details", + shape="dot", + size=15, + url=f"{output_dir}/{node.name}.html", # Add the URL here + ) + + for edge in edges: + net.add_edge(edge[0].id, edge[1].id) + + net.show(main_file) + print(f"Generated main graph HTML at {main_file}") + + +# Function to create a Streamlit app for interactive graph exploration +def create_interactive_streamlit_app(nodes, edges): + G = nx.DiGraph() + for node in nodes: + G.add_node(node.id, node_obj=node) + G.add_edges_from([(edge[0].id, edge[1].id) for edge in edges]) + + st.title("Interactive Graph Visualization") + st.sidebar.title("Node Selector") + selected_node_name = st.sidebar.selectbox( + "Select a node", [node.name for node in nodes] + ) + + net = Network(height="500px", width="100%", directed=True) + net.template = Template( + """ + + + + + + + + +
+
+ + + + """ + ) + + for node in nodes: + net.add_node(node.id, label=node.name) + for edge in edges: + net.add_edge(edge[0].id, edge[1].id) + + net.save_graph("graph.html") + st.components.v1.html(open("graph.html", "r").read(), height=550) + + if selected_node_name: + selected_node = next(node for node in nodes if node.name == selected_node_name) + st.subheader(f"Details for Node: {selected_node.name}") + st.write(f"**ID**: {selected_node.id}") + st.write(f"**Role**: {selected_node.role_desc}") + st.write(f"**Data**: {selected_node.data}") + st.write(f"**Data ID**: {selected_node.data_id}") + st.write(f"**Previous Value**: {selected_node.previous_data}") + st.write(f"**Requires Optimization**: {selected_node.requires_opt}") + st.write(f"**Type**: {selected_node.param_type}") + st.write(f"**Gradients**: {', '.join(selected_node.get_gradients_names())}") + + +if __name__ == "__main__": + # Dummy data + dummy_nodes = [ + Node(1, "Node1", "Input", "Value1", "D1", "Prev1", "Yes", "Type1", "Grad1"), + Node(2, "Node2", "Process", "Value2", "D2", "Prev2", "No", "Type2", "Grad2"), + Node(3, "Node3", "Output", "Value3", "D3", "Prev3", "Yes", "Type3", "Grad3"), + ] + + dummy_edges = [(dummy_nodes[0], dummy_nodes[1]), (dummy_nodes[1], dummy_nodes[2])] + + # Test HTML generation + create_graph_with_links(dummy_nodes, dummy_edges) + + # Uncomment the following line to test the Streamlit app + # create_interactive_streamlit_app(dummy_nodes, dummy_edges) diff --git a/text_grad_2_0_recompute.py b/text_grad_2_0_recompute.py new file mode 100644 index 00000000..bd52905b --- /dev/null +++ b/text_grad_2_0_recompute.py @@ -0,0 +1,127 @@ +import json +import os +import math + + +def recompute_metrics_and_update_summary(result_file): + try: + # Load the results file + with open(result_file, "r") as f: + ckpt_values = json.load(f) + + # Initialize variables for metrics computation + highest_test_score = 0 + mean_test_score = 0 + standard_deviation = 0 + past_highest_scores = [] + 
past_highest_test_scores = [] + + average_pass_rate_list = [] + average_pass_prompts_list = [] + average_total_prompts_list = [] + + highest_val_score = 0 + + # Process each experiment + for experiment, data in ckpt_values.items(): + if "summary" in experiment: + continue # Skip summary entries + + ckpt_path = data + + if os.path.exists(ckpt_path): + with open(ckpt_path, "r") as ckpt_file: + experiment_data = json.load(ckpt_file) + + val_scores = experiment_data.get("val_scores", []) + test_scores = experiment_data.get("test_scores", []) + _high_test_score = max(val_scores, default=0) + _high_val_score = max(test_scores, default=0) + + past_highest_scores.append(_high_test_score) + past_highest_test_scores.append(_high_val_score) + + if _high_test_score > highest_test_score: + highest_test_score = _high_test_score + + if _high_val_score > highest_val_score: + highest_val_score = _high_val_score + + effective_measures = experiment_data.get("effective_measure", {}) + + if effective_measures: + pass_num = effective_measures["valset"].get("pass", 0) + total_val_prompts = effective_measures["valset"].get( + "pass", 0 + ) + effective_measures["valset"].get("fail", 0) + else: + total_val_prompts = len(val_scores) - 1 + pass_num = len(set(val_scores)) + + average_pass_rate = ( + pass_num / total_val_prompts if total_val_prompts > 0 else 0 + ) + average_pass_rate_list.append(average_pass_rate) + average_pass_prompts_list.append(pass_num) + average_total_prompts_list.append(total_val_prompts) + + # Compute final metrics + if past_highest_scores: + mean_test_score = sum(past_highest_scores) / len(past_highest_scores) + standard_deviation = math.sqrt( + sum((x - mean_test_score) ** 2 for x in past_highest_scores) + / len(past_highest_scores) + ) + + average_pass_rate = ( + sum(average_pass_rate_list) / len(average_pass_rate_list) + if average_pass_rate_list + else 0 + ) + average_pass_prompts = ( + sum(average_pass_prompts_list) / len(average_pass_prompts_list) + if 
average_pass_prompts_list + else 0 + ) + average_total_prompts = ( + sum(average_total_prompts_list) / len(average_total_prompts_list) + if average_total_prompts_list + else 0 + ) + + # Update the summary in ckpt_values + summary_key = "summary" + ckpt_values[summary_key] = { + "highest_test_score": highest_test_score, + "mean_test_score": mean_test_score, + "standard_deviation": standard_deviation, + "average_pass_rate": average_pass_rate, + "average_pass_prompts": average_pass_prompts, + "average_total_prompts": average_total_prompts, + "past_highest_scores": past_highest_scores, + "past_highest_test_scores": past_highest_test_scores, + "highest_val_score": highest_val_score, + } + + # Save updated ckpt_values back to the file + with open(result_file, "w") as f: + json.dump(ckpt_values, f, indent=4) + + return ckpt_values[summary_key] + + except Exception as e: + print(f"Error while recomputing metrics: {e}") + return None + + +# Usage +if __name__ == "__main__": + result_file = "results.json" # Replace with your actual result file + result_file = "text_grad_2_results_4_runs_1872c441-0db2-4640-9cf6-8ef910744a93.json" + result_file = "text_grad_2_results_4_runs_02b9f463-aa21-4485-9899-07ac2542ddac.json" # only use fullset + summary = recompute_metrics_and_update_summary(result_file) + + if summary: + print("Updated Summary:") + for key, value in summary.items(): + print(f"{key}: {value}") diff --git a/tutorials/multimodal_client_testing_examples.py b/tutorials/multimodal_client_testing_examples.py index 44bdbc69..ee3a171d 100644 --- a/tutorials/multimodal_client_testing_examples.py +++ b/tutorials/multimodal_client_testing_examples.py @@ -17,31 +17,26 @@ from adalflow.core import Generator from adalflow.components.model_client.openai_client import OpenAIClient from adalflow.core.types import ModelType -import asyncio -import numpy as np -from dataclasses import dataclass -from typing import List -from numpy.linalg import norm + class ImageGenerator(Generator): 
"""Generator subclass for image generation.""" + model_type = ModelType.IMAGE_GENERATION + def test_basic_generation(): """Test basic text generation""" client = OpenAIClient() gen = Generator( - model_client=client, - model_kwargs={ - "model": "gpt-4o-mini", - "max_tokens": 100 - } + model_client=client, model_kwargs={"model": "gpt-4o-mini", "max_tokens": 100} ) - + print("\n=== Testing Basic Generation ===") response = gen({"input_str": "Hello, world!"}) print(f"Response: {response}") + def test_invalid_image_url(): """Test Generator output with invalid image URL""" client = OpenAIClient() @@ -50,14 +45,15 @@ def test_invalid_image_url(): model_kwargs={ "model": "gpt-4o-mini", "images": "https://invalid.url/nonexistent.jpg", - "max_tokens": 300 - } + "max_tokens": 300, + }, ) - + print("\n=== Testing Invalid Image URL ===") response = gen({"input_str": "What do you see in this image?"}) print(f"Response with invalid image URL: {response}") + def test_invalid_image_generation(): """Test DALL-E generation with invalid parameters""" client = OpenAIClient() @@ -67,29 +63,32 @@ def test_invalid_image_generation(): "model": "dall-e-3", "size": "invalid_size", # Invalid size parameter "quality": "standard", - "n": 1 - } + "n": 1, + }, ) - + print("\n=== Testing Invalid DALL-E Parameters ===") response = gen({"input_str": "A cat"}) print(f"Response with invalid DALL-E parameters: {response}") + def test_vision_and_generation(): """Test both vision analysis and image generation""" client = OpenAIClient() - + # 1. Test Vision Analysis vision_gen = Generator( model_client=client, model_kwargs={ "model": "gpt-4o-mini", "images": "https://upload.wikimedia.org/wikipedia/en/7/7d/Lenna_%28test_image%29.png", - "max_tokens": 300 - } + "max_tokens": 300, + }, + ) + + vision_response = vision_gen( + {"input_str": "What do you see in this image? Be detailed but concise."} ) - - vision_response = vision_gen({"input_str": "What do you see in this image? 
Be detailed but concise."}) print("\n=== Vision Analysis ===") print(f"Description: {vision_response.raw_response}") @@ -100,20 +99,23 @@ def test_vision_and_generation(): "model": "dall-e-3", "size": "1024x1024", "quality": "standard", - "n": 1 - } + "n": 1, + }, ) - + # For image generation, input_str becomes the prompt - response = dalle_gen({"input_str": "A happy siamese cat playing with a red ball of yarn"}) + response = dalle_gen( + {"input_str": "A happy siamese cat playing with a red ball of yarn"} + ) print("\n=== DALL-E Generation ===") print(f"Generated Image URL: {response.data}") + if __name__ == "__main__": print("Starting OpenAI Vision and DALL-E test...\n") - + # Run all tests - they will show errors if API key is invalid/empty test_basic_generation() test_invalid_image_url() test_invalid_image_generation() - test_vision_and_generation() \ No newline at end of file + test_vision_and_generation() diff --git a/tutorials/react_note.py b/tutorials/react_note.py index 58dc93e9..072b318b 100644 --- a/tutorials/react_note.py +++ b/tutorials/react_note.py @@ -68,6 +68,50 @@ def test_react_agent(model_client: ModelClient, model_kwargs: dict): print("") +""" +To have an agent. +input, prompt, template, step_history -> generator +-> stepoutput -> step_history -> generator -> stepoutput -> step_history +-> generator -> stepoutput -> step_history -> generator -> stepoutput -> step_history +""" + + +def test_react_agent_train(model_client: ModelClient, model_kwargs: dict): + tools = [multiply, add, divide] + queries = [ + "What is the capital of France? 
and what is 465 times 321 then add 95297 and then divide by 13.2?", + "Give me 5 words rhyming with cool, and make a 4-sentence poem using them", + ] + # define a generator without tools for comparison + + # generator = Generator( + # model_client=model_client, + # model_kwargs=model_kwargs, + # ) + + react = ReActAgent( + max_steps=6, + add_llm_as_fallback=True, + tools=tools, + model_client=model_client, + model_kwargs=model_kwargs, + ) + # print(react) + react.train() + + for query in queries: + print(f"Query: {query}") + agent_response = react.forward(query) + agent_response.draw_graph() + agent_response.draw_output_subgraph() + # print(f"Agent response: {agent_response}") + + break + # llm_response = generator.call(prompt_kwargs={"input_str": query}) + # print(f"LLM response: {llm_response}") + print("") + + def test_react_agent_use_examples(model_client: ModelClient, model_kwargs: dict): tools = [multiply, add, divide] queries = [ @@ -106,12 +150,12 @@ def test_react_agent_use_examples(model_client: ModelClient, model_kwargs: dict) if __name__ == "__main__": - from adalflow.utils import get_logger - get_logger(level="DEBUG") + # get_logger(level="DEBUG") - test_react_agent(ModelClientType.GROQ(), llama3_model_kwargs) - test_react_agent(ModelClientType.OPENAI(), gpt_model_kwargs) - print("Done") + # test_react_agent(ModelClientType.GROQ(), llama3_model_kwargs) + test_react_agent_train(ModelClientType.OPENAI(), gpt_model_kwargs) + # test_react_agent(ModelClientType.OPENAI(), gpt_model_kwargs) + # print("Done") - test_react_agent_use_examples(ModelClientType.GROQ(), llama3_model_kwargs) + # test_react_agent_use_examples(ModelClientType.GROQ(), llama3_model_kwargs) diff --git a/use_cases/agent/react_agent.ipynb b/use_cases/agent/react_agent.ipynb deleted file mode 100644 index a93cb89e..00000000 --- a/use_cases/agent/react_agent.ipynb +++ /dev/null @@ -1,1387 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# ReAct Agent 
Use Case" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 1. Q&A Chatbot\n", - "In this tutorial, we will implement ``adalflow ReAct`` to build a Q&A chatbot on [HotpotQA](https://arxiv.org/pdf/1809.09600) dataset. \n", - "\n", - "To learn more about ``adalflow ReAct``, please refer to our developer notes." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# 2. HotpotQA Dataset\n", - "We are using [HotpotQA](https://arxiv.org/pdf/1809.09600). It is a Wikipedia-based multi-hop question and answer dataset." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/alleria/Documents/sylphAI/lightrag_package/LightRAG/.venv/lib/python3.11/site-packages/datasets/table.py:1421: FutureWarning: promote has been superseded by promote_options='default'.\n", - " table = cls._concat_blocks(blocks, axis=0)\n" - ] - } - ], - "source": [ - "# load the dataset\n", - "from datasets import load_dataset\n", - "\n", - "dataset = load_dataset(path=\"hotpot_qa\", name=\"fullwiki\")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "len of eval: 7405\n", - "example: {'id': '5a8b57f25542995d1e6f1371', 'question': 'Were Scott Derrickson and Ed Wood of the same nationality?', 'answer': 'yes', 'type': 'comparison', 'level': 'hard', 'supporting_facts': {'title': ['Scott Derrickson', 'Ed Wood'], 'sent_id': [0, 0]}, 'context': {'title': ['Adam Collis', 'Ed Wood (film)', 'Tyler Bates', 'Doctor Strange (2016 film)', 'Hellraiser: Inferno', 'Sinister (film)', 'Deliver Us from Evil (2014 film)', 'Woodson, Arkansas', 'Conrad Brooks', 'The Exorcism of Emily Rose'], 'sentences': [['Adam Collis is an American filmmaker and actor.', ' He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles 
from 2007 to 2010.', ' He also studied cinema at the University of Southern California from 1991 to 1997.', ' Collis first work was the assistant director for the Scott Derrickson\\'s short \"Love in the Ruins\" (1995).', ' In 1998, he played \"Crankshaft\" in Eric Koyanagi\\'s \"Hundred Percent\".'], ['Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.', \" The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau.\", ' Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.'], ['Tyler Bates (born June 5, 1965) is an American musician, music producer, and composer for films, television, and video games.', ' Much of his work is in the action and horror film genres, with films like \"Dawn of the Dead, 300, Sucker Punch,\" and \"John Wick.\"', ' He has collaborated with directors like Zack Snyder, Rob Zombie, Neil Marshall, William Friedkin, Scott Derrickson, and James Gunn.', ' With Gunn, he has scored every one of the director\\'s films; including \"Guardians of the Galaxy\", which became one of the highest grossing domestic movies of 2014, and its 2017 sequel.', ' In addition, he is also the lead guitarist of the American rock band Marilyn Manson, and produced its albums \"The Pale Emperor\" and \"Heaven Upside Down\".'], ['Doctor Strange is a 2016 American superhero film based on the Marvel Comics character of the same name, produced by Marvel Studios and distributed by Walt Disney Studios Motion Pictures.', ' It is the fourteenth film of the Marvel Cinematic Universe (MCU).', ' The film was directed by Scott Derrickson, who wrote it with Jon Spaihts and C. 
Robert Cargill, and stars Benedict Cumberbatch as Stephen Strange, along with Chiwetel Ejiofor, Rachel McAdams, Benedict Wong, Michael Stuhlbarg, Benjamin Bratt, Scott Adkins, Mads Mikkelsen, and Tilda Swinton.', ' In \"Doctor Strange\", surgeon Strange learns the mystic arts after a career-ending car accident.'], ['Hellraiser: Inferno (also known as Hellraiser V: Inferno) is a 2000 American horror film.', ' It is the fifth installment in the \"Hellraiser\" series and the first \"Hellraiser\" film to go straight-to-DVD.', ' It was directed by Scott Derrickson and released on October 3, 2000.', \" The film concerns a corrupt detective who discovers Lemarchand's box at a crime scene.\", \" The film's reviews were mixed.\"], ['Sinister is a 2012 supernatural horror film directed by Scott Derrickson and written by Derrickson and C. Robert Cargill.', ' It stars Ethan Hawke as fictional true-crime writer Ellison Oswalt who discovers a box of home movies in his attic that puts his family in danger.'], ['Deliver Us from Evil is a 2014 American supernatural horror film directed by Scott Derrickson and produced by Jerry Bruckheimer.', ' The film is officially based on a 2001 non-fiction book entitled \"Beware the Night\" by Ralph Sarchie and Lisa Collier Cool, and its marketing campaign highlighted that it was \"inspired by actual accounts\".', ' The film stars Eric Bana, Édgar Ramírez, Sean Harris, Olivia Munn, and Joel McHale in the main roles and was released on July 2, 2014.'], ['Woodson is a census-designated place (CDP) in Pulaski County, Arkansas, in the United States.', ' Its population was 403 at the 2010 census.', ' It is part of the Little Rock–North Little Rock–Conway Metropolitan Statistical Area.', ' Woodson and its accompanying Woodson Lake and Wood Hollow are the namesake for Ed Wood Sr., a prominent plantation owner, trader, and businessman at the turn of the 20th century.', ' Woodson is adjacent to the Wood Plantation, the largest of the plantations own by 
Ed Wood Sr.'], ['Conrad Brooks (born Conrad Biedrzycki on January 3, 1931 in Baltimore, Maryland) is an American actor.', ' He moved to Hollywood, California in 1948 to pursue a career in acting.', ' He got his start in movies appearing in Ed Wood films such as \"Plan 9 from Outer Space\", \"Glen or Glenda\", and \"Jail Bait.\"', ' He took a break from acting during the 1960s and 1970s but due to the ongoing interest in the films of Ed Wood, he reemerged in the 1980s and has become a prolific actor.', ' He also has since gone on to write, produce and direct several films.'], ['The Exorcism of Emily Rose is a 2005 American legal drama horror film directed by Scott Derrickson and starring Laura Linney and Tom Wilkinson.', ' The film is loosely based on the story of Anneliese Michel and follows a self-proclaimed agnostic who acts as defense counsel (Linney) representing a parish priest (Wilkinson), accused by the state of negligent homicide after he performed an exorcism.']]}}\n", - "attributes in each sample: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context']\n" - ] - } - ], - "source": [ - "# check the data sample\n", - "test_sample = dataset[\"validation\"][0]\n", - "print(f\"len of eval: {len(dataset['validation'])}\")\n", - "print(f\"example: {test_sample}\")\n", - "print(f\"attributes in each sample: {list(test_sample.keys())}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "question: Were Scott Derrickson and Ed Wood of the same nationality?\n", - "answer: yes\n" - ] - } - ], - "source": [ - "# Each sample contains a question and a corresponding answer.\n", - "print(f\"question: {test_sample.get('question')}\")\n", - "print(f\"answer: {test_sample.get('answer')}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 3. 
Set up\n", - "Please make sure you have set the model client APIs before running the agent. Now import the necessary packages." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import dotenv\n", - "from adalflow.components.model_client import OpenAIClient\n", - "from adalflow.components.agent.react_agent import ReActAgent\n", - "from adalflow.core.tool_helper import FunctionTool\n", - "\n", - "import time\n", - "\n", - "# load evironment, please set the relative path to your .env file that includes the api key\n", - "dotenv.load_dotenv(dotenv_path=\"../../.env\", override=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 4. Create Agent\n", - "To create an gent, we need to define the basic components.\n", - "\n", - "## Tools\n", - "Firstly, we need to specify what functions the agent will need to answer the question. In this case, we are answering the Wikipedia-based questions, we will allow the agent to **search** Wikipedia api. The [ReAct Paper](https://arxiv.org/pdf/2210.03629) includes a **lookup** function that serves as Ctrl+F functionality on the browser.\n", - "\n", - "As ``adalflow ReAct`` has a built in ``finish`` function, we don't need to define by ourselves." 
- ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "import requests\n", - "from bs4 import BeautifulSoup\n", - "import re\n", - "import string\n", - "\n", - "\n", - "# copy code from the paper\n", - "def clean_str(p):\n", - " return p.encode().decode(\"unicode-escape\").encode(\"latin1\").decode(\"utf-8\")\n", - "\n", - "\n", - "# normalization copied from the paper's code\n", - "def normalize_answer(s):\n", - " def remove_articles(text):\n", - " return re.sub(r\"\\b(a|an|the)\\b\", \" \", text)\n", - "\n", - " def white_space_fix(text):\n", - " return \" \".join(text.split())\n", - "\n", - " def remove_punc(text):\n", - " exclude = set(string.punctuation)\n", - " return \"\".join(ch for ch in text if ch not in exclude)\n", - "\n", - " def lower(text):\n", - " return text.lower()\n", - "\n", - " return white_space_fix(remove_articles(remove_punc(lower(s))))\n", - "\n", - "\n", - "def search(entity: str) -> str:\n", - " \"\"\"\n", - " searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.\n", - " \"\"\"\n", - " # Format the entity for URL encoding\n", - " entity_formatted = entity.replace(\" \", \"+\")\n", - " url = f\"https://en.wikipedia.org/w/index.php?search={entity_formatted}\"\n", - "\n", - " # Fetch the page\n", - " response = requests.get(url)\n", - " soup = BeautifulSoup(response.text, \"html.parser\")\n", - "\n", - " # Check if the exact page was found or suggest similar items\n", - " # when
is detected, it means the entity page is not found on wikipedia\n", - " result_divs = soup.find_all(\"div\", {\"class\": \"mw-search-result-heading\"})\n", - "\n", - " if (\n", - " result_divs\n", - " ): # this means the searched entity page is not in wikipedia, wikipedia will show a list of similar entities\n", - " # get Similar results\n", - " similar_titles = [div.a.get_text() for div in result_divs]\n", - " return f\"Could not find exact page for '{entity}'. Similar topics: {similar_titles[:5]}\" # return the top 5 similar titles\n", - " else:\n", - " # the paper uses page to represent content in

\n", - " # Extract xontent\n", - " page_list = [\n", - " p.get_text().strip() for p in soup.find_all(\"p\") + soup.find_all(\"ul\")\n", - " ]\n", - " # TODO: Recursive search, if find any concept that needs more search then call search again\n", - " # if any(\"may refer to:\" in p for p in page_list):\n", - " # search(entity)\n", - "\n", - " # restructure & clean the page content following the paper's logic\n", - " page = \"\"\n", - " for p in page_list:\n", - " if len(p.split(\" \")) > 2:\n", - " page += clean_str(p)\n", - " if not p.endswith(\"\\n\"):\n", - " page += \"\\n\"\n", - " paragraphs = page.split(\"\\n\")\n", - " paragraphs = [p.strip() for p in paragraphs if p.strip()]\n", - "\n", - " sentences = []\n", - " for p in paragraphs:\n", - " sentences += p.split(\". \")\n", - " sentences = [s.strip() + \".\" for s in sentences if s.strip()]\n", - "\n", - " # return the first 5 sentences\n", - " if sentences:\n", - " return (\n", - " \" \".join(sentences[:5]) if len(sentences) >= 5 else \" \".join(sentences)\n", - " )\n", - " else:\n", - " return \"No content found on this page.\"\n", - "\n", - " # TODO: clean the paragraphs and return the searched content\n", - "\n", - "\n", - "def lookup(text: str, keyword: str) -> str:\n", - " \"\"\"\n", - " returns the sentences containing keyword in the current passage.\n", - " \"\"\"\n", - " sentences = text.split(\".\")\n", - " matching_sentences = [\n", - " sentence.strip() + \".\"\n", - " for sentence in sentences\n", - " if keyword.lower() in sentence.lower()\n", - " ]\n", - " if not matching_sentences:\n", - " return \"No sentences found with the keyword.\"\n", - " else:\n", - " return \" \".join(\n", - " matching_sentences\n", - " ) # Join all matching sentences into a single string" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "# set up tools for the agent\n", - "tools = [FunctionTool.from_defaults(fn=search), 
FunctionTool.from_defaults(fn=lookup)]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Examples\n", - "The next thing to add is examples. Few shot prompt engineering is a common practice to improve the model performance.\n", - "\n", - "Let's use the paper's examples. The paper has 6 examples altogether." - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "examples = [\n", - " \"\"\"Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?\n", - "Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.\n", - "Action 1: search(\"Colorado orogeny\")\n", - "Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.\n", - "Thought 2: It does not mention the eastern sector. So I need to look up eastern sector.\n", - "Action 2: lookup(\"eastern sector\")\n", - "Observation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.\n", - "Thought 3: The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.\n", - "Action 3: search(\"High Plains\")\n", - "Observation 3: High Plains refers to one of two distinct land regions:\n", - "Thought 4: I need to instead search High Plains (United States).\n", - "Action 4: search(\"High Plains (United States)\")\n", - "Observation 4: The High Plains are a subregion of the Great Plains. 
From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]\n", - "Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.\n", - "Action 5: finish(\"1,800 to 7,000 ft\")\"\"\",\n", - " \"\"\"Question: Musician and satirist Allie Goertz wrote a song about the \"The Simpsons\" character Milhouse, who Matt Groening named after who?\n", - "Thought 1: The question simplifies to \"The Simpsons\" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.\n", - "Action 1: search(\"Milhouse\")\n", - "Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.\n", - "Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up \"named after\".\n", - "Action 2: lookup(\"named after\")\n", - "Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. \n", - "Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.\n", - "Action 3: finish(\"Richard Nixon\")\"\"\",\n", - " \"\"\"Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?\n", - "Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.\n", - "Action 1: search(\"Adam Clayton Powell\")\n", - "Observation 1: Could not find [\"Adam Clayton Powell\"]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. 
State Office Building', 'Isabel Washington Powell', 'Adam Powell', 'Adam Clayton Powell (film)', 'Giancarlo Esposito'].\n", - "Thought 2: To find the documentary, I can search Adam Clayton Powell (film).\n", - "Action 2: search(\"Adam Clayton Powell (film)\")\n", - "Observation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.\n", - "The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.\n", - "Thought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.\n", - "Action 3: finish(\"The Saimaa Gesture\")\"\"\",\n", - " \"\"\"Question: What profession does Nicholas Ray and Elia Kazan have in common?\n", - "Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.\n", - "Action 1: search(\"Nicholas Ray\")\n", - "Observation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.\n", - "Thought 2: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions.\n", - "Action 2: search(\"Elia Kazan\")\n", - "Observation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.\n", - "Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. 
So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.\n", - "Action 3: finish(\"director, screenwriter, actor\")\"\"\",\n", - " \"\"\"Question: Which magazine was started first Arthur's Magazine or First for Women?\n", - "Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.\n", - "Action 1: search(\"Arthur's Magazine\")\n", - "Observation 1: Arthur's Magazine (1844-€“1846) was an American literary periodical published in Philadelphia in the 19th century. \n", - "Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.\n", - "Action 2: search(\"First for Women\")\n", - "Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. \n", - "Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.\n", - "Action 3: finish(\"Arthur's Magazine\")\"\"\",\n", - " \"\"\"Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?\n", - "Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.\n", - "Action 1: search(\"Pavel Urysohn\")\n", - "Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 – August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.\n", - "Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.\n", - "Action 2: search(\"Leonid Levin\")\n", - "Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. \n", - "Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. 
\n", - "Action 3: finish(\"yes\")\"\"\",\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "# preset up the examples as prompt_kwargs, the examples will be included in the system prompt\n", - "\n", - "preset_prompt_kwargs = {\"examples\": examples}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model\n", - "\n", - "Next, we can choose the model to call. In this example we will use OpenAIClient ``gpt-3.5-turbo`` model. We will set the ``temperature`` at 0.0 to make the response as consistent as possible." - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "gpt_model_kwargs = {\n", - " \"model\": \"gpt-3.5-turbo\",\n", - " \"temperature\": 0.0,\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Agent\n", - "Combining the previous components, we can define the agent." - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ReActAgent(\n", - " tools=[FunctionTool(search), FunctionTool(lookup), FunctionTool(llm_tool), FunctionTool(finish)], max_steps=3, model_kwargs={'model': 'gpt-3.5-turbo', 'temperature': 0.0}, \n", - " (prompt): Prompt(\n", - " template: \n", - " {# role/task description #}\n", - " You task is to answer user's query with minimum steps and maximum accuracy using the tools provided.\n", - " {# REACT instructions #}\n", - " Each step you will read the previous Thought, Action, and Observation(execution result of the action)steps and then provide the next Thought and Action.\n", - " \n", - " You only have access to the following tools:\n", - " {# tools #}\n", - " {% for tool in tools %}\n", - " {{ loop.index }}. 
ToolName: {{ tool.metadata.name }}\n", - " Tool Description: {{ tool.metadata.description }}\n", - " Tool Parameters: {{ tool.metadata.fn_schema_str }} {#tool args can be misleading, especially if we already have type hints and docstring in the function#}\n", - " {% endfor %}\n", - " {# output is always more robust to use json than string #}\n", - " ---\n", - " Your output must be in valid JSON format(raw Python string format) with two keys:\n", - " {\n", - " \"thought\": \"\",\n", - " \"action\": \"ToolName(, )\"\n", - " }\n", - " - Must double quote the JSON str.\n", - " - Inside of the JSON str, Must use escape double quote and escape backslash for string.\n", - " For example:\n", - " \"action\": \"finish(\\\"John's.\\\")\"\n", - " ---\n", - " {# Specifications TODO: preference between the usage of llm tool vs the other tool #}\n", - " Process:\n", - " - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery.\n", - " - Call one available tool at a time to solve each subquery/subquestion. \\\n", - " - At step 'finish', join all subqueries answers and finish the task.\n", - " Remember:\n", - " - Action must call one of the above tools with Took Name. It can not be empty.\n", - " - Read the Tool Description and ensure your args and kwarg follow what each tool expects in types. e.g. (a=1, b=2) if it is keyword argument or (1, 2) if it is positional.\n", - " - You will always end with 'finish' action to finish the task. 
The answer can be the final answer or failure message.\n", - " - When the initial query is simple, use minimum steps to answer the query.\n", - " {#Examples can be here#}\n", - " {# Check if there are any examples #}\n", - " {% if examples %}\n", - " \n", - " {% for example in examples %}\n", - " {{ example }}\n", - " {% endfor %}\n", - " \n", - " {% endif %}\n", - " <>\n", - " -----------------\n", - " {# History #}\n", - " {% for history in step_history %}\n", - " Step {{history.step}}:\n", - " {\n", - " \"thought\": \"{{history.thought}}\",\n", - " \"action\": \"{{history.action}}\",\n", - " }\n", - " \"observation\": \"{{history.observation}}\"\n", - " {% endfor %}\n", - " {% if input_str %}\n", - " User query:\n", - " {{ input_str }}\n", - " {% endif %}\n", - " , preset_prompt_kwargs: {'examples': ['Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?\\nThought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.\\nAction 1: search(\"Colorado orogeny\")\\nObservation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.\\nThought 2: It does not mention the eastern sector. So I need to look up eastern sector.\\nAction 2: lookup(\"eastern sector\")\\nObservation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.\\nThought 3: The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.\\nAction 3: search(\"High Plains\")\\nObservation 3: High Plains refers to one of two distinct land regions:\\nThought 4: I need to instead search High Plains (United States).\\nAction 4: search(\"High Plains (United States)\")\\nObservation 4: The High Plains are a subregion of the Great Plains. 
From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]\\nThought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.\\nAction 5: finish(\"1,800 to 7,000 ft\")', 'Question: Musician and satirist Allie Goertz wrote a song about the \"The Simpsons\" character Milhouse, who Matt Groening named after who?\\nThought 1: The question simplifies to \"The Simpsons\" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.\\nAction 1: search(\"Milhouse\")\\nObservation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.\\nThought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up \"named after\".\\nAction 2: lookup(\"named after\")\\nObservation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. \\nThought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.\\nAction 3: finish(\"Richard Nixon\")', 'Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?\\nThought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.\\nAction 1: search(\"Adam Clayton Powell\")\\nObservation 1: Could not find [\"Adam Clayton Powell\"]. Similar: [\\'Adam Clayton Powell III\\', \\'Seventh Avenue (Manhattan)\\', \\'Adam Clayton Powell Jr. 
State Office Building\\', \\'Isabel Washington Powell\\', \\'Adam Powell\\', \\'Adam Clayton Powell (film)\\', \\'Giancarlo Esposito\\'].\\nThought 2: To find the documentary, I can search Adam Clayton Powell (film).\\nAction 2: search(\"Adam Clayton Powell (film)\")\\nObservation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.\\nThe film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.\\nThought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.\\nAction 3: finish(\"The Saimaa Gesture\")', 'Question: What profession does Nicholas Ray and Elia Kazan have in common?\\nThought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.\\nAction 1: search(\"Nicholas Ray\")\\nObservation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.\\nThought 2: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions.\\nAction 2: search(\"Elia Kazan\")\\nObservation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.\\nThought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. 
So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.\\nAction 3: finish(\"director, screenwriter, actor\")', 'Question: Which magazine was started first Arthur\\'s Magazine or First for Women?\\nThought 1: I need to search Arthur\\'s Magazine and First for Women, and find which was started first.\\nAction 1: search(\"Arthur\\'s Magazine\")\\nObservation 1: Arthur\\'s Magazine (1844-\\x80\\x931846) was an American literary periodical published in Philadelphia in the 19th century. \\nThought 2: Arthur\\'s Magazine was started in 1844. I need to search First for Women next.\\nAction 2: search(\"First for Women\")\\nObservation 2: First for Women is a woman\\'s magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. \\nThought 3: First for Women was started in 1989. 1844 (Arthur\\'s Magazine) < 1989 (First for Women), so Arthur\\'s Magazine was started first.\\nAction 3: finish(\"Arthur\\'s Magazine\")', 'Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?\\nThought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.\\nAction 1: search(\"Pavel Urysohn\")\\nObservation 1: Pavel Samuilovich Urysohn (February 3, 1898 â\\x80\\x93 August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.\\nThought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.\\nAction 2: search(\"Leonid Levin\")\\nObservation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. \\nThought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. 
\\nAction 3: finish(\"yes\")'], 'tools': [FunctionTool(search), FunctionTool(lookup), FunctionTool(llm_tool), FunctionTool(finish)]}, prompt_variables: ['examples', 'step_history', 'input_str', 'tools']\n", - " )\n", - " (model_client): OpenAIClient()\n", - " (output_processors): JsonParser()\n", - " (additional_llm_tool): Generator(\n", - " model_kwargs={'model': 'gpt-3.5-turbo', 'temperature': 0.0}, \n", - " (prompt): Prompt(\n", - " template: \n", - " {% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %}\n", - " \n", - " {% endif %}\n", - " {# task desc #}\n", - " {% if task_desc_str %}\n", - " {{task_desc_str}}\n", - " {% endif %}\n", - " {# output format #}\n", - " {% if output_format_str %}\n", - " \n", - " {{output_format_str}}\n", - " \n", - " {% endif %}\n", - " {# tools #}\n", - " {% if tools_str %}\n", - " \n", - " {{tools_str}}\n", - " \n", - " {% endif %}\n", - " {# example #}\n", - " {% if examples_str %}\n", - " \n", - " {{examples_str}}\n", - " \n", - " {% endif %}\n", - " {# chat history #}\n", - " {% if chat_history_str %}\n", - " \n", - " {{chat_history_str}}\n", - " \n", - " {% endif %}\n", - " {#contex#}\n", - " {% if context_str %}\n", - " \n", - " {{context_str}}\n", - " \n", - " {% endif %}\n", - " {# steps #}\n", - " {% if steps_str %}\n", - " \n", - " {{steps_str}}\n", - " \n", - " {% endif %}\n", - " {% if task_desc_str or output_format_str or tools_str or examples_str or chat_history_str or context_str or steps_str %}\n", - " \n", - " {% endif %}\n", - " {% if input_str %}\n", - " \n", - " {{input_str}}\n", - " \n", - " {% endif %}\n", - " {% if output_str %}\n", - " \n", - " {{output_str}}\n", - " \n", - " {% endif %}\n", - " You:\n", - " , prompt_variables: ['context_str', 'input_str', 'steps_str', 'output_format_str', 'output_str', 'chat_history_str', 'tools_str', 'examples_str', 'task_desc_str']\n", - " )\n", - " (model_client): OpenAIClient()\n", - " )\n", - ")" - ] 
- }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# max_steps refers to how many thought-action round we allow the model to perform\n", - "# to save resources, let's use 3 here\n", - "agent = ReActAgent(\n", - " tools=tools,\n", - " max_steps=3,\n", - " model_client=OpenAIClient(),\n", - " model_kwargs=gpt_model_kwargs,\n", - " preset_prompt_kwargs=preset_prompt_kwargs,\n", - ")\n", - "agent" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import importlib\n", - "import adalflow\n", - "\n", - "importlib.reload(adalflow)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 5. Q & A\n", - "Next we can use the agent to answer our questions. Let's run 5 examples. We will use the validation data." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Dataset({\n", - " features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],\n", - " num_rows: 7405\n", - "})" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "val_dataset = dataset[\"validation\"]\n", - "val_dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "``LightRAG`` provides a ``printc`` function. You can utilize it to show colored console output for angent." 
- ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[36m2024-06-15 23:13:47 - [react_agent.py:330:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", - "\u001b[34m2024-06-15 23:13:48 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search Scott Derrickson and Ed Wood, find their nationalities, then determine if they are the same.\n", - "Action 1: search(\"Scott Derrickson\")\n", - "Observation 1: Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012) and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. He graduated from Biola University with a B.A.\u001b[0m\n", - "\u001b[34m2024-06-15 23:13:50 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: Scott Derrickson is an American filmmaker. I need to search Ed Wood next to find his nationality.\n", - "Action 2: search(\"Ed Wood\")\n", - "Observation 2: Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, screenwriter, and pulp novelist.. In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. 
Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\u001b[0m\n", - "\u001b[34m2024-06-15 23:13:51 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: Scott Derrickson is an American filmmaker and Ed Wood is also an American filmmaker. They are of the same nationality.\n", - "Action 3: finish(\"yes\")\n", - "Observation 3: yes\u001b[0m\n", - "\u001b[36m2024-06-15 23:13:51 - [react_agent.py:345:call] - answer: yes\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search Scott Derrickson and Ed Wood, find their nationalities, then determine if they are the same.', action='search(\"Scott Derrickson\")', fun_name='search', fun_args=['Scott Derrickson'], fun_kwargs={}, observation='Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012) and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. He graduated from Biola University with a B.A.'), StepOutput(step=2, thought='Scott Derrickson is an American filmmaker. I need to search Ed Wood next to find his nationality.', action='search(\"Ed Wood\")', fun_name='search', fun_args=['Ed Wood'], fun_kwargs={}, observation=\"Edward Davis Wood Jr. (October 10, 1924\\xa0– December 10, 1978) was an American filmmaker, actor, screenwriter, and pulp novelist.. 
In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\"), StepOutput(step=3, thought='Scott Derrickson is an American filmmaker and Ed Wood is also an American filmmaker. They are of the same nationality.', action='finish(\"yes\")', fun_name='finish', fun_args=['yes'], fun_kwargs={}, observation='yes')]\n", - "\u001b[33m2024-06-15 23:13:51 - [2706144185.py:12:] - question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: yes\u001b[0m\n", - "\u001b[36m2024-06-15 23:13:51 - [react_agent.py:330:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", - "\u001b[34m2024-06-15 23:13:53 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to find the woman who portrayed Corliss Archer in the film Kiss and Tell, then search for the government position she held.\n", - "Action 1: search(\"Corliss Archer Kiss and Tell film\")\n", - "Observation 1: Could not find exact page for 'Corliss Archer Kiss and Tell film'. 
Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "\u001b[34m2024-06-15 23:13:55 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.\n", - "Action 2: search(\"Corliss Archer Kiss and Tell film\")\n", - "Observation 2: Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.\",\n", - " \"action\": \"search(\\\"Corliss Archer Kiss and Tell film\\\")\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:13:57 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.\n", - "Action 3: search(\"Corliss Archer Kiss and Tell film\")\n", - "Observation 3: Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "\u001b[36m2024-06-15 23:13:57 - [react_agent.py:345:call] - answer: Could not find exact page for 'Corliss Archer Kiss and Tell film'. 
Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to find the woman who portrayed Corliss Archer in the film Kiss and Tell, then search for the government position she held.', action='search(\"Corliss Archer Kiss and Tell film\")', fun_name='search', fun_args=['Corliss Archer Kiss and Tell film'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\"), StepOutput(step=2, thought='I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action='search(\"Corliss Archer Kiss and Tell film\")', fun_name='search', fun_args=['Corliss Archer Kiss and Tell film'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\"), StepOutput(step=3, thought='I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action='search(\"Corliss Archer Kiss and Tell film\")', fun_name='search', fun_args=['Corliss Archer Kiss and Tell film'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer Kiss and Tell film'. 
Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\")]\n", - "\u001b[33m2024-06-15 23:13:57 - [2706144185.py:12:] - question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: could not find exact page for corliss archer kiss and tell film similar topics kiss for corliss corliss archer kiss and tell 1945 film kiss and tell play meet corliss archer\u001b[0m\n", - "\u001b[36m2024-06-15 23:13:57 - [react_agent.py:330:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", - "\u001b[34m2024-06-15 23:13:59 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for a science fantasy young adult series told in first person, with companion books narrating the stories of enslaved worlds and alien species.\n", - "Action 1: search(\"science fantasy young adult series first person companion books enslaved worlds alien species\")\n", - "Observation 1: Could not find exact page for 'science fantasy young adult series first person companion books enslaved worlds alien species'. Similar topics: ['Animorphs', 'Last Legionary', 'Barsoom', 'List of The Outer Limits (1995 TV series) episodes', 'LGBT themes in speculative fiction']\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:00 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The search results suggest 'Animorphs' as a similar topic, I should search for 'Animorphs' to see if it matches the criteria.\n", - "Action 2: search('Animorphs')\n", - "Observation 2: Animorphs is a science fantasy series of youth books written by Katherine Applegate and her husband Michael Grant,[2] writing together under the name K. A. 
Applegate,[3] and published by Scholastic.[4] It is told in first person, with all six main characters taking turns narrating the books through their own perspectives. Horror, war, imperialism, dehumanization, sanity, morality, innocence, leadership, freedom, family, and growing up are the core themes of the series.. Published between June 1996 and May 2001, the series consists of 54 books and includes ten companion books, eight of which fit into the series' continuity (the Animorphs Chronicles and Megamorphs books) and two that are gamebooks not fitting into the continuity (the Alternamorphs books)..\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:01 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The series that matches the criteria is 'Animorphs'. I should provide this as the answer.\n", - "Action 3: finish(\"Animorphs\")\n", - "Observation 3: Animorphs\u001b[0m\n", - "\u001b[36m2024-06-15 23:14:01 - [react_agent.py:345:call] - answer: Animorphs\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for a science fantasy young adult series told in first person, with companion books narrating the stories of enslaved worlds and alien species.', action='search(\"science fantasy young adult series first person companion books enslaved worlds alien species\")', fun_name='search', fun_args=['science fantasy young adult series first person companion books enslaved worlds alien species'], fun_kwargs={}, observation=\"Could not find exact page for 'science fantasy young adult series first person companion books enslaved worlds alien species'. 
Similar topics: ['Animorphs', 'Last Legionary', 'Barsoom', 'List of The Outer Limits (1995 TV series) episodes', 'LGBT themes in speculative fiction']\"), StepOutput(step=2, thought=\"The search results suggest 'Animorphs' as a similar topic, I should search for 'Animorphs' to see if it matches the criteria.\", action=\"search('Animorphs')\", fun_name='search', fun_args=['Animorphs'], fun_kwargs={}, observation=\"Animorphs is a science fantasy series of youth books written by Katherine Applegate and her husband Michael Grant,[2] writing together under the name K. A. Applegate,[3] and published by Scholastic.[4] It is told in first person, with all six main characters taking turns narrating the books through their own perspectives. Horror, war, imperialism, dehumanization, sanity, morality, innocence, leadership, freedom, family, and growing up are the core themes of the series.. Published between June 1996 and May 2001, the series consists of 54 books and includes ten companion books, eight of which fit into the series' continuity (the Animorphs Chronicles and Megamorphs books) and two that are gamebooks not fitting into the continuity (the Alternamorphs books)..\"), StepOutput(step=3, thought=\"The series that matches the criteria is 'Animorphs'. 
I should provide this as the answer.\", action='finish(\"Animorphs\")', fun_name='finish', fun_args=['Animorphs'], fun_kwargs={}, observation='Animorphs')]\n", - "\u001b[33m2024-06-15 23:14:01 - [2706144185.py:12:] - question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: animorphs\u001b[0m\n", - "\u001b[36m2024-06-15 23:14:01 - [react_agent.py:330:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:03 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the locations of Laleli Mosque and Esma Sultan Mansion to determine if they are in the same neighborhood.\n", - "Action 1: search(\"Laleli Mosque\")\n", - "Observation 1: The Laleli Mosque (Turkish: Laleli Camii, lit. 'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. 
The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. 
The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:04 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the location of Esma Sultan Mansion to determine if it is in the same neighborhood as Laleli Mosque.\n", - "Action 2: search(\"Esma Sultan Mansion\")\n", - "Observation 2: The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. 
Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:05 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to determine if Laleli Mosque and Esma Sultan Mansion are in the same neighborhood by comparing their locations.\n", - "Action 3: finish(\"No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\")\n", - "Observation 3: No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\u001b[0m\n", - "\u001b[36m2024-06-15 23:14:05 - [react_agent.py:345:call] - answer: No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for the locations of Laleli Mosque and Esma Sultan Mansion to determine if they are in the same neighborhood.', action='search(\"Laleli Mosque\")', fun_name='search', fun_args=['Laleli Mosque'], fun_kwargs={}, observation=\"The Laleli Mosque (Turkish: Laleli Camii, lit.\\u2009'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. 
The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. 
The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\"), StepOutput(step=2, thought='I need to search for the location of Esma Sultan Mansion to determine if it is in the same neighborhood as Laleli Mosque.', action='search(\"Esma Sultan Mansion\")', fun_name='search', fun_args=['Esma Sultan Mansion'], fun_kwargs={}, observation='The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. 
Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].'), StepOutput(step=3, thought='I need to determine if Laleli Mosque and Esma Sultan Mansion are in the same neighborhood by comparing their locations.', action='finish(\"No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\")', fun_name='finish', fun_args=['No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.'], fun_kwargs={}, observation='No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.')]\n", - "\u001b[33m2024-06-15 23:14:05 - [2706144185.py:12:] - question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: no laleli mosque is located in laleli fatih istanbul turkey while esma sultan mansion is in ortaköy neighborhood of istanbul turkey\u001b[0m\n", - "\u001b[36m2024-06-15 23:14:06 - [react_agent.py:330:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:07 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the director of the romantic comedy 'Big Stone Gap' and then find the New York city they are based in.\n", - "Action 1: search('Big Stone Gap director')\n", - "Observation 1: Could not find exact page for 'Big Stone Gap director'. 
Similar topics: ['Big Stone Gap (film)', 'Adriana Trigiani', 'Union High School (Big Stone Gap, Virginia)', 'Star Wood', 'Dagmara Domińczyk']\u001b[0m\n", - "\u001b[34m2024-06-15 23:14:09 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the New York city where the romantic comedy 'Big Stone Gap' is based.\n", - "Action 2: search('Big Stone Gap film location')\n", - "Observation 2: Could not find exact page for 'Big Stone Gap film location'. Similar topics: ['Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'The Beach (film)']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the New York city where the romantic comedy 'Big Stone Gap' is based.\",\n", - " \"action\": \"search('Big Stone Gap film location')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:14:11 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to search for the New York city where the romantic comedy 'Big Stone Gap' is based.\n", - "Action 3: search('Big Stone Gap film location')\n", - "Observation 3: Could not find exact page for 'Big Stone Gap film location'. Similar topics: ['Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'The Beach (film)']\u001b[0m\n", - "\u001b[36m2024-06-15 23:14:11 - [react_agent.py:345:call] - answer: Could not find exact page for 'Big Stone Gap film location'. 
Similar topics: ['Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'The Beach (film)']\u001b[0m\n", - "step_history: [StepOutput(step=1, thought=\"I need to search for the director of the romantic comedy 'Big Stone Gap' and then find the New York city they are based in.\", action=\"search('Big Stone Gap director')\", fun_name='search', fun_args=['Big Stone Gap director'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap director'. Similar topics: ['Big Stone Gap (film)', 'Adriana Trigiani', 'Union High School (Big Stone Gap, Virginia)', 'Star Wood', 'Dagmara Domińczyk']\"), StepOutput(step=2, thought=\"I need to search for the New York city where the romantic comedy 'Big Stone Gap' is based.\", action=\"search('Big Stone Gap film location')\", fun_name='search', fun_args=['Big Stone Gap film location'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film location'. Similar topics: ['Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'The Beach (film)']\"), StepOutput(step=3, thought=\"I need to search for the New York city where the romantic comedy 'Big Stone Gap' is based.\", action=\"search('Big Stone Gap film location')\", fun_name='search', fun_args=['Big Stone Gap film location'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film location'. 
Similar topics: ['Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'The Beach (film)']\")]\n", - "\u001b[33m2024-06-15 23:14:11 - [2706144185.py:12:] - question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: could not find exact page for big stone gap film location similar topics big stone gap virginia adriana trigiani dagmara domińczyk big lebowski beach film\u001b[0m\n" - ] - } - ], - "source": [ - "from adalflow.utils.logger import printc\n", - "\n", - "num_questions = 5\n", - "for i in range(num_questions):\n", - " question = val_dataset[i][\"question\"]\n", - " gt_answer = normalize_answer(\n", - " val_dataset[i][\"answer\"]\n", - " ) # normalize the ground truth answer\n", - "\n", - " # get the agent's response\n", - " pred_answer = agent(question)\n", - " pred_answer = normalize_answer(pred_answer)\n", - "\n", - " printc(\n", - " f\"question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", - " color=\"yellow\",\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 6. Evaluation\n", - "\n", - "Now you will see that we have the ``exact correct answer`` for some questions:\n", - "\n", - "question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: ``yes`` pred answer: ``yes``\n", - "\n", - "question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: ``animorphs``, pred answer: ``animorphs``\n", - "\n", - "Sometimes the agent performs correctly but not in the same format with the ground truth. E.g. ground truth: ``no``, pred answer: ``no, they are not the same``. This is what we can tolerate.\n", - "\n", - "But how to evaluate if the agent is doing well, or if our tools, examples, and prompt implementations work well? 
We need to evaluate it.\n", - "\n", - "1. Exact Match(EM)\n", - "Exact Match is what the paper is using. Only when the normalized agent response is the same with the ground truth answer, we count it as correct. The paper's EM for react agent is around 30%(gpt-3).\n", - "\n", - "2. Fuzzy Match(FM)\n", - "EM doesn't make much sense in question and answering. So we propose fuzzy match based on the LLMs' lengthy output nature. If the ground truth answer is included in the agent response, then we count it as correct. FM is not necessarily correct. \n", - "E.g. question: Harry Potter and Dumbledore, who is older? ground truth: ``dumbledore``, pred answer: ``harry potter is older than dumbledore.``\n", - "the model mentioned the groud truth but still provide wrong answer. So FM serves as reference.\n", - "\n", - "Let's use ``LightRAG eval`` module and evaluate on 10 questions and keep the model's practice to set ``max_step`` at `7`." - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[36m2024-06-15 23:15:11 - [react_agent.py:330:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:13 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search Scott Derrickson and Ed Wood, find their nationalities, then determine if they are the same.\n", - "Action 1: search(\"Scott Derrickson\")\n", - "Observation 1: Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012) and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. 
He graduated from Biola University with a B.A.\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:15 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: Scott Derrickson is an American filmmaker. I need to search Ed Wood next to find his nationality.\n", - "Action 2: search(\"Ed Wood\")\n", - "Observation 2: Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, screenwriter, and pulp novelist.. In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:15 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: Scott Derrickson is an American filmmaker and Ed Wood is also American. 
So, they are of the same nationality.\n", - "Action 3: finish(\"yes\")\n", - "Observation 3: yes\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:15 - [react_agent.py:345:call] - answer: yes\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search Scott Derrickson and Ed Wood, find their nationalities, then determine if they are the same.', action='search(\"Scott Derrickson\")', fun_name='search', fun_args=['Scott Derrickson'], fun_kwargs={}, observation='Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012) and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. He graduated from Biola University with a B.A.'), StepOutput(step=2, thought='Scott Derrickson is an American filmmaker. I need to search Ed Wood next to find his nationality.', action='search(\"Ed Wood\")', fun_name='search', fun_args=['Ed Wood'], fun_kwargs={}, observation=\"Edward Davis Wood Jr. (October 10, 1924\\xa0– December 10, 1978) was an American filmmaker, actor, screenwriter, and pulp novelist.. In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. 
Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\"), StepOutput(step=3, thought='Scott Derrickson is an American filmmaker and Ed Wood is also American. So, they are of the same nationality.', action='finish(\"yes\")', fun_name='finish', fun_args=['yes'], fun_kwargs={}, observation='yes')]\n", - "\u001b[33m2024-06-15 23:15:15 - [3641068398.py:26:] - No. 1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: yes\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:15 - [react_agent.py:330:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:17 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to find the woman who portrayed Corliss Archer in the film Kiss and Tell, then search for the government position she held.\n", - "Action 1: search(\"Corliss Archer Kiss and Tell film\")\n", - "Observation 1: Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:19 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.\n", - "Action 2: search(\"Corliss Archer Kiss and Tell film\")\n", - "Observation 2: Could not find exact page for 'Corliss Archer Kiss and Tell film'. 
Similar topics: ['Corliss Archer', 'A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.\",\n", - " \"action\": \"search(\\\"Corliss Archer Kiss and Tell film\\\")\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:15:21 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.\n", - "Action 3: search(\"Corliss Archer Kiss and Tell film\")\n", - "Observation 3: Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:23 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I should search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.\n", - "Action 4: search(\"Corliss Archer film Kiss and Tell cast\")\n", - "Observation 4: Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:25 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I should search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.\n", - "Action 5: search(\"Corliss Archer film Kiss and Tell cast\")\n", - "Observation 5: Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. 
Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:27 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.\n", - "Action 6: search(\"Corliss Archer film Kiss and Tell cast\")\n", - "Observation 6: Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:29 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I should search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.\n", - "Action 7: search(\"Corliss Archer film Kiss and Tell cast\")\n", - "Observation 7: Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:29 - [react_agent.py:345:call] - answer: Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to find the woman who portrayed Corliss Archer in the film Kiss and Tell, then search for the government position she held.', action='search(\"Corliss Archer Kiss and Tell film\")', fun_name='search', fun_args=['Corliss Archer Kiss and Tell film'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer Kiss and Tell film'. 
Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\"), StepOutput(step=2, thought='I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action='search(\"Corliss Archer Kiss and Tell film\")', fun_name='search', fun_args=['Corliss Archer Kiss and Tell film'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['Corliss Archer', 'A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\"), StepOutput(step=3, thought='I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action='search(\"Corliss Archer Kiss and Tell film\")', fun_name='search', fun_args=['Corliss Archer Kiss and Tell film'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer Kiss and Tell film'. Similar topics: ['A Kiss for Corliss', 'Corliss Archer', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Meet Corliss Archer']\"), StepOutput(step=4, thought='I should search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.', action='search(\"Corliss Archer film Kiss and Tell cast\")', fun_name='search', fun_args=['Corliss Archer film Kiss and Tell cast'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\"), StepOutput(step=5, thought='I should search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.', action='search(\"Corliss Archer film Kiss and Tell cast\")', fun_name='search', fun_args=['Corliss Archer film Kiss and Tell cast'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. 
Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\"), StepOutput(step=6, thought='I need to search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.', action='search(\"Corliss Archer film Kiss and Tell cast\")', fun_name='search', fun_args=['Corliss Archer film Kiss and Tell cast'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\"), StepOutput(step=7, thought='I should search for the woman who portrayed Corliss Archer in the film Kiss and Tell to find out the government position she held.', action='search(\"Corliss Archer film Kiss and Tell cast\")', fun_name='search', fun_args=['Corliss Archer film Kiss and Tell cast'], fun_kwargs={}, observation=\"Could not find exact page for 'Corliss Archer film Kiss and Tell cast'. Similar topics: ['A Kiss for Corliss', 'Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Janet Waldo', 'Kathryn Card']\")]\n", - "\u001b[33m2024-06-15 23:15:29 - [3641068398.py:26:] - No. 
2, question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: could not find exact page for corliss archer film kiss and tell cast similar topics kiss for corliss kiss and tell 1945 film kiss and tell play janet waldo kathryn card\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:29 - [react_agent.py:330:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:32 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for a science fantasy young adult series told in first person and involving companion books about enslaved worlds and alien species.\n", - "Action 1: search(\"science fantasy young adult series first person companion books enslaved worlds alien species\")\n", - "Observation 1: Could not find exact page for 'science fantasy young adult series first person companion books enslaved worlds alien species'. Similar topics: ['Animorphs', 'Last Legionary', 'Barsoom', 'List of The Outer Limits (1995 TV series) episodes', 'LGBT themes in speculative fiction']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:34 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The search did not return an exact match. I should search for 'Animorphs' as it involves young adult science fantasy, first-person narration, enslaved worlds, and alien species.\n", - "Action 2: search('Animorphs')\n", - "Observation 2: Animorphs is a science fantasy series of youth books written by Katherine Applegate and her husband Michael Grant,[2] writing together under the name K. A. Applegate,[3] and published by Scholastic.[4] It is told in first person, with all six main characters taking turns narrating the books through their own perspectives. 
Horror, war, imperialism, dehumanization, sanity, morality, innocence, leadership, freedom, family, and growing up are the core themes of the series.. Published between June 1996 and May 2001, the series consists of 54 books and includes ten companion books, eight of which fit into the series' continuity (the Animorphs Chronicles and Megamorphs books) and two that are gamebooks not fitting into the continuity (the Alternamorphs books)..\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:35 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The series that matches the description is Animorphs. I should provide this as the answer.\n", - "Action 3: finish(\"Animorphs\")\n", - "Observation 3: Animorphs\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:35 - [react_agent.py:345:call] - answer: Animorphs\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for a science fantasy young adult series told in first person and involving companion books about enslaved worlds and alien species.', action='search(\"science fantasy young adult series first person companion books enslaved worlds alien species\")', fun_name='search', fun_args=['science fantasy young adult series first person companion books enslaved worlds alien species'], fun_kwargs={}, observation=\"Could not find exact page for 'science fantasy young adult series first person companion books enslaved worlds alien species'. Similar topics: ['Animorphs', 'Last Legionary', 'Barsoom', 'List of The Outer Limits (1995 TV series) episodes', 'LGBT themes in speculative fiction']\"), StepOutput(step=2, thought=\"The search did not return an exact match. 
I should search for 'Animorphs' as it involves young adult science fantasy, first-person narration, enslaved worlds, and alien species.\", action=\"search('Animorphs')\", fun_name='search', fun_args=['Animorphs'], fun_kwargs={}, observation=\"Animorphs is a science fantasy series of youth books written by Katherine Applegate and her husband Michael Grant,[2] writing together under the name K. A. Applegate,[3] and published by Scholastic.[4] It is told in first person, with all six main characters taking turns narrating the books through their own perspectives. Horror, war, imperialism, dehumanization, sanity, morality, innocence, leadership, freedom, family, and growing up are the core themes of the series.. Published between June 1996 and May 2001, the series consists of 54 books and includes ten companion books, eight of which fit into the series' continuity (the Animorphs Chronicles and Megamorphs books) and two that are gamebooks not fitting into the continuity (the Alternamorphs books)..\"), StepOutput(step=3, thought='The series that matches the description is Animorphs. I should provide this as the answer.', action='finish(\"Animorphs\")', fun_name='finish', fun_args=['Animorphs'], fun_kwargs={}, observation='Animorphs')]\n", - "\u001b[33m2024-06-15 23:15:35 - [3641068398.py:26:] - No. 
3, question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: animorphs\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:35 - [react_agent.py:330:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:36 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the locations of Laleli Mosque and Esma Sultan Mansion to determine if they are in the same neighborhood.\n", - "Action 1: search(\"Laleli Mosque\")\n", - "Observation 1: The Laleli Mosque (Turkish: Laleli Camii, lit. 'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. 
The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:37 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the location of Esma Sultan Mansion to determine if it is in the same neighborhood as Laleli Mosque.\n", - "Action 2: search(\"Esma Sultan Mansion\")\n", - "Observation 2: The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. 
It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:39 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to determine if Laleli Mosque and Esma Sultan Mansion are in the same neighborhood by comparing their locations.\n", - "Action 3: finish(\"No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\")\n", - "Observation 3: No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:39 - [react_agent.py:345:call] - answer: No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for the locations of Laleli Mosque and Esma Sultan Mansion to determine if they are in the same neighborhood.', action='search(\"Laleli Mosque\")', fun_name='search', fun_args=['Laleli Mosque'], fun_kwargs={}, observation=\"The Laleli Mosque (Turkish: Laleli Camii, lit.\\u2009'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. 
The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. 
The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\"), StepOutput(step=2, thought='I need to search for the location of Esma Sultan Mansion to determine if it is in the same neighborhood as Laleli Mosque.', action='search(\"Esma Sultan Mansion\")', fun_name='search', fun_args=['Esma Sultan Mansion'], fun_kwargs={}, observation='The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. 
Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].'), StepOutput(step=3, thought='I need to determine if Laleli Mosque and Esma Sultan Mansion are in the same neighborhood by comparing their locations.', action='finish(\"No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.\")', fun_name='finish', fun_args=['No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.'], fun_kwargs={}, observation='No, Laleli Mosque is located in Laleli, Fatih, Istanbul, Turkey, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul, Turkey.')]\n", - "\u001b[33m2024-06-15 23:15:39 - [3641068398.py:26:] - No. 4, question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: no laleli mosque is located in laleli fatih istanbul turkey while esma sultan mansion is in ortaköy neighborhood of istanbul turkey\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:39 - [react_agent.py:330:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:43 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the director of the romantic comedy 'Big Stone Gap' and then find the New York city they are based in.\n", - "Action 1: search('Big Stone Gap film director')\n", - "Observation 1: Could not find exact page for 'Big Stone Gap film director'. Similar topics: ['Big Stone Gap (film)', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'C. C. 
Swiney']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:45 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\n", - "Action 2: search('Big Stone Gap film')\n", - "Observation 2: Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\",\n", - " \"action\": \"search('Big Stone Gap film')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:15:47 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\n", - "Action 3: search('Big Stone Gap film')\n", - "Observation 3: Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\",\n", - " \"action\": \"search('Big Stone Gap film')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:15:49 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\n", - "Action 4: search('Big Stone Gap film')\n", - "Observation 4: Could not find exact page for 'Big Stone Gap film'. 
Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\",\n", - " \"action\": \"search('Big Stone Gap film')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:15:51 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\n", - "Action 5: search('Big Stone Gap film')\n", - "Observation 5: Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\",\n", - " \"action\": \"search('Big Stone Gap film')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:15:53 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\n", - "Action 6: search('Big Stone Gap film')\n", - "Observation 6: Could not find exact page for 'Big Stone Gap film'. 
Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\",\n", - " \"action\": \"search('Big Stone Gap film')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:15:54 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\n", - "Action 7: search('Big Stone Gap film')\n", - "Observation 7: Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:54 - [react_agent.py:345:call] - answer: Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\u001b[0m\n", - "step_history: [StepOutput(step=1, thought=\"I need to search for the director of the romantic comedy 'Big Stone Gap' and then find the New York city they are based in.\", action=\"search('Big Stone Gap film director')\", fun_name='search', fun_args=['Big Stone Gap film director'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film director'. Similar topics: ['Big Stone Gap (film)', 'Adriana Trigiani', 'Dagmara Domińczyk', 'The Big Lebowski', 'C. C. Swiney']\"), StepOutput(step=2, thought=\"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\", action=\"search('Big Stone Gap film')\", fun_name='search', fun_args=['Big Stone Gap film'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film'. 
Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\"), StepOutput(step=3, thought=\"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\", action=\"search('Big Stone Gap film')\", fun_name='search', fun_args=['Big Stone Gap film'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\"), StepOutput(step=4, thought=\"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\", action=\"search('Big Stone Gap film')\", fun_name='search', fun_args=['Big Stone Gap film'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\"), StepOutput(step=5, thought=\"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\", action=\"search('Big Stone Gap film')\", fun_name='search', fun_args=['Big Stone Gap film'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\"), StepOutput(step=6, thought=\"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\", action=\"search('Big Stone Gap film')\", fun_name='search', fun_args=['Big Stone Gap film'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film'. 
Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\"), StepOutput(step=7, thought=\"I need to search for the film 'Big Stone Gap' and find the New York city it is based in.\", action=\"search('Big Stone Gap film')\", fun_name='search', fun_args=['Big Stone Gap film'], fun_kwargs={}, observation=\"Could not find exact page for 'Big Stone Gap film'. Similar topics: ['Big Stone Gap (film)', 'Big Stone Gap, Virginia', 'Adriana Trigiani', 'Dagmara Domińczyk', 'Darién Gap']\")]\n", - "\u001b[33m2024-06-15 23:15:54 - [3641068398.py:26:] - No. 5, question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: could not find exact page for big stone gap film similar topics big stone gap film big stone gap virginia adriana trigiani dagmara domińczyk darién gap\u001b[0m\n", - "\u001b[36m2024-06-15 23:15:54 - [react_agent.py:330:call] - input_query: 2014 S/S is the debut album of a South Korean boy group that was formed by who?\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:57 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the debut album 2014 S/S of a South Korean boy group and find out who formed the group.\n", - "Action 1: search(\"2014 S/S South Korean boy group debut album\")\n", - "Observation 1: Could not find exact page for '2014 S/S South Korean boy group debut album'. 
Similar topics: ['2014 S/S', 'S.Coups', 'S (South Korean band)', 'List of South Korean boy bands', 'Kara (South Korean group)']\u001b[0m\n", - "\u001b[34m2024-06-15 23:15:58 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: To find out who formed the group, I need to search for the South Korean boy group's debut album 2014 S/S and look for information on the group's formation.\n", - "Action 2: search('S (South Korean band)')\n", - "Observation 2: S (에스) is a South Korean project group consisting of three members: Kangta, Lee Ji-hoon and Shin Hye-sung. The group debuted in 2003, under the SM Entertainment label. After 11 years, they released and promoted another mini-album in 2014.. In 2003, S released their first album Fr.In.Cl, which stands for Friends in Classic. In 2014, after more than a decade in hiatus, the group released their second mini-album Autumn Breeze on October 27.\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"To find out who formed the South Korean boy group that debuted with the album 2014 S/S, I need to search for the album's details and look for information on the group's formation.\",\n", - " \"action\": \"search('2014 S/S (album)')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:16:00 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: To find out who formed the South Korean boy group that debuted with the album 2014 S/S, I need to search for the album's details and look for information on the group's formation.\n", - "Action 3: search('2014 S/S (album)')\n", - "Observation 3: 2014 S/S is the debut album of South Korean group Winner. It was released on August 12, 2014, by the group's record label, YG Entertainment. The members were credited for writing the lyrics and composing the majority of the album's songs.[1]. 
The members produced the majority of the music for this album themselves, with the help of other producers such as Choice 37, B.I., Airplay, and others.[2] The album was highlighted for incorporating elements generally absent from K-pop releases, including hints of acoustic and alternative rock.[3][4]. The title track \"Empty\" became a hit in South Korea, topping the Gaon Chart and Billboard's K-Pop Hot 100 chart.[5][6] Internationally, 2014 S/S took the top spot in Billboard's World Album Chart.[7].\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"To find out who formed the South Korean boy group Winner that debuted with the album 2014 S/S, I need to search for Winner's formation details.\",\n", - " \"action\": \"search('Winner (South Korean band)')\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:16:02 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: To find out who formed the South Korean boy group Winner that debuted with the album 2014 S/S, I need to search for Winner's formation details.\n", - "Action 4: search('Winner (South Korean band)')\n", - "Observation 4: Winner (Korean: 위너; romanized: wineo; stylized in all caps) is a South Korean boy band consisting of four members: Jinu, Hoony, Mino and Seungyoon. The band was formed in 2013 by YG Entertainment through Mnet's survival program WIN: Who Is Next leading to their official debut on August 17, 2014, with studio album 2014 S/S. Originally a five-piece band, Taehyun departed in November 2016 for his mental health and well-being. Upon their debut, The Korea Herald cited their immediate success as \"unprecedented\" for a new group, thus dubbed as \"Monster rookies\". 
Winner is often cited as a \"self-producing\" band with all members actively taking on roles from producing and choreographing to designing and marketing.\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:03 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: To find out who formed the South Korean boy group Winner that debuted with the album 2014 S/S, I need to provide the name of the group that formed Winner.\n", - "Action 5: finish(\"YG Entertainment\")\n", - "Observation 5: YG Entertainment\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:03 - [react_agent.py:345:call] - answer: YG Entertainment\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for the debut album 2014 S/S of a South Korean boy group and find out who formed the group.', action='search(\"2014 S/S South Korean boy group debut album\")', fun_name='search', fun_args=['2014 S/S South Korean boy group debut album'], fun_kwargs={}, observation=\"Could not find exact page for '2014 S/S South Korean boy group debut album'. Similar topics: ['2014 S/S', 'S.Coups', 'S (South Korean band)', 'List of South Korean boy bands', 'Kara (South Korean group)']\"), StepOutput(step=2, thought=\"To find out who formed the group, I need to search for the South Korean boy group's debut album 2014 S/S and look for information on the group's formation.\", action=\"search('S (South Korean band)')\", fun_name='search', fun_args=['S (South Korean band)'], fun_kwargs={}, observation='S (에스) is a South Korean project group consisting of three members: Kangta, Lee Ji-hoon and Shin Hye-sung. The group debuted in 2003, under the SM Entertainment label. After 11 years, they released and promoted another mini-album in 2014.. In 2003, S released their first album Fr.In.Cl, which stands for Friends in Classic. 
In 2014, after more than a decade in hiatus, the group released their second mini-album Autumn Breeze on October 27.'), StepOutput(step=3, thought=\"To find out who formed the South Korean boy group that debuted with the album 2014 S/S, I need to search for the album's details and look for information on the group's formation.\", action=\"search('2014 S/S (album)')\", fun_name='search', fun_args=['2014 S/S (album)'], fun_kwargs={}, observation='2014 S/S is the debut album of South Korean group Winner. It was released on August 12, 2014, by the group\\'s record label, YG Entertainment. The members were credited for writing the lyrics and composing the majority of the album\\'s songs.[1]. The members produced the majority of the music for this album themselves, with the help of other producers such as Choice 37, B.I., Airplay, and others.[2] The album was highlighted for incorporating elements generally absent from K-pop releases, including hints of acoustic and alternative rock.[3][4]. The title track \"Empty\" became a hit in South Korea, topping the Gaon Chart and Billboard\\'s K-Pop Hot 100 chart.[5][6] Internationally, 2014 S/S took the top spot in Billboard\\'s World Album Chart.[7].'), StepOutput(step=4, thought=\"To find out who formed the South Korean boy group Winner that debuted with the album 2014 S/S, I need to search for Winner's formation details.\", action=\"search('Winner (South Korean band)')\", fun_name='search', fun_args=['Winner (South Korean band)'], fun_kwargs={}, observation='Winner (Korean:\\xa0위너; romanized: wineo; stylized in all caps) is a South Korean boy band consisting of four members: Jinu, Hoony, Mino and Seungyoon. The band was formed in 2013 by YG Entertainment through Mnet\\'s survival program WIN: Who Is Next leading to their official debut on August 17, 2014, with studio album 2014 S/S. Originally a five-piece band, Taehyun departed in November 2016 for his mental health and well-being. 
Upon their debut, The Korea Herald cited their immediate success as \"unprecedented\" for a new group, thus dubbed as \"Monster rookies\". Winner is often cited as a \"self-producing\" band with all members actively taking on roles from producing and choreographing to designing and marketing.'), StepOutput(step=5, thought='To find out who formed the South Korean boy group Winner that debuted with the album 2014 S/S, I need to provide the name of the group that formed Winner.', action='finish(\"YG Entertainment\")', fun_name='finish', fun_args=['YG Entertainment'], fun_kwargs={}, observation='YG Entertainment')]\n", - "\u001b[33m2024-06-15 23:16:03 - [3641068398.py:26:] - No. 6, question: 2014 S/S is the debut album of a South Korean boy group that was formed by who?, ground truth: yg entertainment, pred answer: yg entertainment\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:03 - [react_agent.py:330:call] - input_query: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:05 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to find the person known by the stage name Aladin and who worked as a consultant to help organizations improve their performance.\n", - "Action 1: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 1: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:07 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I should try searching for Aladin's stage name and his work as a consultant to help organizations improve their performance.\n", - "Action 2: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 2: Could not find exact page for 'Aladin stage name consultant organizations performance'. 
Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:08 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.\n", - "Action 3: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 3: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:10 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.\n", - "Action 4: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 4: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:12 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.\n", - "Action 5: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 5: Could not find exact page for 'Aladin stage name consultant organizations performance'. 
Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:13 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.\n", - "Action 6: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 6: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:15 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.\n", - "Action 7: search(\"Aladin stage name consultant organizations performance\")\n", - "Observation 7: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:15 - [react_agent.py:345:call] - answer: Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to find the person known by the stage name Aladin and who worked as a consultant to help organizations improve their performance.', action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. 
Similar topics: ['2009 in film', 'Bret Gilliam']\"), StepOutput(step=2, thought=\"I should try searching for Aladin's stage name and his work as a consultant to help organizations improve their performance.\", action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\"), StepOutput(step=3, thought='I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.', action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\"), StepOutput(step=4, thought='I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.', action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. 
Similar topics: ['2009 in film', 'Bret Gilliam']\"), StepOutput(step=5, thought='I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.', action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\"), StepOutput(step=6, thought='I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.', action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\"), StepOutput(step=7, thought='I need to search for the person known by the stage name Aladin and find out about his work as a consultant to help organizations improve their performance.', action='search(\"Aladin stage name consultant organizations performance\")', fun_name='search', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation=\"Could not find exact page for 'Aladin stage name consultant organizations performance'. Similar topics: ['2009 in film', 'Bret Gilliam']\")]\n", - "\u001b[33m2024-06-15 23:16:15 - [3641068398.py:26:] - No. 
7, question: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?, ground truth: eenasul fateh, pred answer: could not find exact page for aladin stage name consultant organizations performance similar topics 2009 in film bret gilliam\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:15 - [react_agent.py:330:call] - input_query: The arena where the Lewiston Maineiacs played their home games can seat how many people?\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:17 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the arena where the Lewiston Maineiacs played their home games and find out its seating capacity.\n", - "Action 1: search(\"Lewiston Maineiacs arena\")\n", - "Observation 1: Could not find exact page for 'Lewiston Maineiacs arena'. Similar topics: ['Lewiston Maineiacs', 'The Colisée', 'Centre Agnico Eagle', 'Sherbrooke Castors', 'Trois-Rivières Draveurs']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:18 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the seating capacity of the arena where the Lewiston Maineiacs played their home games.\n", - "Action 2: search(\"The Colisée seating capacity\")\n", - "Observation 2: Could not find exact page for 'The Colisée seating capacity'. Similar topics: ['Colisée de Québec', 'List of ice hockey arenas by capacity', 'List of indoor arenas in the United States', 'Colosseum', 'List of indoor arenas in Canada']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:20 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I need to search for the seating capacity of the Colisée de Québec where the Lewiston Maineiacs played their home games.\n", - "Action 3: search(\"Colisée de Québec seating capacity\")\n", - "Observation 3: Could not find exact page for 'Colisée de Québec seating capacity'. 
Similar topics: ['Colisée de Québec', 'List of ice hockey arenas by capacity', 'Videotron Centre', 'List of indoor arenas by capacity', 'List of indoor arenas in Canada']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:22 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.\n", - "Action 4: search(\"Videotron Centre seating capacity\")\n", - "Observation 4: Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:24 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.\n", - "Action 5: search(\"Videotron Centre seating capacity\")\n", - "Observation 5: Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:25 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.\n", - "Action 6: search(\"Videotron Centre seating capacity\")\n", - "Observation 6: Could not find exact page for 'Videotron Centre seating capacity'. 
Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:27 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.\n", - "Action 7: search(\"Videotron Centre seating capacity\")\n", - "Observation 7: Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:27 - [react_agent.py:345:call] - answer: Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for the arena where the Lewiston Maineiacs played their home games and find out its seating capacity.', action='search(\"Lewiston Maineiacs arena\")', fun_name='search', fun_args=['Lewiston Maineiacs arena'], fun_kwargs={}, observation=\"Could not find exact page for 'Lewiston Maineiacs arena'. Similar topics: ['Lewiston Maineiacs', 'The Colisée', 'Centre Agnico Eagle', 'Sherbrooke Castors', 'Trois-Rivières Draveurs']\"), StepOutput(step=2, thought='I need to search for the seating capacity of the arena where the Lewiston Maineiacs played their home games.', action='search(\"The Colisée seating capacity\")', fun_name='search', fun_args=['The Colisée seating capacity'], fun_kwargs={}, observation=\"Could not find exact page for 'The Colisée seating capacity'. 
Similar topics: ['Colisée de Québec', 'List of ice hockey arenas by capacity', 'List of indoor arenas in the United States', 'Colosseum', 'List of indoor arenas in Canada']\"), StepOutput(step=3, thought='I need to search for the seating capacity of the Colisée de Québec where the Lewiston Maineiacs played their home games.', action='search(\"Colisée de Québec seating capacity\")', fun_name='search', fun_args=['Colisée de Québec seating capacity'], fun_kwargs={}, observation=\"Could not find exact page for 'Colisée de Québec seating capacity'. Similar topics: ['Colisée de Québec', 'List of ice hockey arenas by capacity', 'Videotron Centre', 'List of indoor arenas by capacity', 'List of indoor arenas in Canada']\"), StepOutput(step=4, thought='I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.', action='search(\"Videotron Centre seating capacity\")', fun_name='search', fun_args=['Videotron Centre seating capacity'], fun_kwargs={}, observation=\"Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\"), StepOutput(step=5, thought='I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.', action='search(\"Videotron Centre seating capacity\")', fun_name='search', fun_args=['Videotron Centre seating capacity'], fun_kwargs={}, observation=\"Could not find exact page for 'Videotron Centre seating capacity'. 
Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\"), StepOutput(step=6, thought='I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.', action='search(\"Videotron Centre seating capacity\")', fun_name='search', fun_args=['Videotron Centre seating capacity'], fun_kwargs={}, observation=\"Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\"), StepOutput(step=7, thought='I need to search for the seating capacity of the Videotron Centre where the Lewiston Maineiacs played their home games.', action='search(\"Videotron Centre seating capacity\")', fun_name='search', fun_args=['Videotron Centre seating capacity'], fun_kwargs={}, observation=\"Could not find exact page for 'Videotron Centre seating capacity'. Similar topics: ['Videotron Centre', 'List of ice hockey arenas by capacity', 'Colisée de Québec', 'List of indoor arenas by capacity', 'Potential National Hockey League expansion']\")]\n", - "\u001b[33m2024-06-15 23:16:27 - [3641068398.py:26:] - No. 
8, question: The arena where the Lewiston Maineiacs played their home games can seat how many people?, ground truth: 3677 seated, pred answer: could not find exact page for videotron centre seating capacity similar topics videotron centre list of ice hockey arenas by capacity colisée de québec list of indoor arenas by capacity potential national hockey league expansion\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:27 - [react_agent.py:330:call] - input_query: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:28 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for the ages of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 1: search(\"Annie Morton\")\n", - "Observation 1: Annie Morton (born October 8, 1970) is an American model[1] born in Pennsylvania.[2] She has appeared on the covers of British Vogue, ID, Marie Claire, and other magazines. She has been photographed by Helmut Newton; Peter Lindbergh; Annie Leibovitz; Richard Avedon; Juergen Teller;[3] Paul Jasmin, Mary Ellen Mark, Stephen Shames, and Terry Richardson, and modeled for Donna Karan,[4] Givenchy, Guerlain, Chanel, Harper's Bazaar, Sports Illustrated and Victoria's Secret.[5] A long time vegetarian, an advocate for organic lifestyle choices and natural healthcare. She co-founded Tsi-La Organics, a \"Green Luxury\" company that creates and sells vegan, organic perfume and skin care products.[6]. She has appeared on many magazine covers and has been featured in several professional photobooks, Peter Lindbergh Selected Work, the cover of Juergen Teller By Juergen Teller, Helmut Newton Pages From The Glossies, and Donna Karan's book A Journey Of A Woman 20 Years.[7] In 1997 she posed for the Pirelli Calendar with Richard Avedon.. 
Morton has also appeared in several music videos.\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:29 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for the age of Terry Richardson to compare it with Annie Morton's age.\n", - "Action 2: search(\"Terry Richardson\")\n", - "Observation 2: Terrence Richardson (born August 14, 1965) is an American fashion and portrait photographer. He has shot advertising campaigns for Marc Jacobs, Aldo, Supreme, Sisley, Tom Ford, and Yves Saint Laurent among others, and also done work for magazines such as Rolling Stone, GQ, Vogue, Vanity Fair, Harper's Bazaar, i-D, and Vice.. Since 2001, Richardson has been accused by multiple models of sexual misconduct, although numerous other models have defended him.[2][3][4][5] In 2017, brands and magazines that had worked with Richardson in the past began distancing themselves from him, and said they would no longer employ him.[6] He has not actively worked as a photographer since 2018.[7]. Richardson was born in New York City, the son of Norma Kessler, an actress,[8][9] and Bob Richardson, a fashion photographer who struggled with schizophrenia and drug abuse.[10] Richardson's father was Irish Catholic and his mother is Jewish.[11] Following the divorce of his parents, Richardson moved to Woodstock, New York, with his mother and stepfather, English guitarist Jackie Lomax.[8] Richardson later moved to the Hollywood neighborhood of Los Angeles, where he attended Hollywood High School.[12]. 
He moved with his mother to Ojai, California, where he attended Nordhoff High School, when he was 16.[13] Richardson originally wanted to be a punk rock musician rather than a photographer.[13] He played bass guitar in the punk rock band The Invisible Government for four years.[14] He played bass for a variety of other punk bands in Southern California including Signal Street Alcoholics, Doggy Style, Baby Fist and Middle Finger.[8][15].\u001b[0m\n", - "Error executing finish(\"Annie Morton\") if born before Terry Richardson else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\n", - "\u001b[34m2024-06-15 23:16:31 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I have found the birth dates of Annie Morton and Terry Richardson. Now, I need to calculate who is older.\n", - "Action 3: finish(\"Annie Morton\") if born before Terry Richardson else finish(\"Terry Richardson\")\n", - "Observation 3: Error executing finish(\"Annie Morton\") if born before Terry Richardson else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\u001b[0m\n", - "Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\n", - "\u001b[34m2024-06-15 23:16:32 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I have found the birth dates of Annie Morton and Terry Richardson. 
Now, I need to calculate who is older.\n", - "Action 4: finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")\n", - "Observation 4: Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\u001b[0m\n", - "Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\n", - "\u001b[34m2024-06-15 23:16:33 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I have found the birth dates of Annie Morton and Terry Richardson. Now, I need to calculate who is older.\n", - "Action 5: finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")\n", - "Observation 5: Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\u001b[0m\n", - "Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\n", - "\u001b[34m2024-06-15 23:16:34 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I have found the birth dates of Annie Morton and Terry Richardson. 
Now, I need to calculate who is older.\n", - "Action 6: finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")\n", - "Observation 6: Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\u001b[0m\n", - "Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\n", - "\u001b[34m2024-06-15 23:16:35 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I need to compare the birth dates of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 7: finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")\n", - "Observation 7: Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:35 - [react_agent.py:345:call] - answer: Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected 'else' after 'if' expression (, line 1)\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for the ages of Annie Morton and Terry Richardson to determine who is older.', action='search(\"Annie Morton\")', fun_name='search', fun_args=['Annie Morton'], fun_kwargs={}, observation='Annie Morton (born October 8, 1970) is an American model[1] born in Pennsylvania.[2] She has appeared on the covers of British Vogue, ID, Marie Claire, and other magazines. 
She has been photographed by Helmut Newton; Peter Lindbergh; Annie Leibovitz; Richard Avedon; Juergen Teller;[3] Paul Jasmin, Mary Ellen Mark, Stephen Shames, and Terry Richardson, and modeled for Donna Karan,[4] Givenchy, Guerlain, Chanel, Harper\\'s Bazaar, Sports Illustrated and Victoria\\'s Secret.[5] A long time vegetarian, an advocate for organic lifestyle choices and natural healthcare. She co-founded Tsi-La Organics, a \"Green Luxury\" company that creates and sells vegan, organic perfume and skin care products.[6]. She has appeared on many magazine covers and has been featured in several professional photobooks, Peter Lindbergh Selected Work, the cover of Juergen Teller By Juergen Teller, Helmut Newton Pages From The Glossies, and Donna Karan\\'s book A Journey Of A Woman 20 Years.[7] In 1997 she posed for the Pirelli Calendar with Richard Avedon.. Morton has also appeared in several music videos.'), StepOutput(step=2, thought=\"I need to search for the age of Terry Richardson to compare it with Annie Morton's age.\", action='search(\"Terry Richardson\")', fun_name='search', fun_args=['Terry Richardson'], fun_kwargs={}, observation=\"Terrence Richardson (born August 14, 1965) is an American fashion and portrait photographer. He has shot advertising campaigns for Marc Jacobs, Aldo, Supreme, Sisley, Tom Ford, and Yves Saint Laurent among others, and also done work for magazines such as Rolling Stone, GQ, Vogue, Vanity Fair, Harper's Bazaar, i-D, and Vice.. Since 2001, Richardson has been accused by multiple models of sexual misconduct, although numerous other models have defended him.[2][3][4][5] In 2017, brands and magazines that had worked with Richardson in the past began distancing themselves from him, and said they would no longer employ him.[6] He has not actively worked as a photographer since 2018.[7]. 
Richardson was born in New York City, the son of Norma Kessler, an actress,[8][9] and Bob Richardson, a fashion photographer who struggled with schizophrenia and drug abuse.[10] Richardson's father was Irish Catholic and his mother is Jewish.[11] Following the divorce of his parents, Richardson moved to Woodstock, New York, with his mother and stepfather, English guitarist Jackie Lomax.[8] Richardson later moved to the Hollywood neighborhood of Los Angeles, where he attended Hollywood High School.[12]. He moved with his mother to Ojai, California, where he attended Nordhoff High School, when he was 16.[13] Richardson originally wanted to be a punk rock musician rather than a photographer.[13] He played bass guitar in the punk rock band The Invisible Government for four years.[14] He played bass for a variety of other punk bands in Southern California including Signal Street Alcoholics, Doggy Style, Baby Fist and Middle Finger.[8][15].\"), StepOutput(step=3, thought='I have found the birth dates of Annie Morton and Terry Richardson. Now, I need to calculate who is older.', action='finish(\"Annie Morton\") if born before Terry Richardson else finish(\"Terry Richardson\")', fun_name=None, fun_args=None, fun_kwargs=None, observation='Error executing finish(\"Annie Morton\") if born before Terry Richardson else finish(\"Terry Richardson\"): expected \\'else\\' after \\'if\\' expression (, line 1)'), StepOutput(step=4, thought='I have found the birth dates of Annie Morton and Terry Richardson. Now, I need to calculate who is older.', action='finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")', fun_name=None, fun_args=None, fun_kwargs=None, observation='Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected \\'else\\' after \\'if\\' expression (, line 1)'), StepOutput(step=5, thought='I have found the birth dates of Annie Morton and Terry Richardson. 
Now, I need to calculate who is older.', action='finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")', fun_name=None, fun_args=None, fun_kwargs=None, observation='Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected \\'else\\' after \\'if\\' expression (, line 1)'), StepOutput(step=6, thought='I have found the birth dates of Annie Morton and Terry Richardson. Now, I need to calculate who is older.', action='finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")', fun_name=None, fun_args=None, fun_kwargs=None, observation='Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected \\'else\\' after \\'if\\' expression (, line 1)'), StepOutput(step=7, thought='I need to compare the birth dates of Annie Morton and Terry Richardson to determine who is older.', action='finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\")', fun_name=None, fun_args=None, fun_kwargs=None, observation='Error executing finish(\"Annie Morton\") if born before Terry Richardson, else finish(\"Terry Richardson\"): expected \\'else\\' after \\'if\\' expression (, line 1)')]\n", - "\u001b[33m2024-06-15 23:16:35 - [3641068398.py:26:] - No. 
9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: terry richardson, pred answer: error executing finishannie morton if born before terry richardson else finishterry richardson expected else after if expression unknown line 1\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:35 - [react_agent.py:330:call] - input_query: Are Local H and For Against both from the United States?\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:36 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I need to search for Local H and For Against to find out if they are both from the United States.\n", - "Action 1: search(\"Local H\")\n", - "Observation 1: Local H is an American rock band originally formed by guitarist and vocalist Scott Lucas, bassist Joe Pletcher, drummer Joe Daniels, and lead guitarist John Sparkman in Zion, Illinois in 1990. The members all met in high school in 1987 and founded Local H three years later. After Sparkman's departure in 1991 and Garcia's departure in 1993, Local H continued as an unorthodox two-piece setup.. Local H signed a record contract with Island Records in 1994, where they would go on to release three albums. The band's debut album, Ham Fisted (1995), was not a success and the band was nearly dropped, but the band remained on the label long enough to release their second album As Good as Dead (1996).\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:39 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I need to search for For Against to find out if they are from the United States.\n", - "Action 2: search(\"For Against\")\n", - "Observation 2: For Against is an American post-punk/dream pop band from Lincoln, Nebraska, United States. Despite numerous lineup shuffles and some periods of dormancy, the band has produced material steadily since 1984.. 
Something of an anomaly in Midwestern America, For Against—especially early in its career—has looked to a distinctly British aesthetic for inspiration, most obviously the post-punk scene exemplified by the early rosters of the Factory and 4AD record labels.. Featuring members of underground Lincoln bands Cartoon Pupils and Hymn to Joy, the band first called itself Glue, and then D.B.L., finally settling on Four Against One,[1] a name that reflected its status as a quintet at the time. Bassist Liz Panarelli's early departure gave way to keyboardist Jeffrey Runnings' shift to bass, as well as the band's truncation of its name to For Against.\u001b[0m\n", - "\u001b[34m2024-06-15 23:16:40 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: Both Local H and For Against are from the United States.\n", - "Action 3: finish(\"Yes, both Local H and For Against are from the United States.\")\n", - "Observation 3: Yes, both Local H and For Against are from the United States.\u001b[0m\n", - "\u001b[36m2024-06-15 23:16:40 - [react_agent.py:345:call] - answer: Yes, both Local H and For Against are from the United States.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I need to search for Local H and For Against to find out if they are both from the United States.', action='search(\"Local H\")', fun_name='search', fun_args=['Local H'], fun_kwargs={}, observation=\"Local H is an American rock band originally formed by guitarist and vocalist Scott Lucas, bassist Joe Pletcher, drummer Joe Daniels, and lead guitarist John Sparkman in Zion, Illinois in 1990. The members all met in high school in 1987 and founded Local H three years later. After Sparkman's departure in 1991 and Garcia's departure in 1993, Local H continued as an unorthodox two-piece setup.. Local H signed a record contract with Island Records in 1994, where they would go on to release three albums. 
The band's debut album, Ham Fisted (1995), was not a success and the band was nearly dropped, but the band remained on the label long enough to release their second album As Good as Dead (1996).\"), StepOutput(step=2, thought='I need to search for For Against to find out if they are from the United States.', action='search(\"For Against\")', fun_name='search', fun_args=['For Against'], fun_kwargs={}, observation=\"For Against is an American post-punk/dream pop band from Lincoln, Nebraska, United States. Despite numerous lineup shuffles and some periods of dormancy, the band has produced material steadily since 1984.. Something of an anomaly in Midwestern America, For Against—especially early in its career—has looked to a distinctly British aesthetic for inspiration, most obviously the post-punk scene exemplified by the early rosters of the Factory and 4AD record labels.. Featuring members of underground Lincoln bands Cartoon Pupils and Hymn to Joy, the band first called itself Glue, and then D.B.L., finally settling on Four Against One,[1] a name that reflected its status as a quintet at the time. Bassist Liz Panarelli's early departure gave way to keyboardist Jeffrey Runnings' shift to bass, as well as the band's truncation of its name to For Against.\"), StepOutput(step=3, thought='Both Local H and For Against are from the United States.', action='finish(\"Yes, both Local H and For Against are from the United States.\")', fun_name='finish', fun_args=['Yes, both Local H and For Against are from the United States.'], fun_kwargs={}, observation='Yes, both Local H and For Against are from the United States.')]\n", - "\u001b[33m2024-06-15 23:16:40 - [3641068398.py:26:] - No. 
10, question: Are Local H and For Against both from the United States?, ground truth: yes, pred answer: yes both local h and for against are from united states\u001b[0m\n", - "EM = (0.3, [1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]), FM = (0.6, [1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0]), average time = 8.897640180587768\n" - ] - } - ], - "source": [ - "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n", - "\n", - "# set up evaluation type\n", - "EM_evaluator = AnswerMatchAcc(type=\"exact_match\")\n", - "FM_evaluator = AnswerMatchAcc(type=\"fuzzy_match\")\n", - "\n", - "agent = ReActAgent(\n", - " tools=tools,\n", - " max_steps=7,\n", - " model_client=OpenAIClient(),\n", - " model_kwargs=gpt_model_kwargs,\n", - " preset_prompt_kwargs=preset_prompt_kwargs,\n", - ")\n", - "\n", - "num_questions = 10\n", - "gt_answers = []\n", - "pred_answers = []\n", - "start_time = time.time()\n", - "for i in range(num_questions):\n", - " question = val_dataset[i][\"question\"]\n", - " gt_answer = normalize_answer(\n", - " val_dataset[i][\"answer\"]\n", - " ) # normalize the ground truth answer\n", - " gt_answers.append(gt_answer)\n", - "\n", - " # get the agent's response\n", - " pred_answer = agent(question)\n", - " pred_answer = normalize_answer(pred_answer)\n", - " pred_answers.append(pred_answer)\n", - "\n", - " printc(\n", - " f\"No. {i+1}, question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", - " color=\"yellow\",\n", - " )\n", - "\n", - "end_time = time.time()\n", - "\n", - "em = EM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", - "fm = FM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", - "avg_time = (end_time - start_time) / num_questions\n", - "\n", - "print(f\"EM = {em}, FM = {fm}, average time = {avg_time}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The EM is 0.3 and the FM is 0.6. Each query takes 9s in average. 
(The performance also depends on the success of wikipedia query connection.)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What if we use simple LLM models to answer these questions? To test on this, we just need to remove the tools. We have a built-in ``llm_tool`` and ``finish`` that automatically handles the query. ``llm_tool`` uses the same model with the agent. We can't add the examples because the examples will mislead the agent to use non-existing tools." - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[36m2024-06-15 23:17:00 - [react_agent.py:330:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:01 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The nationality of Scott Derrickson and Ed Wood needs to be determined.\n", - "Action 1: llm_tool(\"What is the nationality of Scott Derrickson and Ed Wood?\")\n", - "Observation 1: What is the nationality of Scott Derrickson and Ed Wood?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:03 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is asking if Scott Derrickson and Ed Wood were of the same nationality.\n", - "Action 2: llm_tool(\"Were Scott Derrickson and Ed Wood of the same nationality?\")\n", - "Observation 2: No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:04 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The user query has been successfully answered.\n", - "Action 3: finish(\"No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.\")\n", - "Observation 3: No, Scott Derrickson and Ed Wood were not of the same nationality. 
Scott Derrickson is American, while Ed Wood was also American.\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:04 - [react_agent.py:345:call] - answer: No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='The nationality of Scott Derrickson and Ed Wood needs to be determined.', action='llm_tool(\"What is the nationality of Scott Derrickson and Ed Wood?\")', fun_name='llm_tool', fun_args=['What is the nationality of Scott Derrickson and Ed Wood?'], fun_kwargs={}, observation='What is the nationality of Scott Derrickson and Ed Wood?'), StepOutput(step=2, thought='The user is asking if Scott Derrickson and Ed Wood were of the same nationality.', action='llm_tool(\"Were Scott Derrickson and Ed Wood of the same nationality?\")', fun_name='llm_tool', fun_args=['Were Scott Derrickson and Ed Wood of the same nationality?'], fun_kwargs={}, observation='No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.'), StepOutput(step=3, thought='The user query has been successfully answered.', action='finish(\"No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.\")', fun_name='finish', fun_args=['No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.'], fun_kwargs={}, observation='No, Scott Derrickson and Ed Wood were not of the same nationality. Scott Derrickson is American, while Ed Wood was also American.')]\n", - "\u001b[33m2024-06-15 23:17:04 - [3230041225.py:26:] - No. 
1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: no scott derrickson and ed wood were not of same nationality scott derrickson is american while ed wood was also american\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:04 - [react_agent.py:330:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:06 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I will use llm_tool to answer this question.\n", - "Action 1: llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")\n", - "Observation 1: Who is the woman who portrayed Corliss Archer in the film Kiss and Tell and also held the government position of U.S. Representative?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:08 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I will use llm_tool to quickly find the answer.\n", - "Action 2: llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")\n", - "Observation 2: The woman who portrayed Corliss Archer in the film Kiss and Tell was Shirley Temple. She later held the government position of U.S. Representative.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:12 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I will use llm_tool to quickly provide the answer.\n", - "Action 3: llm_tool(\"Shirley Temple held the government position of U.S. Representative.\")\n", - "Observation 3: Shirley Temple did not hold the government position of U.S. Representative. She was a famous child actress and diplomat, but she did not serve as a U.S. 
Representative.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:13 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I will use llm_tool to quickly find the answer.\n", - "Action 4: llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")\n", - "Observation 4: The woman who portrayed Corliss Archer in the film Kiss and Tell was Shirley Temple. She later held the government position of U.S. Representative.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:15 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I will use llm_tool to quickly provide the answer.\n", - "Action 5: llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")\n", - "Observation 5: Who is the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:17 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I will use llm_tool to quickly find the answer to the user's query.\n", - "Action 6: llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")\n", - "Observation 6: Who is the woman who portrayed Corliss Archer in the film Kiss and Tell and also held the government position of U.S. Representative?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:18 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I will use llm_tool to quickly find the answer to the user's query.\n", - "Action 7: llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. 
Representative.\")\n", - "Observation 7: Who is the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:18 - [react_agent.py:345:call] - answer: Who is the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I will use llm_tool to answer this question.', action='llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.'], fun_kwargs={}, observation='Who is the woman who portrayed Corliss Archer in the film Kiss and Tell and also held the government position of U.S. Representative?'), StepOutput(step=2, thought='I will use llm_tool to quickly find the answer.', action='llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.'], fun_kwargs={}, observation='The woman who portrayed Corliss Archer in the film Kiss and Tell was Shirley Temple. She later held the government position of U.S. Representative.'), StepOutput(step=3, thought='I will use llm_tool to quickly provide the answer.', action='llm_tool(\"Shirley Temple held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['Shirley Temple held the government position of U.S. Representative.'], fun_kwargs={}, observation='Shirley Temple did not hold the government position of U.S. Representative. She was a famous child actress and diplomat, but she did not serve as a U.S. 
Representative.'), StepOutput(step=4, thought='I will use llm_tool to quickly find the answer.', action='llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.'], fun_kwargs={}, observation='The woman who portrayed Corliss Archer in the film Kiss and Tell was Shirley Temple. She later held the government position of U.S. Representative.'), StepOutput(step=5, thought='I will use llm_tool to quickly provide the answer.', action='llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.'], fun_kwargs={}, observation='Who is the woman who portrayed Corliss Archer in the film Kiss and Tell?'), StepOutput(step=6, thought=\"I will use llm_tool to quickly find the answer to the user's query.\", action='llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.'], fun_kwargs={}, observation='Who is the woman who portrayed Corliss Archer in the film Kiss and Tell and also held the government position of U.S. Representative?'), StepOutput(step=7, thought=\"I will use llm_tool to quickly find the answer to the user's query.\", action='llm_tool(\"The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. Representative.\")', fun_name='llm_tool', fun_args=['The woman who portrayed Corliss Archer in the film Kiss and Tell held the government position of U.S. 
Representative.'], fun_kwargs={}, observation='Who is the woman who portrayed Corliss Archer in the film Kiss and Tell?')]\n", - "\u001b[33m2024-06-15 23:17:18 - [3230041225.py:26:] - No. 2, question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: who is woman who portrayed corliss archer in film kiss and tell\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:18 - [react_agent.py:330:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:20 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The user is looking for a science fantasy young adult series narrated in the first person with companion books about enslaved worlds and alien species.\n", - "Action 1: llm_tool(\"Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.\")\n", - "Observation 1: That sounds like an exciting series! Do you have a favorite book or character from the series?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:22 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is inquiring about a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.\n", - "Action 2: llm_tool(\"Science fantasy young adult series with companion books about enslaved worlds and alien species.\")\n", - "Observation 2: That sounds like an exciting series! 
Can you tell me more about the main plot or any specific themes in the books?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:24 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The user is seeking a science fantasy young adult series narrated in the first person with companion books detailing stories of enslaved worlds and alien species.\n", - "Action 3: llm_tool(\"Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.\")\n", - "Observation 3: Hello! It sounds like you're interested in a science fantasy young adult series with a first-person narration, companion books about enslaved worlds, and alien species. That's a fascinating combination! Do you have a specific question or topic you'd like to discuss about these books?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:27 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: The user is looking for a science fantasy young adult series narrated in the first person with companion books about enslaved worlds and alien species.\n", - "Action 4: llm_tool(\"Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.\")\n", - "Observation 4: It sounds like you're looking for a science fantasy young adult series that is narrated in the first person and includes companion books about enslaved worlds and alien species. One recommendation that fits this description is the \"Red Rising\" series by Pierce Brown. This series follows the story of Darrow, a young man living in a society where people are divided by color and forced to work as slaves for the ruling class. As Darrow discovers the truth about his world and joins a rebellion, he encounters various alien species and explores different enslaved worlds. 
The series includes multiple companion books that delve deeper into the world and its characters.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:29 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: The user is seeking confirmation on a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.\n", - "Action 5: llm_tool(\"Red Rising series by Pierce Brown.\")\n", - "Observation 5: Hello! How can I assist you with the Red Rising series by Pierce Brown?\u001b[0m\n", - "Trying to fix potential missing commas...\n", - "Trying to fix scaped single quotes...\n", - "Fixed JSON string: {\n", - " \"thought\": \"The user is seeking confirmation on a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.\",\n", - " \"action\": \"llm_tool(\\\"Red Rising series by Pierce Brown.\\\")\",\n", - "}\n", - "Parsing JSON string with PyYAML...\n", - "\u001b[34m2024-06-15 23:17:30 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: The user is seeking confirmation on a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.\n", - "Action 6: llm_tool(\"Red Rising series by Pierce Brown.\")\n", - "Observation 6: Hello! How can I assist you today with the Red Rising series by Pierce Brown?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:31 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: The user is looking for a science fantasy young adult series narrated in the first person with companion books about enslaved worlds and alien species.\n", - "Action 7: llm_tool(\"Red Rising series by Pierce Brown.\")\n", - "Observation 7: Hello! How can I assist you today with the Red Rising series by Pierce Brown?\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:31 - [react_agent.py:345:call] - answer: Hello! 
How can I assist you today with the Red Rising series by Pierce Brown?\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='The user is looking for a science fantasy young adult series narrated in the first person with companion books about enslaved worlds and alien species.', action='llm_tool(\"Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.\")', fun_name='llm_tool', fun_args=['Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.'], fun_kwargs={}, observation='That sounds like an exciting series! Do you have a favorite book or character from the series?'), StepOutput(step=2, thought='The user is inquiring about a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.', action='llm_tool(\"Science fantasy young adult series with companion books about enslaved worlds and alien species.\")', fun_name='llm_tool', fun_args=['Science fantasy young adult series with companion books about enslaved worlds and alien species.'], fun_kwargs={}, observation='That sounds like an exciting series! Can you tell me more about the main plot or any specific themes in the books?'), StepOutput(step=3, thought='The user is seeking a science fantasy young adult series narrated in the first person with companion books detailing stories of enslaved worlds and alien species.', action='llm_tool(\"Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.\")', fun_name='llm_tool', fun_args=['Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.'], fun_kwargs={}, observation=\"Hello! It sounds like you're interested in a science fantasy young adult series with a first-person narration, companion books about enslaved worlds, and alien species. 
That's a fascinating combination! Do you have a specific question or topic you'd like to discuss about these books?\"), StepOutput(step=4, thought='The user is looking for a science fantasy young adult series narrated in the first person with companion books about enslaved worlds and alien species.', action='llm_tool(\"Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.\")', fun_name='llm_tool', fun_args=['Science fantasy young adult series narrated in first person with companion books about enslaved worlds and alien species.'], fun_kwargs={}, observation='It sounds like you\\'re looking for a science fantasy young adult series that is narrated in the first person and includes companion books about enslaved worlds and alien species. One recommendation that fits this description is the \"Red Rising\" series by Pierce Brown. This series follows the story of Darrow, a young man living in a society where people are divided by color and forced to work as slaves for the ruling class. As Darrow discovers the truth about his world and joins a rebellion, he encounters various alien species and explores different enslaved worlds. The series includes multiple companion books that delve deeper into the world and its characters.'), StepOutput(step=5, thought='The user is seeking confirmation on a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.', action='llm_tool(\"Red Rising series by Pierce Brown.\")', fun_name='llm_tool', fun_args=['Red Rising series by Pierce Brown.'], fun_kwargs={}, observation='Hello! 
How can I assist you with the Red Rising series by Pierce Brown?'), StepOutput(step=6, thought='The user is seeking confirmation on a specific science fantasy young adult series with companion books focusing on enslaved worlds and alien species.', action='llm_tool(\"Red Rising series by Pierce Brown.\")', fun_name='llm_tool', fun_args=['Red Rising series by Pierce Brown.'], fun_kwargs={}, observation='Hello! How can I assist you today with the Red Rising series by Pierce Brown?'), StepOutput(step=7, thought='The user is looking for a science fantasy young adult series narrated in the first person with companion books about enslaved worlds and alien species.', action='llm_tool(\"Red Rising series by Pierce Brown.\")', fun_name='llm_tool', fun_args=['Red Rising series by Pierce Brown.'], fun_kwargs={}, observation='Hello! How can I assist you today with the Red Rising series by Pierce Brown?')]\n", - "\u001b[33m2024-06-15 23:17:31 - [3230041225.py:26:] - No. 3, question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: hello how can i assist you today with red rising series by pierce brown\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:31 - [react_agent.py:330:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:33 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I will use llm_tool to quickly answer this question.\n", - "Action 1: llm_tool(\"Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\")\n", - "Observation 1: No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. 
The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:35 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is asking about the location of two specific landmarks.\n", - "Action 2: finish(\"No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.\")\n", - "Observation 2: No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:35 - [react_agent.py:345:call] - answer: No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I will use llm_tool to quickly answer this question.', action='llm_tool(\"Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\")', fun_name='llm_tool', fun_args=['Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?'], fun_kwargs={}, observation='No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.'), StepOutput(step=2, thought='The user is asking about the location of two specific landmarks.', action='finish(\"No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. 
The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.\")', fun_name='finish', fun_args=['No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.'], fun_kwargs={}, observation='No, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. The Laleli Mosque is located in the Laleli neighborhood of Istanbul, while the Esma Sultan Mansion is located in the Ortaköy neighborhood.')]\n", - "\u001b[33m2024-06-15 23:17:35 - [3230041225.py:26:] - No. 4, question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: no laleli mosque and esma sultan mansion are not located in same neighborhood laleli mosque is located in laleli neighborhood of istanbul while esma sultan mansion is located in ortaköy neighborhood\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:35 - [react_agent.py:330:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:36 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\n", - "Action 1: llm_tool(\"director of the romantic comedy Big Stone Gap\")\n", - "Observation 1: Hello! 
How can I assist you today?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:38 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is inquiring about the specific New York city where the movie 'Big Stone Gap' is based.\n", - "Action 2: llm_tool(\"New York city where the movie 'Big Stone Gap' is based in\")\n", - "Observation 2: What can you tell me about New York City, where the movie 'Big Stone Gap' is based?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:40 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The user is seeking information about the specific New York city where the movie 'Big Stone Gap' is based.\n", - "Action 3: llm_tool(\"New York city where the movie 'Big Stone Gap' is based in\")\n", - "Observation 3: What can you tell me about New York City, where the movie 'Big Stone Gap' is based?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:41 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\n", - "Action 4: llm_tool(\"director of the romantic comedy Big Stone Gap\")\n", - "Observation 4: Hello! How can I assist you today?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:42 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\n", - "Action 5: llm_tool(\"director of the romantic comedy Big Stone Gap\")\n", - "Observation 5: Hello! How can I assist you today?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:44 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\n", - "Action 6: llm_tool(\"director of the romantic comedy Big Stone Gap\")\n", - "Observation 6: Hello! 
How can I assist you today?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:45 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\n", - "Action 7: llm_tool(\"director of the romantic comedy Big Stone Gap\")\n", - "Observation 7: Hello! How can I assist you today?\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:45 - [react_agent.py:345:call] - answer: Hello! How can I assist you today?\u001b[0m\n", - "step_history: [StepOutput(step=1, thought=\"The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action='llm_tool(\"director of the romantic comedy Big Stone Gap\")', fun_name='llm_tool', fun_args=['director of the romantic comedy Big Stone Gap'], fun_kwargs={}, observation='Hello! How can I assist you today?'), StepOutput(step=2, thought=\"The user is inquiring about the specific New York city where the movie 'Big Stone Gap' is based.\", action='llm_tool(\"New York city where the movie \\'Big Stone Gap\\' is based in\")', fun_name='llm_tool', fun_args=[\"New York city where the movie 'Big Stone Gap' is based in\"], fun_kwargs={}, observation=\"What can you tell me about New York City, where the movie 'Big Stone Gap' is based?\"), StepOutput(step=3, thought=\"The user is seeking information about the specific New York city where the movie 'Big Stone Gap' is based.\", action='llm_tool(\"New York city where the movie \\'Big Stone Gap\\' is based in\")', fun_name='llm_tool', fun_args=[\"New York city where the movie 'Big Stone Gap' is based in\"], fun_kwargs={}, observation=\"What can you tell me about New York City, where the movie 'Big Stone Gap' is based?\"), StepOutput(step=4, thought=\"The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action='llm_tool(\"director of the romantic comedy Big Stone Gap\")', 
fun_name='llm_tool', fun_args=['director of the romantic comedy Big Stone Gap'], fun_kwargs={}, observation='Hello! How can I assist you today?'), StepOutput(step=5, thought=\"The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action='llm_tool(\"director of the romantic comedy Big Stone Gap\")', fun_name='llm_tool', fun_args=['director of the romantic comedy Big Stone Gap'], fun_kwargs={}, observation='Hello! How can I assist you today?'), StepOutput(step=6, thought=\"The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action='llm_tool(\"director of the romantic comedy Big Stone Gap\")', fun_name='llm_tool', fun_args=['director of the romantic comedy Big Stone Gap'], fun_kwargs={}, observation='Hello! How can I assist you today?'), StepOutput(step=7, thought=\"The user is asking for the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action='llm_tool(\"director of the romantic comedy Big Stone Gap\")', fun_name='llm_tool', fun_args=['director of the romantic comedy Big Stone Gap'], fun_kwargs={}, observation='Hello! How can I assist you today?')]\n", - "\u001b[33m2024-06-15 23:17:45 - [3230041225.py:26:] - No. 
5, question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: hello how can i assist you today\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:45 - [react_agent.py:330:call] - input_query: 2014 S/S is the debut album of a South Korean boy group that was formed by who?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:47 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The user is asking about the debut album of a South Korean boy group and who formed it.\n", - "Action 1: llm_tool(\"2014 S/S is the debut album of a South Korean boy group\")\n", - "Observation 1: What is the name of the South Korean boy group that released the album 2014 S/S?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:49 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is inquiring about the formation of the South Korean boy group that released the album 2014 S/S.\n", - "Action 2: llm_tool(\"Who formed the South Korean boy group that released the album 2014 S/S?\")\n", - "Observation 2: The South Korean boy group that released the album 2014 S/S is WINNER.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:50 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The user is seeking confirmation of the information provided.\n", - "Action 3: llm_tool(\"Is WINNER the South Korean boy group that released the album 2014 S/S?\")\n", - "Observation 3: Yes, WINNER is the South Korean boy group that released the album 2014 S/S.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:52 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: The user is looking for the name of the person or group that formed the South Korean boy group WINNER.\n", - "Action 4: llm_tool(\"Who formed the South Korean boy group WINNER?\")\n", - "Observation 4: The South Korean boy group WINNER was formed by YG Entertainment through the reality survival program \"WIN: Who is Next\" in 
2013.\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:53 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: The user is asking about the formation of the South Korean boy group that released the album 2014 S/S.\n", - "Action 5: finish(\"YG Entertainment through the reality survival program 'WIN: Who is Next' in 2013.\")\n", - "Observation 5: YG Entertainment through the reality survival program 'WIN: Who is Next' in 2013.\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:53 - [react_agent.py:345:call] - answer: YG Entertainment through the reality survival program 'WIN: Who is Next' in 2013.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='The user is asking about the debut album of a South Korean boy group and who formed it.', action='llm_tool(\"2014 S/S is the debut album of a South Korean boy group\")', fun_name='llm_tool', fun_args=['2014 S/S is the debut album of a South Korean boy group'], fun_kwargs={}, observation='What is the name of the South Korean boy group that released the album 2014 S/S?'), StepOutput(step=2, thought='The user is inquiring about the formation of the South Korean boy group that released the album 2014 S/S.', action='llm_tool(\"Who formed the South Korean boy group that released the album 2014 S/S?\")', fun_name='llm_tool', fun_args=['Who formed the South Korean boy group that released the album 2014 S/S?'], fun_kwargs={}, observation='The South Korean boy group that released the album 2014 S/S is WINNER.'), StepOutput(step=3, thought='The user is seeking confirmation of the information provided.', action='llm_tool(\"Is WINNER the South Korean boy group that released the album 2014 S/S?\")', fun_name='llm_tool', fun_args=['Is WINNER the South Korean boy group that released the album 2014 S/S?'], fun_kwargs={}, observation='Yes, WINNER is the South Korean boy group that released the album 2014 S/S.'), StepOutput(step=4, thought='The user is looking for the name of the person or group that formed the South Korean boy 
group WINNER.', action='llm_tool(\"Who formed the South Korean boy group WINNER?\")', fun_name='llm_tool', fun_args=['Who formed the South Korean boy group WINNER?'], fun_kwargs={}, observation='The South Korean boy group WINNER was formed by YG Entertainment through the reality survival program \"WIN: Who is Next\" in 2013.'), StepOutput(step=5, thought='The user is asking about the formation of the South Korean boy group that released the album 2014 S/S.', action='finish(\"YG Entertainment through the reality survival program \\'WIN: Who is Next\\' in 2013.\")', fun_name='finish', fun_args=[\"YG Entertainment through the reality survival program 'WIN: Who is Next' in 2013.\"], fun_kwargs={}, observation=\"YG Entertainment through the reality survival program 'WIN: Who is Next' in 2013.\")]\n", - "\u001b[33m2024-06-15 23:17:53 - [3230041225.py:26:] - No. 6, question: 2014 S/S is the debut album of a South Korean boy group that was formed by who?, ground truth: yg entertainment, pred answer: yg entertainment through reality survival program win who is next in 2013\u001b[0m\n", - "\u001b[36m2024-06-15 23:17:53 - [react_agent.py:330:call] - input_query: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:54 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The user is asking about a person known by the stage name Aladin who worked as a consultant to help organizations improve their performance.\n", - "Action 1: llm_tool(\"Aladin stage name consultant organizations performance\")\n", - "Observation 1: Hello! 
How can I assist you with the information you provided?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:56 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is inquiring about the identity of the person known as Aladin who worked as a consultant to enhance organizational performance.\n", - "Action 2: llm_tool(\"Aladin consultant organizational performance\")\n", - "Observation 2: Hello! How can I assist you with Aladin consultant organizational performance?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:57 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The user is seeking confirmation of the identity of the person known as Aladin, who worked as a consultant to enhance organizational performance.\n", - "Action 3: llm_tool(\"Aladin consultant organizational performance\")\n", - "Observation 3: Hello! How can I assist you with Aladin consultant organizational performance?\u001b[0m\n", - "\u001b[34m2024-06-15 23:17:59 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: The user is asking about the person known by the stage name Aladin who helped organizations improve their performance as a consultant.\n", - "Action 4: llm_tool(\"Aladin consultant organizations performance\")\n", - "Observation 4: Hello! How can I assist you with the performance of Aladin consultant organizations?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:00 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: The user is asking for information about a person known as Aladin who worked as a consultant to enhance organizational performance.\n", - "Action 5: llm_tool(\"Aladin consultant organizations performance\")\n", - "Observation 5: Hello! 
How can I assist you with the Aladin consultant organizations performance?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:01 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: The user is seeking confirmation of the identity of the person known as Aladin, who worked as a consultant to enhance organizational performance.\n", - "Action 6: llm_tool(\"Aladin consultant organizations performance\")\n", - "Observation 6: Hello! How can I assist you with the Aladin consultant organizations performance?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:03 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: The user is looking for information about the person known as Aladin, who assisted organizations in improving their performance as a consultant.\n", - "Action 7: llm_tool(\"Aladin consultant organizations performance\")\n", - "Observation 7: Hello! How can I assist you with the Aladin consultant organizations performance data?\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:03 - [react_agent.py:345:call] - answer: Hello! How can I assist you with the Aladin consultant organizations performance data?\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='The user is asking about a person known by the stage name Aladin who worked as a consultant to help organizations improve their performance.', action='llm_tool(\"Aladin stage name consultant organizations performance\")', fun_name='llm_tool', fun_args=['Aladin stage name consultant organizations performance'], fun_kwargs={}, observation='Hello! How can I assist you with the information you provided?'), StepOutput(step=2, thought='The user is inquiring about the identity of the person known as Aladin who worked as a consultant to enhance organizational performance.', action='llm_tool(\"Aladin consultant organizational performance\")', fun_name='llm_tool', fun_args=['Aladin consultant organizational performance'], fun_kwargs={}, observation='Hello! 
How can I assist you with Aladin consultant organizational performance?'), StepOutput(step=3, thought='The user is seeking confirmation of the identity of the person known as Aladin, who worked as a consultant to enhance organizational performance.', action='llm_tool(\"Aladin consultant organizational performance\")', fun_name='llm_tool', fun_args=['Aladin consultant organizational performance'], fun_kwargs={}, observation='Hello! How can I assist you with Aladin consultant organizational performance?'), StepOutput(step=4, thought='The user is asking about the person known by the stage name Aladin who helped organizations improve their performance as a consultant.', action='llm_tool(\"Aladin consultant organizations performance\")', fun_name='llm_tool', fun_args=['Aladin consultant organizations performance'], fun_kwargs={}, observation='Hello! How can I assist you with the performance of Aladin consultant organizations?'), StepOutput(step=5, thought='The user is asking for information about a person known as Aladin who worked as a consultant to enhance organizational performance.', action='llm_tool(\"Aladin consultant organizations performance\")', fun_name='llm_tool', fun_args=['Aladin consultant organizations performance'], fun_kwargs={}, observation='Hello! How can I assist you with the Aladin consultant organizations performance?'), StepOutput(step=6, thought='The user is seeking confirmation of the identity of the person known as Aladin, who worked as a consultant to enhance organizational performance.', action='llm_tool(\"Aladin consultant organizations performance\")', fun_name='llm_tool', fun_args=['Aladin consultant organizations performance'], fun_kwargs={}, observation='Hello! 
How can I assist you with the Aladin consultant organizations performance?'), StepOutput(step=7, thought='The user is looking for information about the person known as Aladin, who assisted organizations in improving their performance as a consultant.', action='llm_tool(\"Aladin consultant organizations performance\")', fun_name='llm_tool', fun_args=['Aladin consultant organizations performance'], fun_kwargs={}, observation='Hello! How can I assist you with the Aladin consultant organizations performance data?')]\n", - "\u001b[33m2024-06-15 23:18:03 - [3230041225.py:26:] - No. 7, question: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?, ground truth: eenasul fateh, pred answer: hello how can i assist you with aladin consultant organizations performance data\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:03 - [react_agent.py:330:call] - input_query: The arena where the Lewiston Maineiacs played their home games can seat how many people?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:05 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The user is asking about the seating capacity of the arena where the Lewiston Maineiacs played their home games.\n", - "Action 1: llm_tool(\"Seating capacity of the arena where the Lewiston Maineiacs played their home games.\")\n", - "Observation 1: The seating capacity of the arena where the Lewiston Maineiacs played their home games was 3,677.\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:06 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is inquiring about the seating capacity of the arena where the Lewiston Maineiacs played their home games.\n", - "Action 2: finish(\"3,677.\")\n", - "Observation 2: 3,677.\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:06 - [react_agent.py:345:call] - answer: 3,677.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='The user is asking about the seating capacity of the arena where the Lewiston 
Maineiacs played their home games.', action='llm_tool(\"Seating capacity of the arena where the Lewiston Maineiacs played their home games.\")', fun_name='llm_tool', fun_args=['Seating capacity of the arena where the Lewiston Maineiacs played their home games.'], fun_kwargs={}, observation='The seating capacity of the arena where the Lewiston Maineiacs played their home games was 3,677.'), StepOutput(step=2, thought='The user is inquiring about the seating capacity of the arena where the Lewiston Maineiacs played their home games.', action='finish(\"3,677.\")', fun_name='finish', fun_args=['3,677.'], fun_kwargs={}, observation='3,677.')]\n", - "\u001b[33m2024-06-15 23:18:06 - [3230041225.py:26:] - No. 8, question: The arena where the Lewiston Maineiacs played their home games can seat how many people?, ground truth: 3677 seated, pred answer: 3677\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:06 - [react_agent.py:330:call] - input_query: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:07 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: I will use the llm_tool to get the information about the ages of Annie Morton and Terry Richardson.\n", - "Action 1: llm_tool(\"Annie Morton age\")\n", - "Observation 1: Hello! How old is Annie Morton?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:09 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 2: llm_tool(\"Annie Morton age\")\n", - "Observation 2: Hello! 
How old is Annie Morton?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:10 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: I will use the llm_tool to get the information about the ages of Terry Richardson.\n", - "Action 3: llm_tool(\"Terry Richardson age\")\n", - "Observation 3: How old is Terry Richardson?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:12 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 4: llm_tool(\"Annie Morton age\")\n", - "Observation 4: Hello! How old is Annie Morton?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:13 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 5: llm_tool(\"Annie Morton age\")\n", - "Observation 5: Hello! How old is Annie Morton?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:14 - [react_agent.py:314:_run_one_step] - step: 6, response: Thought 6: I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 6: llm_tool(\"Who is older, Annie Morton or Terry Richardson?\")\n", - "Observation 6: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:16 - [react_agent.py:314:_run_one_step] - step: 7, response: Thought 7: I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.\n", - "Action 7: llm_tool(\"Who is older, Annie Morton or Terry Richardson?\")\n", - "Observation 7: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:16 - [react_agent.py:345:call] - answer: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='I will use the llm_tool to get the information about the ages of Annie Morton and Terry 
Richardson.', action='llm_tool(\"Annie Morton age\")', fun_name='llm_tool', fun_args=['Annie Morton age'], fun_kwargs={}, observation='Hello! How old is Annie Morton?'), StepOutput(step=2, thought='I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.', action='llm_tool(\"Annie Morton age\")', fun_name='llm_tool', fun_args=['Annie Morton age'], fun_kwargs={}, observation='Hello! How old is Annie Morton?'), StepOutput(step=3, thought='I will use the llm_tool to get the information about the ages of Terry Richardson.', action='llm_tool(\"Terry Richardson age\")', fun_name='llm_tool', fun_args=['Terry Richardson age'], fun_kwargs={}, observation='How old is Terry Richardson?'), StepOutput(step=4, thought='I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.', action='llm_tool(\"Annie Morton age\")', fun_name='llm_tool', fun_args=['Annie Morton age'], fun_kwargs={}, observation='Hello! How old is Annie Morton?'), StepOutput(step=5, thought='I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.', action='llm_tool(\"Annie Morton age\")', fun_name='llm_tool', fun_args=['Annie Morton age'], fun_kwargs={}, observation='Hello! 
How old is Annie Morton?'), StepOutput(step=6, thought='I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.', action='llm_tool(\"Who is older, Annie Morton or Terry Richardson?\")', fun_name='llm_tool', fun_args=['Who is older, Annie Morton or Terry Richardson?'], fun_kwargs={}, observation='Who is older, Annie Morton or Terry Richardson?'), StepOutput(step=7, thought='I will use the llm_tool to compare the ages of Annie Morton and Terry Richardson to determine who is older.', action='llm_tool(\"Who is older, Annie Morton or Terry Richardson?\")', fun_name='llm_tool', fun_args=['Who is older, Annie Morton or Terry Richardson?'], fun_kwargs={}, observation='Who is older, Annie Morton or Terry Richardson?')]\n", - "\u001b[33m2024-06-15 23:18:16 - [3230041225.py:26:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: terry richardson, pred answer: who is older annie morton or terry richardson\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:16 - [react_agent.py:330:call] - input_query: Are Local H and For Against both from the United States?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:17 - [react_agent.py:314:_run_one_step] - step: 1, response: Thought 1: The user is asking about the origin of two entities, Local H and For Against.\n", - "Action 1: llm_tool(\"Local H origin\")\n", - "Observation 1: Hello! How can I assist you today?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:18 - [react_agent.py:314:_run_one_step] - step: 2, response: Thought 2: The user is inquiring about the origin of For Against.\n", - "Action 2: llm_tool(\"For Against origin\")\n", - "Observation 2: Hello! 
How can I assist you today regarding the topic of origin?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:20 - [react_agent.py:314:_run_one_step] - step: 3, response: Thought 3: The user wants to know if both Local H and For Against are from the United States.\n", - "Action 3: llm_tool(\"Local H and For Against origin\")\n", - "Observation 3: Hello! How can I assist you today with the information about Local H and their stance on For Against?\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:23 - [react_agent.py:314:_run_one_step] - step: 4, response: Thought 4: Confirming the origin of Local H and For Against.\n", - "Action 4: llm_tool(\"Are Local H and For Against both from the United States?\")\n", - "Observation 4: Yes, both Local H and For Against are from the United States.\u001b[0m\n", - "\u001b[34m2024-06-15 23:18:24 - [react_agent.py:314:_run_one_step] - step: 5, response: Thought 5: The user is seeking confirmation about the origin of Local H and For Against.\n", - "Action 5: finish(\"Yes, both Local H and For Against are from the United States.\")\n", - "Observation 5: Yes, both Local H and For Against are from the United States.\u001b[0m\n", - "\u001b[36m2024-06-15 23:18:24 - [react_agent.py:345:call] - answer: Yes, both Local H and For Against are from the United States.\u001b[0m\n", - "step_history: [StepOutput(step=1, thought='The user is asking about the origin of two entities, Local H and For Against.', action='llm_tool(\"Local H origin\")', fun_name='llm_tool', fun_args=['Local H origin'], fun_kwargs={}, observation='Hello! How can I assist you today?'), StepOutput(step=2, thought='The user is inquiring about the origin of For Against.', action='llm_tool(\"For Against origin\")', fun_name='llm_tool', fun_args=['For Against origin'], fun_kwargs={}, observation='Hello! 
How can I assist you today regarding the topic of origin?'), StepOutput(step=3, thought='The user wants to know if both Local H and For Against are from the United States.', action='llm_tool(\"Local H and For Against origin\")', fun_name='llm_tool', fun_args=['Local H and For Against origin'], fun_kwargs={}, observation='Hello! How can I assist you today with the information about Local H and their stance on For Against?'), StepOutput(step=4, thought='Confirming the origin of Local H and For Against.', action='llm_tool(\"Are Local H and For Against both from the United States?\")', fun_name='llm_tool', fun_args=['Are Local H and For Against both from the United States?'], fun_kwargs={}, observation='Yes, both Local H and For Against are from the United States.'), StepOutput(step=5, thought='The user is seeking confirmation about the origin of Local H and For Against.', action='finish(\"Yes, both Local H and For Against are from the United States.\")', fun_name='finish', fun_args=['Yes, both Local H and For Against are from the United States.'], fun_kwargs={}, observation='Yes, both Local H and For Against are from the United States.')]\n", - "\u001b[33m2024-06-15 23:18:24 - [3230041225.py:26:] - No. 
10, question: Are Local H and For Against both from the United States?, ground truth: yes, pred answer: yes both local h and for against are from united states\u001b[0m\n", - "EM = (0.0, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), FM = (0.4, [0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0]), average time = 8.354214406013488\n" - ] - } - ], - "source": [ - "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n", - "\n", - "# set up evaluation type\n", - "EM_evaluator = AnswerMatchAcc(type=\"exact_match\")\n", - "FM_evaluator = AnswerMatchAcc(type=\"fuzzy_match\")\n", - "\n", - "agent = ReActAgent(\n", - " max_steps=7, model_client=OpenAIClient(), model_kwargs=gpt_model_kwargs\n", - ")\n", - "\n", - "num_questions = 10\n", - "gt_answers = []\n", - "pred_answers = []\n", - "start_time = time.time()\n", - "for i in range(num_questions):\n", - " question = val_dataset[i][\"question\"]\n", - " gt_answer = normalize_answer(\n", - " val_dataset[i][\"answer\"]\n", - " ) # normalize the ground truth answer\n", - " gt_answers.append(gt_answer)\n", - "\n", - " # get the agent's response\n", - " pred_answer = agent(question)\n", - " pred_answer = normalize_answer(pred_answer)\n", - " pred_answers.append(pred_answer)\n", - "\n", - " printc(\n", - " f\"No. {i+1}, question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", - " color=\"yellow\",\n", - " )\n", - "\n", - "end_time = time.time()\n", - "\n", - "em = EM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", - "fm = FM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", - "avg_time = (end_time - start_time) / num_questions\n", - "\n", - "print(f\"EM = {em}, FM = {fm}, average time = {avg_time}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Without the tools and examples, EM=0 and FM=0.4. We saw hallucinations and nonsense:\n", - "\n", - "2024-06-15 23:17:04 - [3230041225.py:26:] - No. 
1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: ``yes``, pred answer: ``no scott derrickson and ed wood were not of same nationality scott derrickson is american while ed wood was also american``\n", - "\n", - "2024-06-15 23:18:16 - [3230041225.py:26:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth:`` terry richardson``, pred answer: ``who is older annie morton or terry richardson``\n", - "\n", - "Therefore, using ReAct agent outperforms the base LLM.\n", - "Meanwhile, ``LightRAG ReAct agent`` shows that the performance on 10 questions(EM=0.3)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 7. Future Improvement" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO:\n", - "# 1. advanced, add history to react\n", - "# 2. add training, few shot\n", - "# 3. llm as judge\n", - "# 4. add picture\n", - "# 5. better json handling, we need to store the answer output" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "lightrag-project", - "language": "python", - "name": "light-rag-project" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/use_cases/agent/react_agent_hotpot_qa.ipynb b/use_cases/agent/react_agent_hotpot_qa.ipynb new file mode 100644 index 00000000..0e1d4d6d --- /dev/null +++ b/use_cases/agent/react_agent_hotpot_qa.ipynb @@ -0,0 +1,1272 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ReAct Agent Use Case" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. 
Q&A Chatbot\n", + "In this tutorial, we will implement ``adalflow ReAct`` to build a Q&A chatbot on [HotpotQA](https://arxiv.org/pdf/1809.09600) dataset. \n", + "\n", + "To learn more about ``adalflow ReAct``, please refer to our developer notes." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# 2. HotpotQA Dataset\n", + "We are using [HotpotQA](https://arxiv.org/pdf/1809.09600). It is a Wikipedia-based multi-hop question and answer dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# load the dataset\n", + "from datasets import load_dataset\n", + "\n", + "dataset = load_dataset(path=\"hotpot_qa\", name=\"fullwiki\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "len of eval: 7405\n", + "example: {'id': '5a8b57f25542995d1e6f1371', 'question': 'Were Scott Derrickson and Ed Wood of the same nationality?', 'answer': 'yes', 'type': 'comparison', 'level': 'hard', 'supporting_facts': {'title': ['Scott Derrickson', 'Ed Wood'], 'sent_id': [0, 0]}, 'context': {'title': ['Adam Collis', 'Ed Wood (film)', 'Tyler Bates', 'Doctor Strange (2016 film)', 'Hellraiser: Inferno', 'Sinister (film)', 'Deliver Us from Evil (2014 film)', 'Woodson, Arkansas', 'Conrad Brooks', 'The Exorcism of Emily Rose'], 'sentences': [['Adam Collis is an American filmmaker and actor.', ' He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010.', ' He also studied cinema at the University of Southern California from 1991 to 1997.', ' Collis first work was the assistant director for the Scott Derrickson\\'s short \"Love in the Ruins\" (1995).', ' In 1998, he played \"Crankshaft\" in Eric Koyanagi\\'s \"Hundred Percent\".'], ['Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and 
starring Johnny Depp as cult filmmaker Ed Wood.', \" The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau.\", ' Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.'], ['Tyler Bates (born June 5, 1965) is an American musician, music producer, and composer for films, television, and video games.', ' Much of his work is in the action and horror film genres, with films like \"Dawn of the Dead, 300, Sucker Punch,\" and \"John Wick.\"', ' He has collaborated with directors like Zack Snyder, Rob Zombie, Neil Marshall, William Friedkin, Scott Derrickson, and James Gunn.', ' With Gunn, he has scored every one of the director\\'s films; including \"Guardians of the Galaxy\", which became one of the highest grossing domestic movies of 2014, and its 2017 sequel.', ' In addition, he is also the lead guitarist of the American rock band Marilyn Manson, and produced its albums \"The Pale Emperor\" and \"Heaven Upside Down\".'], ['Doctor Strange is a 2016 American superhero film based on the Marvel Comics character of the same name, produced by Marvel Studios and distributed by Walt Disney Studios Motion Pictures.', ' It is the fourteenth film of the Marvel Cinematic Universe (MCU).', ' The film was directed by Scott Derrickson, who wrote it with Jon Spaihts and C. 
Robert Cargill, and stars Benedict Cumberbatch as Stephen Strange, along with Chiwetel Ejiofor, Rachel McAdams, Benedict Wong, Michael Stuhlbarg, Benjamin Bratt, Scott Adkins, Mads Mikkelsen, and Tilda Swinton.', ' In \"Doctor Strange\", surgeon Strange learns the mystic arts after a career-ending car accident.'], ['Hellraiser: Inferno (also known as Hellraiser V: Inferno) is a 2000 American horror film.', ' It is the fifth installment in the \"Hellraiser\" series and the first \"Hellraiser\" film to go straight-to-DVD.', ' It was directed by Scott Derrickson and released on October 3, 2000.', \" The film concerns a corrupt detective who discovers Lemarchand's box at a crime scene.\", \" The film's reviews were mixed.\"], ['Sinister is a 2012 supernatural horror film directed by Scott Derrickson and written by Derrickson and C. Robert Cargill.', ' It stars Ethan Hawke as fictional true-crime writer Ellison Oswalt who discovers a box of home movies in his attic that puts his family in danger.'], ['Deliver Us from Evil is a 2014 American supernatural horror film directed by Scott Derrickson and produced by Jerry Bruckheimer.', ' The film is officially based on a 2001 non-fiction book entitled \"Beware the Night\" by Ralph Sarchie and Lisa Collier Cool, and its marketing campaign highlighted that it was \"inspired by actual accounts\".', ' The film stars Eric Bana, Édgar Ramírez, Sean Harris, Olivia Munn, and Joel McHale in the main roles and was released on July 2, 2014.'], ['Woodson is a census-designated place (CDP) in Pulaski County, Arkansas, in the United States.', ' Its population was 403 at the 2010 census.', ' It is part of the Little Rock–North Little Rock–Conway Metropolitan Statistical Area.', ' Woodson and its accompanying Woodson Lake and Wood Hollow are the namesake for Ed Wood Sr., a prominent plantation owner, trader, and businessman at the turn of the 20th century.', ' Woodson is adjacent to the Wood Plantation, the largest of the plantations own by 
Ed Wood Sr.'], ['Conrad Brooks (born Conrad Biedrzycki on January 3, 1931 in Baltimore, Maryland) is an American actor.', ' He moved to Hollywood, California in 1948 to pursue a career in acting.', ' He got his start in movies appearing in Ed Wood films such as \"Plan 9 from Outer Space\", \"Glen or Glenda\", and \"Jail Bait.\"', ' He took a break from acting during the 1960s and 1970s but due to the ongoing interest in the films of Ed Wood, he reemerged in the 1980s and has become a prolific actor.', ' He also has since gone on to write, produce and direct several films.'], ['The Exorcism of Emily Rose is a 2005 American legal drama horror film directed by Scott Derrickson and starring Laura Linney and Tom Wilkinson.', ' The film is loosely based on the story of Anneliese Michel and follows a self-proclaimed agnostic who acts as defense counsel (Linney) representing a parish priest (Wilkinson), accused by the state of negligent homicide after he performed an exorcism.']]}}\n", + "attributes in each sample: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context']\n" + ] + } + ], + "source": [ + "# check the data sample\n", + "test_sample = dataset[\"validation\"][0]\n", + "print(f\"len of eval: {len(dataset['validation'])}\")\n", + "print(f\"example: {test_sample}\")\n", + "print(f\"attributes in each sample: {list(test_sample.keys())}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "question: Were Scott Derrickson and Ed Wood of the same nationality?\n", + "answer: yes\n" + ] + } + ], + "source": [ + "# Each sample contains a question and a corresponding answer.\n", + "print(f\"question: {test_sample.get('question')}\")\n", + "print(f\"answer: {test_sample.get('answer')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. 
Set up\n", + "Please make sure you have set the model client APIs before running the agent. Now import the necessary packages." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import dotenv\n", + "from adalflow.components.model_client import OpenAIClient\n", + "from adalflow.components.agent.react import ReActAgent\n", + "from adalflow.core.func_tool import FunctionTool\n", + "\n", + "import time\n", + "\n", + "# load environment, please set the relative path to your .env file that includes the api key\n", + "dotenv.load_dotenv(dotenv_path=\"../../.env\", override=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. Create Agent\n", + "To create an agent, we need to define the basic components.\n", + "\n", + "## Tools\n", + "Firstly, we need to specify what functions the agent will need to answer the question. Since we are answering Wikipedia-based questions, we will allow the agent to **search** the Wikipedia API. The [ReAct Paper](https://arxiv.org/pdf/2210.03629) includes a **lookup** function that serves as Ctrl+F functionality on the browser.\n", + "\n", + "As ``adalflow ReAct`` has a built-in ``finish`` function, we don't need to define it ourselves." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "import re\n", + "import string\n", + "\n", + "\n", + "# copy code from the paper\n", + "def clean_str(p):\n", + " return p.encode().decode(\"unicode-escape\").encode(\"latin1\").decode(\"utf-8\")\n", + "\n", + "\n", + "# normalization copied from the paper's code\n", + "def normalize_answer(s):\n", + " def remove_articles(text):\n", + " return re.sub(r\"\\b(a|an|the)\\b\", \" \", text)\n", + "\n", + " def white_space_fix(text):\n", + " return \" \".join(text.split())\n", + "\n", + " def remove_punc(text):\n", + " exclude = set(string.punctuation)\n", + " return \"\".join(ch for ch in text if ch not in exclude)\n", + "\n", + " def lower(text):\n", + " return text.lower()\n", + "\n", + " return white_space_fix(remove_articles(remove_punc(lower(s))))\n", + "\n", + "\n", + "def search(entity: str) -> str:\n", + " \"\"\"\n", + " searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.\n", + " \"\"\"\n", + " # Format the entity for URL encoding\n", + " entity_formatted = entity.replace(\" \", \"+\")\n", + " url = f\"https://en.wikipedia.org/w/index.php?search={entity_formatted}\"\n", + "\n", + " # Fetch the page\n", + " response = requests.get(url)\n", + " soup = BeautifulSoup(response.text, \"html.parser\")\n", + "\n", + " # Check if the exact page was found or suggest similar items\n", + " # when

<div class=mw-search-result-heading> is detected, it means the entity page is not found on wikipedia\n", + " result_divs = soup.find_all(\"div\", {\"class\": \"mw-search-result-heading\"})\n", + "\n", + " if (\n", + " result_divs\n", + " ): # this means the searched entity page is not in wikipedia, wikipedia will show a list of similar entities\n", + " # get Similar results\n", + " similar_titles = [div.a.get_text() for div in result_divs]\n", + " return f\"Could not find exact page for '{entity}'. Similar topics: {similar_titles[:5]}\" # return the top 5 similar titles\n", + " else:\n", + " # the paper uses page to represent content in <p>

\n", + " # Extract xontent\n", + " page_list = [\n", + " p.get_text().strip() for p in soup.find_all(\"p\") + soup.find_all(\"ul\")\n", + " ]\n", + " # TODO: Recursive search, if find any concept that needs more search then call search again\n", + " # if any(\"may refer to:\" in p for p in page_list):\n", + " # search(entity)\n", + "\n", + " # restructure & clean the page content following the paper's logic\n", + " page = \"\"\n", + " for p in page_list:\n", + " if len(p.split(\" \")) > 2:\n", + " page += clean_str(p)\n", + " if not p.endswith(\"\\n\"):\n", + " page += \"\\n\"\n", + " paragraphs = page.split(\"\\n\")\n", + " paragraphs = [p.strip() for p in paragraphs if p.strip()]\n", + "\n", + " sentences = []\n", + " for p in paragraphs:\n", + " sentences += p.split(\". \")\n", + " sentences = [s.strip() + \".\" for s in sentences if s.strip()]\n", + "\n", + " # return the first 5 sentences\n", + " if sentences:\n", + " return (\n", + " \" \".join(sentences[:5]) if len(sentences) >= 5 else \" \".join(sentences)\n", + " )\n", + " else:\n", + " return \"No content found on this page.\"\n", + "\n", + " # TODO: clean the paragraphs and return the searched content\n", + "\n", + "\n", + "def lookup(text: str, keyword: str) -> str:\n", + " \"\"\"\n", + " returns the sentences containing keyword in the current passage.\n", + " \"\"\"\n", + " sentences = text.split(\".\")\n", + " matching_sentences = [\n", + " sentence.strip() + \".\"\n", + " for sentence in sentences\n", + " if keyword.lower() in sentence.lower()\n", + " ]\n", + " if not matching_sentences:\n", + " return \"No sentences found with the keyword.\"\n", + " else:\n", + " return \" \".join(\n", + " matching_sentences\n", + " ) # Join all matching sentences into a single string" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# set up tools for the agent\n", + "tools = [FunctionTool(fn=search)]" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## Examples\n", + "The next thing to add is examples. Few shot prompt engineering is a common practice to improve the model performance.\n", + "\n", + "Let's use the paper's examples. The paper has 6 examples altogether." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "examples = [\n", + " \"\"\"Question: What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?\n", + "Thought 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.\n", + "Action 1: search(\"Colorado orogeny\")\n", + "Observation 1: The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.\n", + "Thought 2: It does not mention the eastern sector. So I need to look up eastern sector.\n", + "Action 2: lookup(\"eastern sector\")\n", + "Observation 2: (Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.\n", + "Thought 3: The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.\n", + "Action 3: search(\"High Plains\")\n", + "Observation 3: High Plains refers to one of two distinct land regions:\n", + "Thought 4: I need to instead search High Plains (United States).\n", + "Action 4: search(\"High Plains (United States)\")\n", + "Observation 4: The High Plains are a subregion of the Great Plains. 
From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]\n", + "Thought 5: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.\n", + "Action 5: finish(\"1,800 to 7,000 ft\")\"\"\",\n", + " \"\"\"Question: Musician and satirist Allie Goertz wrote a song about the \"The Simpsons\" character Milhouse, who Matt Groening named after who?\n", + "Thought 1: The question simplifies to \"The Simpsons\" character Milhouse is named after who. I only need to search Milhouse and find who it is named after.\n", + "Action 1: search(\"Milhouse\")\n", + "Observation 1: Milhouse Mussolini Van Houten is a recurring character in the Fox animated television series The Simpsons voiced by Pamela Hayden and created by Matt Groening.\n", + "Thought 2: The paragraph does not tell who Milhouse is named after, maybe I can look up \"named after\".\n", + "Action 2: lookup(\"named after\")\n", + "Observation 2: (Result 1 / 1) Milhouse was named after U.S. president Richard Nixon, whose middle name was Milhous. \n", + "Thought 3: Milhouse was named after U.S. president Richard Nixon, so the answer is Richard Nixon.\n", + "Action 3: finish(\"Richard Nixon\")\"\"\",\n", + " \"\"\"Question: Which documentary is about Finnish rock groups, Adam Clayton Powell or The Saimaa Gesture?\n", + "Thought 1: I need to search Adam Clayton Powell and The Saimaa Gesture, and find which documentary is about Finnish rock groups.\n", + "Action 1: search(\"Adam Clayton Powell\")\n", + "Observation 1: Could not find [\"Adam Clayton Powell\"]. Similar: ['Adam Clayton Powell III', 'Seventh Avenue (Manhattan)', 'Adam Clayton Powell Jr. 
State Office Building', 'Isabel Washington Powell', 'Adam Powell', 'Adam Clayton Powell (film)', 'Giancarlo Esposito'].\n", + "Thought 2: To find the documentary, I can search Adam Clayton Powell (film).\n", + "Action 2: search(\"Adam Clayton Powell (film)\")\n", + "Observation 2: Adam Clayton Powell is a 1989 American documentary film directed by Richard Kilberg.\n", + "The film is about the rise and fall of influential African-American politician Adam Clayton Powell Jr.[3][4] It was later aired as part of the PBS series The American Experience.\n", + "Thought 3: Adam Clayton Powell (film) is a documentary about an African-American politician, not Finnish rock groups. So the documentary about Finnish rock groups must instead be The Saimaa Gesture.\n", + "Action 3: finish(\"The Saimaa Gesture\")\"\"\",\n", + " \"\"\"Question: What profession does Nicholas Ray and Elia Kazan have in common?\n", + "Thought 1: I need to search Nicholas Ray and Elia Kazan, find their professions, then find the profession they have in common.\n", + "Action 1: search(\"Nicholas Ray\")\n", + "Observation 1: Nicholas Ray (born Raymond Nicholas Kienzle Jr., August 7, 1911 – June 16, 1979) was an American film director, screenwriter, and actor best known for the 1955 film Rebel Without a Cause.\n", + "Thought 2: Professions of Nicholas Ray are director, screenwriter, and actor. I need to search Elia Kazan next and find his professions.\n", + "Action 2: search(\"Elia Kazan\")\n", + "Observation 2: Elia Kazan was an American film and theatre director, producer, screenwriter and actor.\n", + "Thought 3: Professions of Elia Kazan are director, producer, screenwriter, and actor. 
So profession Nicholas Ray and Elia Kazan have in common is director, screenwriter, and actor.\n", + "Action 3: finish(\"director, screenwriter, actor\")\"\"\",\n", + " \"\"\"Question: Which magazine was started first Arthur's Magazine or First for Women?\n", + "Thought 1: I need to search Arthur's Magazine and First for Women, and find which was started first.\n", + "Action 1: search(\"Arthur's Magazine\")\n", + "Observation 1: Arthur's Magazine (1844-€“1846) was an American literary periodical published in Philadelphia in the 19th century. \n", + "Thought 2: Arthur's Magazine was started in 1844. I need to search First for Women next.\n", + "Action 2: search(\"First for Women\")\n", + "Observation 2: First for Women is a woman's magazine published by Bauer Media Group in the USA.[1] The magazine was started in 1989. \n", + "Thought 3: First for Women was started in 1989. 1844 (Arthur's Magazine) < 1989 (First for Women), so Arthur's Magazine was started first.\n", + "Action 3: finish(\"Arthur's Magazine\")\"\"\",\n", + " \"\"\"Question: Were Pavel Urysohn and Leonid Levin known for the same type of work?\n", + "Thought 1: I need to search Pavel Urysohn and Leonid Levin, find their types of work, then find if they are the same.\n", + "Action 1: search(\"Pavel Urysohn\")\n", + "Observation 1: Pavel Samuilovich Urysohn (February 3, 1898 – August 17, 1924) was a Soviet mathematician who is best known for his contributions in dimension theory.\n", + "Thought 2: Pavel Urysohn is a mathematician. I need to search Leonid Levin next and find its type of work.\n", + "Action 2: search(\"Leonid Levin\")\n", + "Observation 2: Leonid Anatolievich Levin is a Soviet-American mathematician and computer scientist. \n", + "Thought 3: Leonid Levin is a mathematician and computer scientist. So Pavel Urysohn and Leonid Levin have the same type of work. 
\n", + "Action 3: finish(\"yes\")\"\"\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# preset up the examples as prompt_kwargs, the examples will be included in the system prompt\n", + "\n", + "# convert this example in FunctionExpression\n", + "# import adalflow as adal\n", + "# from adalflow.core.types import FunctionExpression\n", + "\n", + "# eg_1 = {\n", + "# \"Question\": \"What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?\",\n", + "# \"Thought 1\": \"I need to search Colorado orogeny, find the area that the eastern sector of the Colorado orogeny extends into, then find the elevation range of the area.\",\n", + "# \"Action 1\": \"search\",\n", + "# \"kwargs\": {\"entity\": \"Colorado orogeny\"},\n", + "# \"Observation 1\": \"The Colorado orogeny was an episode of mountain building (an orogeny) in Colorado and surrounding areas.\",\n", + "# \"Thought 2\": \"It does not mention the eastern sector. So I need to look up eastern sector.\",\n", + "# \"Action 2\": \"lookup('eastern sector')\",\n", + "# \"Action 2\": \"lookup\",\n", + "# \"kwargs\": {\"text\": \"eastern sector\", \"keyword\": \"eastern sector\"},\n", + "# \"Observation 2\": \"(Result 1 / 1) The eastern sector extends into the High Plains and is called the Central Plains orogeny.\",\n", + "# \"Thought 3\": \"The eastern sector of Colorado orogeny extends into the High Plains. So I need to search High Plains and find its elevation range.\",\n", + "# \"Action 3\": \"search('High Plains')\",\n", + "# \"Observation 3\": \"High Plains refers to one of two distinct land regions:\",\n", + "# \"Thought 4\": \"I need to instead search High Plains (United States).\",\n", + "# \"Action 4\": \"search('High Plains (United States)')\",\n", + "# \"Observation 4\": \"The High Plains are a subregion of the Great Plains. 
From east to west, the High Plains rise in elevation from around 1,800 to 7,000 ft (550 to 2,130 m).[3]\",\n", + "# \"Thought 5\": \"High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.\",\n", + "# \"Action 5\": \"finish('1,800 to 7,000 ft')\"\n", + "# }\n", + "\n", + "# examples_expression = []\n", + "# for example in examples:\n", + "# ex_exp = FunctionExpression(thought=example)\n", + "\n", + "# preset_prompt_kwargs = {\"examples\": examples}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Model\n", + "\n", + "Next, we can choose the model to call. In this example we will use OpenAIClient ``gpt-3.5-turbo`` model. We will set the ``temperature`` at 0.0 to make the response as consistent as possible." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "gpt_model_kwargs = {\n", + " \"model\": \"gpt-3.5-turbo\",\n", + " \"temperature\": 0.0,\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Agent\n", + "Combining the previous components, we can define the agent." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ReActAgent(\n", + " max_steps=3, add_llm_as_fallback=True, \n", + " (tool_manager): ToolManager(Tools: [FunctionTool(fn: , async: False, definition: FunctionDefinition(func_name='search', func_desc='search(entity: str) -> str\\n\\n searches the exact entity on Wikipedia and returns the first paragraph if it exists. If not, it will return some similar entities to search.\\n ', func_parameters={'type': 'object', 'properties': {'entity': {'type': 'str'}}, 'required': ['entity']})), FunctionTool(fn: .llm_tool at 0x1379487c0>, async: False, definition: FunctionDefinition(func_name='llm_tool', func_desc=\"llm_tool(input: str) -> str\\nI answer any input query with llm's world knowledge. 
Use me as a fallback tool or when the query is simple.\", func_parameters={'type': 'object', 'properties': {'input': {'type': 'str'}}, 'required': ['input']})), FunctionTool(fn: .finish at 0x137948400>, async: False, definition: FunctionDefinition(func_name='finish', func_desc='finish(answer: str) -> str\\nFinish the task with answer.', func_parameters={'type': 'object', 'properties': {'answer': {'type': 'str'}}, 'required': ['answer']}))], Additional Context: {})\n", + " (planner): Generator(\n", + " model_kwargs={'model': 'gpt-3.5-turbo', 'temperature': 0.0}, trainable_prompt_kwargs=[]\n", + " (prompt): Prompt(\n", + " template: \n", + " {# role/task description #}\n", + " You are a helpful assistant.\n", + " Answer the user's query using the tools provided below with minimal steps and maximum accuracy.\n", + " {# REACT instructions #}\n", + " Each step you will read the previous Thought, Action, and Observation(execution result of the action) and then provide the next Thought and Action.\n", + " {# Tools #}\n", + " {% if tools %}\n", + " \n", + " You available tools are:\n", + " {% for tool in tools %}\n", + " {{ loop.index }}.\n", + " {{tool}}\n", + " ------------------------\n", + " {% endfor %}\n", + " \n", + " {% endif %}\n", + " {# output format and examples for output format #}\n", + " \n", + " {{output_format_str}}\n", + " \n", + " \n", + " {# Task specification to teach the agent how to think using 'divide and conquer' strategy #}\n", + " - For simple queries: Directly call the ``finish`` action and provide the answer.\n", + " - For complex queries:\n", + " - Step 1: Read the user query and potentially divide it into subqueries. And get started with the first subquery.\n", + " - Call one available tool at a time to solve each subquery/subquestion. \\\n", + " - At step 'finish', join all subqueries answers and finish the task.\n", + " Remember:\n", + " - Action must call one of the above tools with name. 
It can not be empty.\n", + " - You will always end with 'finish' action to finish the task. The answer can be the final answer or failure message.\n", + " \n", + " \n", + " -----------------\n", + " User query:\n", + " {{ input_str }}\n", + " {# Step History #}\n", + " {% if step_history %}\n", + " \n", + " Your previous steps:\n", + " {% for history in step_history %}\n", + " Step {{ loop.index }}.\n", + " \"Thought\": \"{{history.action.thought}}\",\n", + " \"Action\": \"{{history.action.action}}\",\n", + " \"Observation\": \"{{history.observation}}\"\n", + " ------------------------\n", + " {% endfor %}\n", + " \n", + " {% endif %}\n", + " You:, prompt_kwargs: {'tools': ['func_name: search\\nfunc_desc: \"search(entity: str) -> str\\\\n\\\\n searches the exact entity on Wikipedia\\\\\\n \\\\ and returns the first paragraph if it exists. If not, it will return some similar\\\\\\n \\\\ entities to search.\\\\n \"\\nfunc_parameters:\\n type: object\\n properties:\\n entity:\\n type: str\\n required:\\n - entity', \"func_name: llm_tool\\nfunc_desc: 'llm_tool(input: str) -> str\\n\\n I answer any input query with llm''s world knowledge. Use me as a fallback tool\\n or when the query is simple.'\\nfunc_parameters:\\n type: object\\n properties:\\n input:\\n type: str\\n required:\\n - input\", \"func_name: finish\\nfunc_desc: 'finish(answer: str) -> str\\n\\n Finish the task with answer.'\\nfunc_parameters:\\n type: object\\n properties:\\n answer:\\n type: str\\n required:\\n - answer\"], 'output_format_str': 'Your output should be formatted as a standard JSON instance with the following schema:\\n```\\n{\\n \"question\": \"The question to ask the LLM (Optional[str]) (optional)\",\\n \"thought\": \"Why the function is called (Optional[str]) (optional)\",\\n \"action\": \"FuncName() Valid function call expression. Example: \\\\\"FuncName(a=1, b=2)\\\\\" Follow the data type specified in the function parameters.e.g. 
for Type object with x,y properties, use \\\\\"ObjectType(x=1, y=2) (str) (required)\"\\n}\\n```\\nExamples:\\n```\\n{\\n \"question\": null,\\n \"thought\": \"I have finished the task.\",\\n \"action\": \"finish(answer=\\\\\"final answer: \\'answer\\'\\\\\")\"\\n}\\n________\\n```\\n-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\\n-Use double quotes for the keys and string values.\\n-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the JSON output.\\n-Follow the JSON formatting conventions.'}, prompt_variables: ['input_str', 'tools', 'step_history', 'output_format_str']\n", + " )\n", + " (model_client): OpenAIClient()\n", + " (output_processors): JsonOutputParser(\n", + " data_class=FunctionExpression, examples=[FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"final answer: \\'answer\\'\")')], exclude_fields=None, include_fields=None, return_data_class=True\n", + " (output_format_prompt): Prompt(\n", + " template: Your output should be formatted as a standard JSON instance with the following schema:\n", + " ```\n", + " {{schema}}\n", + " ```\n", + " {% if example %}\n", + " Examples:\n", + " ```\n", + " {{example}}\n", + " ```\n", + " {% endif %}\n", + " -Make sure to always enclose the JSON output in triple backticks (```). 
Please do not add anything other than valid JSON output!\n", + " -Use double quotes for the keys and string values.\n", + " -DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the JSON output.\n", + " -Follow the JSON formatting conventions., prompt_variables: ['example', 'schema']\n", + " )\n", + " (output_processors): JsonParser()\n", + " )\n", + " )\n", + ")" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# max_steps refers to how many thought-action round we allow the model to perform\n", + "# to save resources, let's use 3 here\n", + "agent = ReActAgent(\n", + " tools=tools,\n", + " max_steps=3,\n", + " model_client=OpenAIClient(),\n", + " model_kwargs=gpt_model_kwargs,\n", + " # preset_prompt_kwargs=preset_prompt_kwargs,\n", + ")\n", + "agent" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import importlib\n", + "import adalflow\n", + "\n", + "importlib.reload(adalflow)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 5. Q & A\n", + "Next we can use the agent to answer our questions. Let's run 5 examples. We will use the validation data." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],\n", + " num_rows: 7405\n", + "})" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "val_dataset = dataset[\"validation\"]\n", + "val_dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "``LightRAG`` provides a ``printc`` function. 
You can utilize it to show colored console output for the agent." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31m2024-12-19 13:58:48 - [react.py:285:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:49 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the nationalities of Scott Derrickson and Ed Wood.', action=\"search(entity='Scott Derrickson')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Scott Derrickson'}), observation='Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012), and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. He graduated from Biola University with a B.A.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:50 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for the nationality of Ed Wood.', action=\"search(entity='Ed Wood')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Ed Wood'}), observation=\"Edward Davis Wood Jr. (October 10, 1924\\xa0– December 10, 1978) was an American filmmaker, actor, and pulp novelist.. 
In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:51 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"Scott Derrickson and Ed Wood were both of American nationality.\"'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Scott Derrickson and Ed Wood were both of American nationality.'}), observation='Scott Derrickson and Ed Wood were both of American nationality.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:58:51 - [react.py:299:call] - answer:\n", + " Scott Derrickson and Ed Wood were both of American nationality.\u001b[0m\n", + "\u001b[33m2024-12-19 13:58:51 - [530968165.py:14:] - question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: scott derrickson and ed wood were both of american nationality\u001b[0m\n", + "\u001b[31m2024-12-19 13:58:51 - [react.py:285:call] - input_query: What 
government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:52 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"search(entity='Shirley Temple')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Shirley Temple'}), observation=\"This is an accepted version of this page. Shirley Temple Black (born Shirley Jane Temple; April 23, 1928 – February 10, 2014) was an American actress, singer, dancer, and diplomat, who was Hollywood's number-one box-office draw as a child actress from 1934 to 1938. Later, she was named United States Ambassador to Ghana and Czechoslovakia, and also served as Chief of Protocol of the United States.. Temple began her film career in 1931 when she was three years old and was well-known for her performance in Bright Eyes, which was released in 1934. 
She won a special Juvenile Academy Award in February 1935 for her outstanding contribution as a juvenile performer in motion pictures during 1934 and continued to appear in popular films through the remainder of the 1930s, although her subsequent films became less popular as she grew older.[1] She appeared in her last film, A Kiss for Corliss, in 1949.[2][3].\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:53 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have finished the task.', action=\"finish(answer='The government positions held by Shirley Temple, who portrayed Corliss Archer in the film Kiss and Tell, include being the United States Ambassador to Ghana and Czechoslovakia, as well as serving as Chief of Protocol of the United States.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The government positions held by Shirley Temple, who portrayed Corliss Archer in the film Kiss and Tell, include being the United States Ambassador to Ghana and Czechoslovakia, as well as serving as Chief of Protocol of the United States.'}), observation='The government positions held by Shirley Temple, who portrayed Corliss Archer in the film Kiss and Tell, include being the United States Ambassador to Ghana and Czechoslovakia, as well as serving as Chief of Protocol of the United States.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:58:53 - [react.py:299:call] - answer:\n", + " The government positions held by Shirley Temple, who portrayed Corliss Archer in the film Kiss and Tell, include being the United States Ambassador to Ghana and Czechoslovakia, as well as serving as Chief of Protocol of the United States.\u001b[0m\n", + "\u001b[33m2024-12-19 13:58:53 - [530968165.py:14:] - question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: 
government positions held by shirley temple who portrayed corliss archer in film kiss and tell include being united states ambassador to ghana and czechoslovakia as well as serving as chief of protocol of united states\u001b[0m\n", + "\u001b[31m2024-12-19 13:58:53 - [react.py:285:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:55 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question='What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?', thought='I will search for the science fantasy young adult series with companion books about enslaved worlds and alien species.', action=\"search(entity='science fantasy young adult series with companion books about enslaved worlds and alien species')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'science fantasy young adult series with companion books about enslaved worlds and alien species'}), observation=\"Could not find exact page for 'science fantasy young adult series with companion books about enslaved worlds and alien species'. 
Similar topics: ['Animorphs', 'Feminist science fiction', 'LGBT themes in speculative fiction', 'Apocalyptic and post-apocalyptic fiction', 'Last Legionary']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:56 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will use LLM as a fallback tool to provide information on the science fantasy young adult series with companion books about enslaved worlds and alien species.', action=\"llm_tool(input='What is a science fantasy young adult series with companion books about enslaved worlds and alien species?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'What is a science fantasy young adult series with companion books about enslaved worlds and alien species?'}), observation='One popular science fantasy young adult series that fits your description is the \"Lorien Legacies\" series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:57 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"One popular science fantasy young adult series that fits your description is the \\'Lorien Legacies\\' series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. 
The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"One popular science fantasy young adult series that fits your description is the 'Lorien Legacies' series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.\"}), observation=\"One popular science fantasy young adult series that fits your description is the 'Lorien Legacies' series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:58:57 - [react.py:299:call] - answer:\n", + " One popular science fantasy young adult series that fits your description is the 'Lorien Legacies' series by Pittacus Lore. The series includes companion books that delve into the enslaved worlds and alien species featured in the main novels. 
The series follows a group of alien teenagers known as the Garde who are on Earth hiding from their enemies, the Mogadorians, while developing their powers to fight back.\u001b[0m\n", + "\u001b[33m2024-12-19 13:58:57 - [530968165.py:14:] - question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: one popular science fantasy young adult series that fits your description is lorien legacies series by pittacus lore series includes companion books that delve into enslaved worlds and alien species featured in main novels series follows group of alien teenagers known as garde who are on earth hiding from their enemies mogadorians while developing their powers to fight back\u001b[0m\n", + "\u001b[31m2024-12-19 13:58:57 - [react.py:285:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", + "\u001b[34m2024-12-19 13:58:59 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for information about the locations of Laleli Mosque and Esma Sultan Mansion.', action=\"search(entity='Laleli Mosque')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Laleli Mosque'}), observation=\"The Laleli Mosque (Turkish: Laleli Camii, lit.\\u2009'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. 
The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. 
The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:00 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have found information about the Laleli Mosque. Now, I will search for information about the Esma Sultan Mansion.', action=\"search(entity='Esma Sultan Mansion')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Esma Sultan Mansion'}), observation='The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. 
Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:01 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have found information about both the Laleli Mosque and the Esma Sultan Mansion.', action='finish(answer=\"Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.'}), observation='Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:59:01 - [react.py:299:call] - answer:\n", + " Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. 
Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.\u001b[0m\n", + "\u001b[33m2024-12-19 13:59:01 - [530968165.py:14:] - question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: yes laleli mosque and esma sultan mansion are not located in same neighborhood laleli mosque is in laleli fatih istanbul while esma sultan mansion is in ortaköy neighborhood of istanbul\u001b[0m\n", + "\u001b[31m2024-12-19 13:59:01 - [react.py:285:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:03 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought=\"I will search for the director of the romantic comedy 'Big Stone Gap' on Wikipedia.\", action=\"search(entity='Big Stone Gap (film)')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Big Stone Gap (film)'}), observation=\"Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios, a subsidiary of Media Society. Based on Trigiani's 2000 best-selling novel of the same name, the story is set in the actual Virginia town of Big Stone Gap circa 1970s. The film had its world premiere at the Virginia Film Festival on November 6, 2014.[2]. The film was released on October 9, 2015, by Picturehouse.[3] The film was released in Blu-Ray by Universal Pictures Home Entertainment on February 2, 2016.. 
In 1978, 40-year-old independent woman Ave Maria Mulligan owns her dead father's pharmacy in her hometown of Big Stone Gap, Virginia.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:04 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"I will provide the director of the romantic comedy 'Big Stone Gap'.\", action='finish(answer=\"The director of the romantic comedy \\'Big Stone Gap\\' is Adriana Trigiani.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"The director of the romantic comedy 'Big Stone Gap' is Adriana Trigiani.\"}), observation=\"The director of the romantic comedy 'Big Stone Gap' is Adriana Trigiani.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:59:04 - [react.py:299:call] - answer:\n", + " The director of the romantic comedy 'Big Stone Gap' is Adriana Trigiani.\u001b[0m\n", + "\u001b[33m2024-12-19 13:59:04 - [530968165.py:14:] - question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: director of romantic comedy big stone gap is adriana trigiani\u001b[0m\n" + ] + } + ], + "source": [ + "from adalflow.utils.logger import printc\n", + "\n", + "num_questions = 5\n", + "for i in range(num_questions):\n", + " question = val_dataset[i][\"question\"]\n", + " gt_answer = normalize_answer(\n", + " val_dataset[i][\"answer\"]\n", + " ) # normalize the ground truth answer\n", + "\n", + " # get the agent's response\n", + " pred_answer = agent(question)\n", + " pred_answer = normalize_answer(pred_answer)\n", + "\n", + " printc(\n", + " f\"question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", + " color=\"yellow\",\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 6. 
Evaluation\n", + "\n", + "Now you will see that we have the ``exact correct answer`` for some questions:\n", + "\n", + "question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: ``yes``, pred answer: ``yes``\n", + "\n", + "question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: ``animorphs``, pred answer: ``animorphs``\n", + "\n", + "Sometimes the agent answers correctly but not in the same format as the ground truth. E.g. ground truth: ``no``, pred answer: ``no, they are not the same``. This is what we can tolerate.\n", + "\n", + "But how do we know whether the agent is doing well, or whether our tools, examples, and prompt implementations work well? We need to evaluate it.\n", + "\n", + "1. Exact Match (EM)\n", + "Exact Match is what the paper is using. Only when the normalized agent response is the same as the ground truth answer, we count it as correct. The paper's EM for react agent is around 30% (gpt-3).\n", + "\n", + "2. Fuzzy Match (FM)\n", + "EM doesn't make much sense in question answering. So we propose fuzzy match based on the LLMs' lengthy output nature. If the ground truth answer is included in the agent response, then we count it as correct. FM is not necessarily correct.\n", + "E.g. question: Harry Potter and Dumbledore, who is older? ground truth: ``dumbledore``, pred answer: ``harry potter is older than dumbledore.``\n", + "The model mentioned the ground truth but still provided the wrong answer, so FM serves only as a reference.\n", + "\n", + "Let's use ``LightRAG eval`` module and evaluate on 10 questions and keep the model's practice to set ``max_steps`` at `7`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31m2024-12-19 13:59:56 - [react.py:285:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:57 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the nationalities of Scott Derrickson and Ed Wood.', action=\"search(entity='Scott Derrickson')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Scott Derrickson'}), observation='Scott Derrickson (born July 16, 1966) is an American filmmaker. He is best known for his work in the horror genre, directing films such as The Exorcism of Emily Rose (2005), Sinister (2012), and The Black Phone (2021). He is also known for the superhero film Doctor Strange (2016), based on the Marvel Comics character.. Scott Derrickson grew up in Denver, Colorado. He graduated from Biola University with a B.A.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:58 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for the nationality of Ed Wood.', action=\"search(entity='Ed Wood')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Ed Wood'}), observation=\"Edward Davis Wood Jr. (October 10, 1924\\xa0– December 10, 1978) was an American filmmaker, actor, and pulp novelist.. 
In the 1950s, Wood directed several low-budget science fiction, crime and horror films that later became cult classics, notably Glen or Glenda (1953), Jail Bait (1954), Bride of the Monster (1955), Plan 9 from Outer Space (1957) and Night of the Ghouls (1959).[1] In the 1960s and 1970s, he moved towards sexploitation and pornographic films such as The Sinister Urge (1960), Orgy of the Dead (1965) and Necromania (1971), and wrote over 80 lurid pulp crime and sex novels.. Notable for their campy aesthetics, technical errors, unsophisticated special effects, use of poorly-matched stock footage, eccentric casts, idiosyncratic stories and non sequitur dialogue, Wood's films remained largely obscure until he was posthumously awarded a Golden Turkey Award for Worst Director of All Time in 1980, renewing public interest in his life and work.[2]. Following the publication of Rudolph Grey's 1992 oral biography Nightmare of Ecstasy: The Life and Art of Edward D.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 13:59:59 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the search for the nationalities of Scott Derrickson and Ed Wood.', action='finish(answer=\"Scott Derrickson is American and Ed Wood was also American.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Scott Derrickson is American and Ed Wood was also American.'}), observation='Scott Derrickson is American and Ed Wood was also American.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 13:59:59 - [react.py:299:call] - answer:\n", + " Scott Derrickson is American and Ed Wood was also American.\u001b[0m\n", + "\u001b[33m2024-12-19 13:59:59 - [2823432734.py:31:] - No. 
1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: scott derrickson is american and ed wood was also american\u001b[0m\n", + "\u001b[31m2024-12-19 13:59:59 - [react.py:285:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:00 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"search(entity='Corliss Archer')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Corliss Archer'}), observation='Fictional teenage girl Corliss Archer is the lead character in a series of American short stories written by F. Hugh Herbert starting in 1943. She also appears in these derivative works:. Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us. HelpLearn to editCommunity portalRecent changesUpload file.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:02 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"search(entity='Kiss and Tell film cast')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Kiss and Tell film cast'}), observation=\"Could not find exact page for 'Kiss and Tell film cast'. 
Similar topics: ['Kiss & Tell (1997 film)', 'Kiss Me (2014 film)', 'Kiss Kiss Bang Bang', 'Kiss and Tell (1945 film)', 'Kiss the Girls (1997 film)']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:04 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I will search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"search(entity='Kiss and Tell (1945 film) cast')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Kiss and Tell (1945 film) cast'}), observation=\"Could not find exact page for 'Kiss and Tell (1945 film) cast'. Similar topics: ['Kiss and Tell (1945 film)', 'Kiss and Tell (play)', 'Kiss Them for Me (film)', 'A Kiss for Corliss', 'State Fair (1945 film)']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:05 - [react.py:265:_run_one_step] - Step 4: \n", + "StepOutput(step=4, action=FunctionExpression(question=None, thought='I have finished the search for the woman who portrayed Corliss Archer in the film Kiss and Tell.', action='finish(answer=\"Unfortunately, I couldn\\'t find the specific information about the woman who portrayed Corliss Archer in the film Kiss and Tell.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"Unfortunately, I couldn't find the specific information about the woman who portrayed Corliss Archer in the film Kiss and Tell.\"}), observation=\"Unfortunately, I couldn't find the specific information about the woman who portrayed Corliss Archer in the film Kiss and Tell.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:05 - [react.py:299:call] - answer:\n", + " Unfortunately, I couldn't find the specific information about the woman who portrayed Corliss Archer in the film Kiss and Tell.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:05 - [2823432734.py:31:] - No. 
2, question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: unfortunately i couldnt find specific information about woman who portrayed corliss archer in film kiss and tell\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:05 - [react.py:285:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:07 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the science fantasy young adult series described by the user.', action=\"search(entity='science fantasy young adult series')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'science fantasy young adult series'}), observation=\"Could not find exact page for 'science fantasy young adult series'. Similar topics: ['Ballantine Adult Fantasy series', 'Red Queen (novel)', 'Brandon Sanderson', 'The Mortal Instruments', 'V. E. Schwab']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:09 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for companion books narrating the stories of enslaved worlds and alien species related to the previous search.', action=\"search(entity='companion books enslaved worlds alien species')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'companion books enslaved worlds alien species'}), observation=\"Could not find exact page for 'companion books enslaved worlds alien species'. 
Similar topics: ['List of Doctor Who universe creatures and aliens', 'List of Star Wars species (P–T)', 'List of companions in Doctor Who spin-offs', 'Extraterrestrial life', 'List of Known Space characters']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:11 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I will search for companion books narrating the stories of enslaved worlds and alien species related to the previous search.', action=\"search(entity='companion books narrating stories enslaved worlds alien species')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'companion books narrating stories enslaved worlds alien species'}), observation=\"Could not find exact page for 'companion books narrating stories enslaved worlds alien species'. Similar topics: ['Animorphs', 'Eighth Doctor', 'Apocalyptic and post-apocalyptic fiction', 'Barsoom', 'Wicked Witch of the West']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:13 - [react.py:265:_run_one_step] - Step 4: \n", + "StepOutput(step=4, action=FunctionExpression(question=None, thought='I have exhausted the search for the specific series and companion books. I will use the LLM tool to provide a relevant answer.', action=\"llm_tool(input='What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?'}), observation='The science fantasy young adult series you are referring to is \"The Lorien Legacies\" series by Pittacus Lore. 
The companion books you mentioned are called \"The Lost Files,\" which narrate the stories of enslaved worlds and alien species in the same universe as the main series.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:14 - [react.py:265:_run_one_step] - Step 5: \n", + "StepOutput(step=5, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"The science fantasy young adult series you are referring to is \\'The Lorien Legacies\\' series by Pittacus Lore. The companion books you mentioned are called \\'The Lost Files,\\' which narrate the stories of enslaved worlds and alien species in the same universe as the main series.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"The science fantasy young adult series you are referring to is 'The Lorien Legacies' series by Pittacus Lore. The companion books you mentioned are called 'The Lost Files,' which narrate the stories of enslaved worlds and alien species in the same universe as the main series.\"}), observation=\"The science fantasy young adult series you are referring to is 'The Lorien Legacies' series by Pittacus Lore. The companion books you mentioned are called 'The Lost Files,' which narrate the stories of enslaved worlds and alien species in the same universe as the main series.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:14 - [react.py:299:call] - answer:\n", + " The science fantasy young adult series you are referring to is 'The Lorien Legacies' series by Pittacus Lore. The companion books you mentioned are called 'The Lost Files,' which narrate the stories of enslaved worlds and alien species in the same universe as the main series.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:14 - [2823432734.py:31:] - No. 
3, question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: science fantasy young adult series you are referring to is lorien legacies series by pittacus lore companion books you mentioned are called lost files which narrate stories of enslaved worlds and alien species in same universe as main series\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:14 - [react.py:285:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:16 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for information about the locations of Laleli Mosque and Esma Sultan Mansion.', action=\"search(entity='Laleli Mosque')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Laleli Mosque'}), observation=\"The Laleli Mosque (Turkish: Laleli Camii, lit.\\u2009'Tulip Mosque') is an 18th-century Ottoman imperial mosque located in Laleli, Fatih, Istanbul, Turkey.[1]. The mosque was commissioned by Sultan Mustafa III to serve as his imperial or sultanic mosque.[2][3] Although it was tradition among earlier sultans to build their imperial mosque in commemoration of a major military success, Mustafa III ignored this tradition by ordering the construction before any such victories.[3] Construction began on 5 April 1760 and was completed on 9 March 1764.[4][3] According to a contemporary writer, the mosque was officially named Nur Mustafa ('Light of Mustafa'), but it became popularly known as the Laleli Mosque ('Mosque of the Tulips') after the name of the neighbourhood where it was built.[3]. 
The architect of the mosque is not confirmed by historical documentation, but art historians have attributed the mosque to Mehmed Tahir Agha, the chief imperial architect at the time of the mosque's completion.[a][2][4][5] On average, about 770 workers were employed in the project and about two thirds of them were non-Muslims, the rest being Muslim.[5]. The mosque was the centerpiece of a larger complex (külliye) that included the Mustafa III's tomb, a nearby caravanserai which provided some revenues to the complex, a sebil, and a madrasa.[6] Mustafa III was buried in the mausoleum attached to the complex after his death in 1774.[7] The mosque and its complex were damaged by the 1766 earthquake[4] and, according to Ünver Rüstem, by a fire in 1783.[7] In 1783 it was fully restored.[7][2] The restoration, which Doğan Kuban attributes to the architect Seyit Mustafa Agha,[4][2] preserved the original mosque's appearance.[7] The mausoleum's façade was updated with new marble window frames in the early 19th century.[6] The madrasa of the complex was destroyed by fire in 1911.[6]. 
The mosque was built in the Ottoman Baroque style of its time.[8][9][10] The layout is based on that of the earlier Selimiye Mosque of Edirne from the classical period, in accordance with Mustafa III's wishes.[11][5] The decoration of the mosque and its complex is firmly baroque.[12][2] The mosque incorporates thus the visual style of the earlier Nuruosmaniye Mosque – the first imperial mosque in the Ottoman Baroque style, completed by Mustafa III's predecessor – but in a more restrained way that integrates it with more traditional Ottoman architecture.[13].\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:17 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for information about the location of Esma Sultan Mansion.', action=\"search(entity='Esma Sultan Mansion')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Esma Sultan Mansion'}), observation='The Esma Sultan Mansion (Turkish: Esma Sultan Yalısı), a historical yalı located on the Bosphorus in the Ortaköy neighborhood of Istanbul, Turkey and named after its original owner Princess Esma Sultan, is used today as a cultural center after being redeveloped.. The three-storey brick manor was designed by the renowned architect Sarkis Balyan and finished in 1875 next to Ortaköy Mosque. It was presented to the Princess Esma Sultan, the daughter of Ottoman Sultan Abdulaziz, as a wedding gift in 1889.. The mansion remained in the possession of the Ottoman dynasty until 1915. 
Subsequently, the building was used first as a tobacco warehouse and then as a coal depot from 1920 until 1975 when it was destroyed by a fire.[1].')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:18 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.'}), observation='Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:18 - [react.py:299:call] - answer:\n", + " Yes, the Laleli Mosque and Esma Sultan Mansion are not located in the same neighborhood. Laleli Mosque is in Laleli, Fatih, Istanbul, while Esma Sultan Mansion is in the Ortaköy neighborhood of Istanbul.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:18 - [2823432734.py:31:] - No. 
4, question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: yes laleli mosque and esma sultan mansion are not located in same neighborhood laleli mosque is in laleli fatih istanbul while esma sultan mansion is in ortaköy neighborhood of istanbul\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:18 - [react.py:285:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:19 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought=\"I will search for the director of the romantic comedy 'Big Stone Gap' on Wikipedia.\", action=\"search(entity='Big Stone Gap (film)')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Big Stone Gap (film)'}), observation=\"Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios, a subsidiary of Media Society. Based on Trigiani's 2000 best-selling novel of the same name, the story is set in the actual Virginia town of Big Stone Gap circa 1970s. The film had its world premiere at the Virginia Film Festival on November 6, 2014.[2]. The film was released on October 9, 2015, by Picturehouse.[3] The film was released in Blu-Ray by Universal Pictures Home Entertainment on February 2, 2016.. 
In 1978, 40-year-old independent woman Ave Maria Mulligan owns her dead father's pharmacy in her hometown of Big Stone Gap, Virginia.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:20 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"I will search for the director of the romantic comedy 'Big Stone Gap' on Wikipedia.\", action=\"search(entity='Adriana Trigiani')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Adriana Trigiani'}), observation=\"Adriana Trigiani is an American best-selling author of eighteen books, playwright, television writer/producer, film director/screenwriter/producer, and entrepreneur based in New York City. Trigiani has published a novel a year since 2000.[1]. Trigiani graduated from Saint Mary's College in Indiana in 1981.[2]. Inspired by her Italian American heritage and Appalachian childhood in Big Stone Gap, Virginia, Trigiani arrived in New York in 1985. 
Trigiani made her off-Broadway debut in New York City as a playwright in 1985 at the Manhattan Theater Club with Secrets of the Lava Lamp, directed by Stuart Ross.[3] From 1988 to 1998, she created scripts for television sitcoms, including The Cosby Show (1984) and its spin-off A Different World (1987).\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:21 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action=\"finish(answer='Adriana Trigiani is the director of the romantic comedy film Big Stone Gap and is based in New York City.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Adriana Trigiani is the director of the romantic comedy film Big Stone Gap and is based in New York City.'}), observation='Adriana Trigiani is the director of the romantic comedy film Big Stone Gap and is based in New York City.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:21 - [react.py:299:call] - answer:\n", + " Adriana Trigiani is the director of the romantic comedy film Big Stone Gap and is based in New York City.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:21 - [2823432734.py:31:] - No. 
5, question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: adriana trigiani is director of romantic comedy film big stone gap and is based in new york city\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:21 - [react.py:285:call] - input_query: 2014 S/S is the debut album of a South Korean boy group that was formed by who?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:22 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for information about the debut album of the South Korean boy group.', action=\"search(entity='2014 S/S (album)')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': '2014 S/S (album)'}), observation='2014 S/S is the debut album of South Korean group Winner. It was released on August 12, 2014, by the group\\'s record label, YG Entertainment. The members were credited for writing the lyrics and composing the majority of the album\\'s songs.[2]. The members produced the majority of the music for this album themselves, with the help of other producers such as Choice 37, B.I., Airplay, and others.[3] The album was highlighted for incorporating elements generally absent from K-pop releases, including hints of acoustic and alternative rock.[4][5]. 
The title track \"Empty\" became a hit in South Korea, topping the Gaon Chart and Billboard\\'s K-Pop Hot 100 chart,[6][7] and topping 7 other South Korean charts.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:24 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question='The question to ask the LLM (Optional[str]) (optional)', thought='I will find out who formed the South Korean boy group Winner.', action=\"search(entity='Winner (group)')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Winner (group)'}), observation=\"Could not find exact page for 'Winner (group)'. Similar topics: ['2026 FIFA World Cup', '2025 FIFA Club World Cup', '2025 Africa Cup of Nations', 'Reality Winner', 'Dana Winner']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:26 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I will search for the formation of the South Korean boy group Winner through another related search.', action=\"search(entity='Winner (South Korean group) formation')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Winner (South Korean group) formation'}), observation=\"Could not find exact page for 'Winner (South Korean group) formation'. Similar topics: ['Blackpink', 'Winner (band)', 'Babymonster', 'BigBang (South Korean band)', 'Seventeen (South Korean band)']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:27 - [react.py:265:_run_one_step] - Step 4: \n", + "StepOutput(step=4, action=FunctionExpression(question=None, thought='I have exhausted the search options related to the formation of Winner. 
I will provide a general response.', action=\"llm_tool(input='Who formed the South Korean boy group Winner?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Who formed the South Korean boy group Winner?'}), observation='The South Korean boy group Winner was formed by YG Entertainment through the reality survival program called \"WIN: Who is Next\" in 2013.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:28 - [react.py:265:_run_one_step] - Step 5: \n", + "StepOutput(step=5, action=FunctionExpression(question=None, thought='I have exhausted the search options related to the formation of Winner. I will provide a general response.', action='finish(answer=\"YG Entertainment formed the South Korean boy group Winner through the reality survival program called \\'WIN: Who is Next\\' in 2013.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"YG Entertainment formed the South Korean boy group Winner through the reality survival program called 'WIN: Who is Next' in 2013.\"}), observation=\"YG Entertainment formed the South Korean boy group Winner through the reality survival program called 'WIN: Who is Next' in 2013.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:28 - [react.py:299:call] - answer:\n", + " YG Entertainment formed the South Korean boy group Winner through the reality survival program called 'WIN: Who is Next' in 2013.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:28 - [2823432734.py:31:] - No. 
6, question: 2014 S/S is the debut album of a South Korean boy group that was formed by who?, ground truth: yg entertainment, pred answer: yg entertainment formed south korean boy group winner through reality survival program called win who is next in 2013\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:28 - [react.py:285:call] - input_query: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:29 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about a person known by the stage name Aladin who helped organizations improve their performance as a consultant.', action=\"search(entity='Aladin')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Aladin'}), observation='Aladdin is a folk tale of Middle Eastern origin.. Aladdin, Aladin, and variants such as Aladdin and His Magic Lamp, may also refer to:. Main pageContentsCurrent eventsRandom articleAbout WikipediaContact us. HelpLearn to editCommunity portalRecent changesUpload file. Donate.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:31 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"The search for Aladin did not provide relevant information. Let's try to find the person known by the stage name Aladin who helped organizations improve their performance as a consultant.\", action=\"search(entity='Aladin consultant')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Aladin consultant'}), observation=\"Could not find exact page for 'Aladin consultant'. 
Similar topics: ['2009 in film', 'Magicians (2007 film)', 'Liberal Party of Canada candidates in the 2019 Canadian federal election', 'Ashchorjyo Prodeep', 'Waleed El-Ansary']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:33 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought=\"The search for Aladin consultant did not provide relevant information. Let's try to find the person known by the stage name Aladin who helped organizations improve their performance as a consultant.\", action=\"search(entity='Aladin performance improvement consultant')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Aladin performance improvement consultant'}), observation=\"Could not find exact page for 'Aladin performance improvement consultant'. Similar topics: ['Five Pillars of Islam', 'Signal-to-noise ratio', 'Islamic Golden Age', 'Travelling salesman problem', 'Clock']\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:35 - [react.py:265:_run_one_step] - Step 4: \n", + "StepOutput(step=4, action=FunctionExpression(question=None, thought=\"The search for Aladin with different variations did not yield relevant information. Let's use LLM as a fallback tool to answer the query.\", action=\"llm_tool(input='Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?'}), observation='The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. 
He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:36 - [react.py:265:_run_one_step] - Step 5: \n", + "StepOutput(step=5, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.'}), observation='The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:36 - [react.py:299:call] - answer:\n", + " The person known by his stage name Aladin who helped organizations improve their performance as a consultant is Aladin Akyurek. He is a renowned consultant, speaker, and author who specializes in organizational development and performance improvement.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:36 - [2823432734.py:31:] - No. 
7, question: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?, ground truth: eenasul fateh, pred answer: person known by his stage name aladin who helped organizations improve their performance as consultant is aladin akyurek he is renowned consultant speaker and author who specializes in organizational development and performance improvement\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:36 - [react.py:285:call] - input_query: The arena where the Lewiston Maineiacs played their home games can seat how many people?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:37 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for the arena where the Lewiston Maineiacs played their home games.', action=\"search(entity='Androscoggin Bank Colisée')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Androscoggin Bank Colisée'}), observation='The Colisée, formerly Androscoggin Bank Colisée, Central Maine Youth Center, Central Maine Civic Center, and Lewiston Colisee, is a 4,000 capacity (3,677 seated) multi-purpose arena, in Lewiston, Maine, United States that was built from 1956 to 1958 and opened in the latter year to replace the Dominics Regional High School Arena and was initially constructed and operated by the Catholic parish of SS. Peter and Paul. It was used as a replacement venue for the boxing match between Muhammad Ali and Sonny Liston in 1965.. Currently, it is the home of the Maine Nordiques of the North American Hockey League. 
The Colisée is also used for concerts, conventions and trade shows.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:38 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have finished the task.', action=\"finish(answer='The arena where the Lewiston Maineiacs played their home games, Androscoggin Bank Colisée, can seat 4,000 people (3,677 seated).' )\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The arena where the Lewiston Maineiacs played their home games, Androscoggin Bank Colisée, can seat 4,000 people (3,677 seated).'}), observation='The arena where the Lewiston Maineiacs played their home games, Androscoggin Bank Colisée, can seat 4,000 people (3,677 seated).')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:38 - [react.py:299:call] - answer:\n", + " The arena where the Lewiston Maineiacs played their home games, Androscoggin Bank Colisée, can seat 4,000 people (3,677 seated).\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:38 - [2823432734.py:31:] - No. 
8, question: The arena where the Lewiston Maineiacs played their home games can seat how many people?, ground truth: 3677 seated, pred answer: arena where lewiston maineiacs played their home games androscoggin bank colisée can seat 4000 people 3677 seated\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:38 - [react.py:285:call] - input_query: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:39 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='I will search for information about Annie Morton and Terry Richardson to determine who is older.', action=\"search(entity='Annie Morton')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Annie Morton'}), observation='Annie Morton (born October 8, 1970) is an American model[1] born in Pennsylvania.[2] She has appeared on the covers of British Vogue, ID, Marie Claire, and other magazines. She has been photographed by Helmut Newton; Peter Lindbergh; Annie Leibovitz; Richard Avedon; Juergen Teller;[3] Paul Jasmin, Mary Ellen Mark, Stephen Shames, and Terry Richardson, and modeled for Donna Karan,[4] Givenchy, Guerlain, Chanel, Harper\\'s Bazaar, Sports Illustrated and Victoria\\'s Secret.[5] A long time vegetarian, an advocate for organic lifestyle choices and natural healthcare. She co-founded Tsi-La Organics, a \"Green Luxury\" company that creates and sells vegan, organic perfume and skin care products.[6]. She has appeared on many magazine covers and has been featured in several professional photobooks, Peter Lindbergh Selected Work, the cover of Juergen Teller By Juergen Teller, Helmut Newton Pages From The Glossies, and Donna Karan\\'s book A Journey Of A Woman 20 Years.[7] In 1997 she posed for the Pirelli Calendar with Richard Avedon.. 
Morton has also appeared in several music videos.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:41 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I will search for information about Terry Richardson to determine his age.', action=\"search(entity='Terry Richardson')\"), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Terry Richardson'}), observation=\"Terrence Richardson (born August 14, 1965) is an American fashion and portrait photographer. He has shot advertising campaigns for Marc Jacobs, Aldo, Supreme, Sisley, Tom Ford, and Yves Saint Laurent among others, and also done work for magazines such as Rolling Stone, GQ, Vogue, Vanity Fair, Harper's Bazaar, i-D, and Vice.. Since 2001, Richardson has been accused by multiple models of sexual misconduct.[2][3][4][5] In 2017, brands and magazines that had worked with Richardson in the past began distancing themselves from him, and said they would no longer employ him.[6] He has not actively worked as a photographer since 2018.[7]. Richardson was born in New York City, the son of Norma Kessler, an actress,[8][9] and Bob Richardson, a fashion photographer who struggled with schizophrenia and drug abuse.[10] Richardson's father was Irish Catholic and his mother is Jewish.[11] Following the divorce of his parents, Richardson moved to Woodstock, New York, with his mother and stepfather, English guitarist Jackie Lomax.[8] Richardson later moved to the Hollywood neighborhood of Los Angeles, where he attended Hollywood High School.[12]. 
He moved with his mother to Ojai, California, where he attended Nordhoff High School, when he was 16.[13] Richardson originally wanted to be a punk rock musician rather than a photographer.[13] He played bass guitar in the punk rock band The Invisible Government for four years.[14] He played bass for a variety of other punk bands in Southern California including Signal Street Alcoholics, Doggy Style, Baby Fist and Middle Finger.[8][15].\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:42 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"Annie Morton was born on October 8, 1970, making her older than Terry Richardson who was born on August 14, 1965.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Annie Morton was born on October 8, 1970, making her older than Terry Richardson who was born on August 14, 1965.'}), observation='Annie Morton was born on October 8, 1970, making her older than Terry Richardson who was born on August 14, 1965.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:42 - [react.py:299:call] - answer:\n", + " Annie Morton was born on October 8, 1970, making her older than Terry Richardson who was born on August 14, 1965.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:42 - [2823432734.py:31:] - No. 
9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: terry richardson, pred answer: annie morton was born on october 8 1970 making her older than terry richardson who was born on august 14 1965\u001b[0m\n", + "\u001b[31m2024-12-19 14:00:42 - [react.py:285:call] - input_query: Are Local H and For Against both from the United States?\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:43 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Checking the origin of the bands Local H and For Against.', action='search(entity=\"Local H\")'), function=Function(thought=None, name='search', args=[], kwargs={'entity': 'Local H'}), observation=\"Local H is an American rock band originally formed by guitarist and vocalist Scott Lucas, bassist Matt Garcia, drummer Joe Daniels, and lead guitarist John Sparkman in Zion, Illinois in 1990. The members all met in high school in 1987 and founded Local H three years later. After Sparkman's departure in 1991 and Garcia's departure in 1993, Local H continued as an unorthodox two-piece setup.. Local H signed a record contract with Island Records in 1994, where they would go on to release three albums. The band's debut album, Ham Fisted (1995), was not a success and the band was nearly dropped, but the band remained on the label long enough to release their second album As Good as Dead (1996).\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:00:44 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have finished the task.', action='finish(answer=\"Yes, Local H is from the United States. Now, let\\'s check For Against\\'s origin.\"'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"Yes, Local H is from the United States. Now, let's check For Against's origin.\"}), observation=\"Yes, Local H is from the United States. 
Now, let's check For Against's origin.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:00:44 - [react.py:299:call] - answer:\n", + " Yes, Local H is from the United States. Now, let's check For Against's origin.\u001b[0m\n", + "\u001b[33m2024-12-19 14:00:44 - [2823432734.py:31:] - No. 10, question: Are Local H and For Against both from the United States?, ground truth: yes, pred answer: yes local h is from united states now lets check for againsts origin\u001b[0m\n", + "EM = EvaluationResult(avg_score=0.0, per_item_scores=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], additional_info=None), FM = EvaluationResult(avg_score=0.5, per_item_scores=[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0], additional_info=None), average time = 4.769389891624451\n" + ] + } + ], + "source": [ + "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n", + "\n", + "# set up evaluation type\n", + "EM_evaluator = AnswerMatchAcc(type=\"exact_match\")\n", + "FM_evaluator = AnswerMatchAcc(type=\"fuzzy_match\")\n", + "\n", + "agent = ReActAgent(\n", + " tools=tools,\n", + " max_steps=7,\n", + " model_client=OpenAIClient(),\n", + " model_kwargs=gpt_model_kwargs,\n", + " # preset_prompt_kwargs=preset_prompt_kwargs,\n", + ")\n", + "\n", + "num_questions = 10\n", + "gt_answers = []\n", + "pred_answers = []\n", + "start_time = time.time()\n", + "for i in range(num_questions):\n", + " question = val_dataset[i][\"question\"]\n", + " gt_answer = normalize_answer(\n", + " val_dataset[i][\"answer\"]\n", + " ) # normalize the ground truth answer\n", + " gt_answers.append(gt_answer)\n", + "\n", + " # get the agent's response\n", + " pred_answer = agent(question)\n", + " pred_answer = normalize_answer(pred_answer)\n", + " pred_answers.append(pred_answer)\n", + "\n", + " printc(\n", + " f\"No. 
{i+1}, question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", + " color=\"yellow\",\n", + " )\n", + "\n", + "end_time = time.time()\n", + "\n", + "em = EM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", + "fm = FM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", + "avg_time = (end_time - start_time) / num_questions\n", + "\n", + "print(f\"EM = {em}, FM = {fm}, average time = {avg_time}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the run recorded above, the EM is 0.0 and the FM is 0.5. Each query takes about 4.8s on average. (The performance also depends on whether the Wikipedia queries succeed.)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What if we use a plain LLM to answer these questions? To test this, we just need to remove the tools. The agent has built-in ``llm_tool`` and ``finish`` tools that automatically handle the query. ``llm_tool`` uses the same model as the agent. We can't add the examples because they would mislead the agent into using non-existent tools." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[31m2024-12-19 14:01:27 - [react.py:285:call] - input_query: Were Scott Derrickson and Ed Wood of the same nationality?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:29 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Checking the nationality of Scott Derrickson and Ed Wood.', action=\"llm_tool(input='What is the nationality of Scott Derrickson?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'What is the nationality of Scott Derrickson?'}), observation='Scott Derrickson is an American filmmaker.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:30 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='Checking the nationality of Ed Wood.', action=\"llm_tool(input='What is the nationality of Ed Wood?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'What is the nationality of Ed Wood?'}), observation='Ed Wood was an American filmmaker.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:31 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought='Confirming if Scott Derrickson and Ed Wood were of the same nationality.', action=\"finish(answer='Yes, both Scott Derrickson and Ed Wood were American filmmakers, so they were of the same nationality.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Yes, both Scott Derrickson and Ed Wood were American filmmakers, so they were of the same nationality.'}), observation='Yes, both Scott Derrickson and Ed Wood were American filmmakers, so they were of the same nationality.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:31 - 
[react.py:299:call] - answer:\n", + " Yes, both Scott Derrickson and Ed Wood were American filmmakers, so they were of the same nationality.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:31 - [3974127232.py:27:] - No. 1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: yes, pred answer: yes both scott derrickson and ed wood were american filmmakers so they were of same nationality\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:31 - [react.py:285:call] - input_query: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:32 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about the government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"llm_tool(input='government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell'}), observation='The government position held by the woman who portrayed Corliss Archer in the film \"Kiss and Tell\" is Shirley Temple. 
She served as the United States Ambassador to Ghana and later to Czechoslovakia.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:33 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='The user is asking about the government position held by the woman who portrayed Corliss Archer in the film Kiss and Tell.', action=\"finish(answer='Shirley Temple served as the United States Ambassador to Ghana and later to Czechoslovakia.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Shirley Temple served as the United States Ambassador to Ghana and later to Czechoslovakia.'}), observation='Shirley Temple served as the United States Ambassador to Ghana and later to Czechoslovakia.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:33 - [react.py:299:call] - answer:\n", + " Shirley Temple served as the United States Ambassador to Ghana and later to Czechoslovakia.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:33 - [3974127232.py:27:] - No. 
2, question: What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?, ground truth: chief of protocol, pred answer: shirley temple served as united states ambassador to ghana and later to czechoslovakia\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:33 - [react.py:285:call] - input_query: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:36 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Identifying the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.', action=\"llm_tool(input='Science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species'}), observation='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\n", + "_______\n", + "\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Error at parsing JSON string: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... 
='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\n", + "Error in parsing JSON to JSON: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. 
Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\n", + "Error processing the output processors: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\n", + "Error planning step 2: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. 
This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\n", + "Error running step 3: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n", + "Error running step 4: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n", + "Error running step 5: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n", + "Error running step 6: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n", + "Error running step 7: Error rendering Jinja2 template: 'None' has no attribute 'thought'\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m2024-12-19 14:01:37 - [react.py:299:call] - answer:\n", + " Error planning step 2: Error: Got invalid JSON object with yaml.safe_load. Error: while parsing a flow mapping\n", + " in \"\", line 1, column 1:\n", + " {\n", + " ^\n", + "expected ',' or '}', but got ''\n", + " in \"\", line 4, column 61:\n", + " ... ='I recommend checking out the \"Lorien Legacies\" series by Pitta ... \n", + " ^. Got JSON string: {\n", + " \"question\": null,\n", + " \"thought\": \"Providing the information about the science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species.\",\n", + " \"action\": \"finish(answer='I recommend checking out the \"Lorien Legacies\" series by Pittacus Lore. This science fiction young adult series follows the story of nine alien teenagers who escape their home planet, Lorien, which has been invaded by the evil Mogadorians. 
Each book in the series focuses on one of the teenagers and their journey to save Earth and their own species. Additionally, there are companion books that delve into the histories of the enslaved worlds and alien species featured in the main series.')\"\n", + "}\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:37 - [3974127232.py:27:] - No. 3, question: What science fantasy young adult series, told in first person, has a set of companion books narrating the stories of enslaved worlds and alien species?, ground truth: animorphs, pred answer: error planning step 2 error got invalid json object with yamlsafeload error while parsing flow mapping in unicode string line 1 column 1 expected or but got scalar in unicode string line 4 column 61 i recommend checking out lorien legacies series by pitta got json string question null thought providing information about science fantasy young adult series with companion books narrating stories of enslaved worlds and alien species action finishansweri recommend checking out lorien legacies series by pittacus lore this science fiction young adult series follows story of nine alien teenagers who escape their home planet lorien which has been invaded by evil mogadorians each book in series focuses on one of teenagers and their journey to save earth and their own species additionally there are companion books that delve into histories of enslaved worlds and alien species featured in main series\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:37 - [react.py:285:call] - input_query: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:39 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Checking the locations of Laleli Mosque and Esma Sultan Mansion.', action=\"llm_tool(input='Are Laleli Mosque and Esma Sultan Mansion in the same neighborhood?')\"), function=Function(thought=None, name='llm_tool', args=[], 
kwargs={'input': 'Are Laleli Mosque and Esma Sultan Mansion in the same neighborhood?'}), observation='No, Laleli Mosque and Esma Sultan Mansion are not in the same neighborhood. Laleli Mosque is located in the Laleli district of Istanbul, while Esma Sultan Mansion is located in the Ortaköy district. They are in different neighborhoods within Istanbul.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:40 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='Confirming the locations of Laleli Mosque and Esma Sultan Mansion.', action=\"finish(answer='No, Laleli Mosque is in the Laleli district of Istanbul, and Esma Sultan Mansion is in the Ortaköy district. They are in different neighborhoods within Istanbul.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'No, Laleli Mosque is in the Laleli district of Istanbul, and Esma Sultan Mansion is in the Ortaköy district. They are in different neighborhoods within Istanbul.'}), observation='No, Laleli Mosque is in the Laleli district of Istanbul, and Esma Sultan Mansion is in the Ortaköy district. They are in different neighborhoods within Istanbul.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:40 - [react.py:299:call] - answer:\n", + " No, Laleli Mosque is in the Laleli district of Istanbul, and Esma Sultan Mansion is in the Ortaköy district. They are in different neighborhoods within Istanbul.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:40 - [3974127232.py:27:] - No. 
4, question: Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?, ground truth: no, pred answer: no laleli mosque is in laleli district of istanbul and esma sultan mansion is in ortaköy district they are in different neighborhoods within istanbul\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:40 - [react.py:285:call] - input_query: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:42 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought=\"The user is asking about the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action=\"llm_tool(input='director of the romantic comedy Big Stone Gap and New York city it is based in')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'director of the romantic comedy Big Stone Gap and New York city it is based in'}), observation='\"Big Stone Gap\" is a romantic comedy film directed by Adriana Trigiani. The story is set in a small town in Virginia, not New York City. The film follows the life of a woman named Ave Maria Mulligan, played by Ashley Judd, as she navigates love, family, and community in the charming town of Big Stone Gap.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:43 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"The user is asking about the director of the romantic comedy movie 'Big Stone Gap' and the New York city it is based in.\", action=\"finish(answer='The director of the romantic comedy movie Big Stone Gap is Adriana Trigiani. The movie is set in a small town in Virginia, not New York City.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The director of the romantic comedy movie Big Stone Gap is Adriana Trigiani. 
The movie is set in a small town in Virginia, not New York City.'}), observation='The director of the romantic comedy movie Big Stone Gap is Adriana Trigiani. The movie is set in a small town in Virginia, not New York City.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:43 - [react.py:299:call] - answer:\n", + " The director of the romantic comedy movie Big Stone Gap is Adriana Trigiani. The movie is set in a small town in Virginia, not New York City.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:43 - [3974127232.py:27:] - No. 5, question: The director of the romantic comedy \"Big Stone Gap\" is based in what New York city?, ground truth: greenwich village new york city, pred answer: director of romantic comedy movie big stone gap is adriana trigiani movie is set in small town in virginia not new york city\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:43 - [react.py:285:call] - input_query: 2014 S/S is the debut album of a South Korean boy group that was formed by who?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:44 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about the debut album of a South Korean boy group and who formed it.', action=\"llm_tool(input='2014 S/S is the debut album of a South Korean boy group that was formed by who?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': '2014 S/S is the debut album of a South Korean boy group that was formed by who?'}), observation='2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:45 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought=\"Providing the information about the debut album and the group's formation.\", action=\"finish(answer='2014 S/S is 
the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': '2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.'}), observation='2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:45 - [react.py:299:call] - answer:\n", + " 2014 S/S is the debut album of the South Korean boy group WINNER, which was formed by YG Entertainment through the reality survival program WIN: Who is Next.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:45 - [3974127232.py:27:] - No. 6, question: 2014 S/S is the debut album of a South Korean boy group that was formed by who?, ground truth: yg entertainment, pred answer: 2014 ss is debut album of south korean boy group winner which was formed by yg entertainment through reality survival program win who is next\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:45 - [react.py:285:call] - input_query: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:48 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about a person known by the stage name Aladin who worked as a consultant to help organizations improve their performance.', action=\"llm_tool(input='Aladin stage name consultant organizations performance improve')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Aladin stage name consultant organizations performance improve'}), observation='It sounds like you are looking for advice on how to improve the 
performance of organizations that provide stage name consulting services, like Aladin. Here are some suggestions to help enhance their performance:\\n\\n1. Develop a strong brand identity: Help organizations establish a unique and memorable brand identity that sets them apart from competitors.\\n\\n2. Enhance customer experience: Focus on providing exceptional customer service and personalized consultations to ensure client satisfaction.\\n\\n3. Expand service offerings: Consider offering additional services such as social media management, image consulting, or talent development to attract a wider range of clients.\\n\\n4. Invest in marketing and promotion: Utilize digital marketing strategies, social media platforms, and partnerships with influencers to increase visibility and attract new clients.\\n\\n5. Professional development: Provide ongoing training and development opportunities for consultants to stay updated on industry trends and best practices.\\n\\nBy implementing these strategies, organizations offering stage name consulting services can improve their performance and attract more clients.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:49 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='The user is looking for information about a person known by the stage name Aladin who worked as a consultant to help organizations improve their performance.', action='finish(answer=\"Aladin is known by his stage name and helped organizations improve their performance as a consultant.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'Aladin is known by his stage name and helped organizations improve their performance as a consultant.'}), observation='Aladin is known by his stage name and helped organizations improve their performance as a consultant.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:49 - [react.py:299:call] - 
answer:\n", + " Aladin is known by his stage name and helped organizations improve their performance as a consultant.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:49 - [3974127232.py:27:] - No. 7, question: Who was known by his stage name Aladin and helped organizations improve their performance as a consultant?, ground truth: eenasul fateh, pred answer: aladin is known by his stage name and helped organizations improve their performance as consultant\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:49 - [react.py:285:call] - input_query: The arena where the Lewiston Maineiacs played their home games can seat how many people?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:50 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='The user is asking about the seating capacity of the arena where the Lewiston Maineiacs played their home games.', action=\"llm_tool(input='The arena where the Lewiston Maineiacs played their home games can seat how many people?')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'The arena where the Lewiston Maineiacs played their home games can seat how many people?'}), observation='The arena where the Lewiston Maineiacs played their home games was the Androscoggin Bank Colisée, which had a seating capacity of approximately 3,677 people for hockey games.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:51 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='I have finished the task.', action=\"finish(answer='The Androscoggin Bank Colisée, where the Lewiston Maineiacs played their home games, had a seating capacity of approximately 3,677 people for hockey games.')\"), function=Function(thought=None, name='finish', args=[], kwargs={'answer': 'The Androscoggin Bank Colisée, where the Lewiston Maineiacs played their home games, had a seating capacity of approximately 3,677 people 
for hockey games.'}), observation='The Androscoggin Bank Colisée, where the Lewiston Maineiacs played their home games, had a seating capacity of approximately 3,677 people for hockey games.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:51 - [react.py:299:call] - answer:\n", + " The Androscoggin Bank Colisée, where the Lewiston Maineiacs played their home games, had a seating capacity of approximately 3,677 people for hockey games.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:51 - [3974127232.py:27:] - No. 8, question: The arena where the Lewiston Maineiacs played their home games can seat how many people?, ground truth: 3677 seated, pred answer: androscoggin bank colisée where lewiston maineiacs played their home games had seating capacity of approximately 3677 people for hockey games\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:51 - [react.py:285:call] - input_query: Who is older, Annie Morton or Terry Richardson?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:52 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Identifying the ages of Annie Morton and Terry Richardson.', action=\"llm_tool(input='Annie Morton age') (str)\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Annie Morton age'}), observation=\"I'm sorry, but I cannot provide personal information such as age without more context or details. How can I assist you further?\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:54 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='Comparing the ages of Annie Morton and Terry Richardson to determine who is older.', action=\"llm_tool(input='Terry Richardson age') (str)\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Terry Richardson age'}), observation=\"I'm sorry, but I cannot provide real-time information on individuals. 
Terry Richardson is a well-known American fashion photographer, but I do not have access to his current age.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:54 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought=\"Unfortunately, I couldn't retrieve the ages of Annie Morton and Terry Richardson.\", action='finish(answer=\"I don\\'t have access to the specific ages of Annie Morton and Terry Richardson.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"I don't have access to the specific ages of Annie Morton and Terry Richardson.\"}), observation=\"I don't have access to the specific ages of Annie Morton and Terry Richardson.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:54 - [react.py:299:call] - answer:\n", + " I don't have access to the specific ages of Annie Morton and Terry Richardson.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:54 - [3974127232.py:27:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: terry richardson, pred answer: i dont have access to specific ages of annie morton and terry richardson\u001b[0m\n", + "\u001b[31m2024-12-19 14:01:54 - [react.py:285:call] - input_query: Are Local H and For Against both from the United States?\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:56 - [react.py:265:_run_one_step] - Step 1: \n", + "StepOutput(step=1, action=FunctionExpression(question=None, thought='Checking the origin of the bands Local H and For Against.', action='llm_tool(input=\"Local H origin\")'), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'Local H origin'}), observation='Local H is an American rock band hailing from Zion, Illinois. The band was formed in 1987 by Scott Lucas (vocals, guitar, bass) and Joe Daniels (drums). 
They are known for their energetic live performances and their blend of alternative rock, grunge, and punk influences.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:58 - [react.py:265:_run_one_step] - Step 2: \n", + "StepOutput(step=2, action=FunctionExpression(question=None, thought='Checking the origin of the band For Against.', action=\"llm_tool(input='For Against origin')\"), function=Function(thought=None, name='llm_tool', args=[], kwargs={'input': 'For Against origin'}), observation='Sure, here are some arguments for and against the concept of origin:\\n\\nFor:\\n1. Understanding the origin of something can provide valuable insights into its nature and purpose.\\n2. Knowing the origin of a product or idea can help in determining its authenticity and quality.\\n3. Studying origins can help in tracing the development and evolution of various phenomena.\\n\\nAgainst:\\n1. Focusing too much on origin can lead to oversimplification and overlook the complexity of a subject.\\n2. The origin of something may not always be clear or easily traceable, leading to speculation and uncertainty.\\n3. 
Placing too much emphasis on origin can limit our understanding and appreciation of the present and future aspects of a subject.')\n", + "_______\n", + "\u001b[0m\n", + "\u001b[34m2024-12-19 14:01:59 - [react.py:265:_run_one_step] - Step 3: \n", + "StepOutput(step=3, action=FunctionExpression(question=None, thought=\"Combining the origins of Local H and For Against to answer the user's query.\", action='finish(answer=\"Yes, Local H is from the United States (Zion, Illinois) and For Against\\'s origin is not specified in the response.\")'), function=Function(thought=None, name='finish', args=[], kwargs={'answer': \"Yes, Local H is from the United States (Zion, Illinois) and For Against's origin is not specified in the response.\"}), observation=\"Yes, Local H is from the United States (Zion, Illinois) and For Against's origin is not specified in the response.\")\n", + "_______\n", + "\u001b[0m\n", + "\u001b[32m2024-12-19 14:01:59 - [react.py:299:call] - answer:\n", + " Yes, Local H is from the United States (Zion, Illinois) and For Against's origin is not specified in the response.\u001b[0m\n", + "\u001b[33m2024-12-19 14:01:59 - [3974127232.py:27:] - No. 
10, question: Are Local H and For Against both from the United States?, ground truth: yes, pred answer: yes local h is from united states zion illinois and for againsts origin is not specified in response\u001b[0m\n", + "EM = EvaluationResult(avg_score=0.0, per_item_scores=[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], additional_info=None), FM = EvaluationResult(avg_score=0.5, per_item_scores=[1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0], additional_info=None), average time = 3.1863945960998534\n" + ] + } + ], + "source": [ + "from adalflow.eval.answer_match_acc import AnswerMatchAcc\n", + "\n", + "# set up evaluation type\n", + "EM_evaluator = AnswerMatchAcc(type=\"exact_match\")\n", + "FM_evaluator = AnswerMatchAcc(type=\"fuzzy_match\")\n", + "\n", + "agent = ReActAgent(\n", + " max_steps=7, model_client=OpenAIClient(), model_kwargs=gpt_model_kwargs\n", + ")\n", + "\n", + "num_questions = 10\n", + "gt_answers = []\n", + "pred_answers = []\n", + "start_time = time.time()\n", + "for i in range(num_questions):\n", + " question = val_dataset[i][\"question\"]\n", + " gt_answer = normalize_answer(\n", + " val_dataset[i][\"answer\"]\n", + " ) # normalize the ground truth answer\n", + " gt_answers.append(gt_answer)\n", + "\n", + " # get the agent's response\n", + " pred_answer = agent(question)\n", + " pred_answer = normalize_answer(pred_answer)\n", + " pred_answers.append(pred_answer)\n", + "\n", + " printc(\n", + " f\"No. 
{i+1}, question: {question}, ground truth: {gt_answer}, pred answer: {pred_answer}\",\n", + " color=\"yellow\",\n", + " )\n", + "\n", + "end_time = time.time()\n", + "\n", + "em = EM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", + "fm = FM_evaluator.compute(pred_answers=pred_answers, gt_answers=gt_answers)\n", + "avg_time = (end_time - start_time) / num_questions\n", + "\n", + "print(f\"EM = {em}, FM = {fm}, average time = {avg_time}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Without the tools and examples, EM=0 and FM=0.4. We saw hallucinations and nonsense:\n", + "\n", + "2024-06-15 23:17:04 - [3230041225.py:26:] - No. 1, question: Were Scott Derrickson and Ed Wood of the same nationality?, ground truth: ``yes``, pred answer: ``no scott derrickson and ed wood were not of same nationality scott derrickson is american while ed wood was also american``\n", + "\n", + "2024-06-15 23:18:16 - [3230041225.py:26:] - No. 9, question: Who is older, Annie Morton or Terry Richardson?, ground truth: ``terry richardson``, pred answer: ``who is older annie morton or terry richardson``\n", + "\n", + "Therefore, the ReAct agent outperforms the base LLM.\n", + "Meanwhile, the ``LightRAG ReAct agent`` reaches EM=0.3 on 10 questions." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 7. Future Improvement" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO:\n", + "# 1. advanced, add history to react\n", + "# 2. add training, few shot\n", + "# 3. llm as judge\n", + "# 4. add picture\n", + "# 5. 
better json handling, we need to store the answer output" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "my-project-kernel", + "language": "python", + "name": "my-project-kernel" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/use_cases/classification/train.py b/use_cases/classification/train.py index 0bdbd562..68a71fdb 100644 --- a/use_cases/classification/train.py +++ b/use_cases/classification/train.py @@ -11,6 +11,7 @@ gpt_3_model, gpt_4o_model, ) +from adalflow.core.generator import BackwardPassSetup class TrecClassifierAdal(adal.AdalComponent): @@ -26,7 +27,7 @@ def __init__( eval_fn = AnswerMatchAcc(type="exact_match").compute_single_item loss_fn = adal.EvalFnToTextLoss( eval_fn=eval_fn, - eval_fn_desc="exact_match: 1 if str(y) == str(y_gt) else 0", + eval_fn_desc="exact_match: 1 if str(y) == str(y_gt) else 0. 
When the LLM prediction failed with format parsing which results with errors, we set y_pred = -1", ) super().__init__( task=task, @@ -51,8 +52,8 @@ def prepare_eval( def prepare_loss( self, sample: TRECExtendedData, y_pred: adal.Parameter, *args, **kwargs ) -> Tuple[Callable[..., Any], Dict]: - full_response = y_pred.full_response - y_label = -1 + full_response = y_pred.data + y_label = -1 # default value for failed prediction if ( full_response and full_response.data is not None @@ -67,7 +68,11 @@ def prepare_loss( eval_input=sample.class_name, requires_opt=False, ) - return self.loss_fn, {"kwargs": {"y": y_pred, "y_gt": y_gt}} + return self.loss_fn, { + "kwargs": {"y": y_pred, "y_gt": y_gt}, + "id": sample.id, + "gt": y_gt.eval_input, + } def train( @@ -81,6 +86,9 @@ def train( strategy="constrained", optimization_order="sequential", debug=False, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, ): # TODO: ensure the teacher prompt gets updated with the new model adal_component = TrecClassifierAdal( @@ -90,6 +98,12 @@ def train( backward_engine_model_config=gpt_4o_model, teacher_model_config=gpt_4o_model, ) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) print(adal_component) trainer = adal.Trainer( train_batch_size=train_batch_size, @@ -103,50 +117,67 @@ def train( weighted_sampling=True, optimization_order=optimization_order, exclude_input_fields_from_bootstrap_demos=False, + max_proposals_per_step=max_proposals_per_step, ) + trainer.set_random_seed(seed) print(trainer) train_dataset, val_dataset, test_dataset = load_datasets() - trainer.fit( + ckpt, _ = trainer.fit( train_dataset=train_dataset, - val_dataset=test_dataset, - # val_dataset=val_dataset, - # test_dataset=test_dataset, + val_dataset=val_dataset, + test_dataset=test_dataset, debug=debug, - 
resume_from_ckpt="/Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_5d1bf_run_1.json", + backward_pass_setup=backward_pass_setup, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/TrecClassifierAdal/constrained_max_steps_12_5d1bf_run_1.json", ) + return ckpt if __name__ == "__main__": # TODO: # Evaluating step(6): 0.7333 across 30 samples, Max potential: 0.7778: 83%|▊| 30/36 [00:08<00:01, # Optimizer revert: 0.7096774193548387 <= 0.7777777777777778 - train( + import json + + import random + + random.seed(2025) + # np.random.seed(2025) # Set NumPy random seed + + # make the strategy configurable in the script + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_true") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + + ckpt = train( **gpt_3_model, - debug=False, + debug=True, max_steps=12, - strategy="constrained", + strategy=set_strategy, optimization_order="sequential", - ) - # val 0.694 -> 0.833, #test 0.8472 -> 0.833, adding more shots does not help - # NOTE: raw: 40, bootstrap: 4, max_steps: 8, strategy: random, val: 86.1, test: 86.8 (+4.2% compared with dspy) - # NOTE: train task without output format: val: 0.67->0.805, test: 0.805-> 0.896 # best performing model (zero-shot) - # NOTE: train with without output format, use new class_name: constrained_max_steps_12_bac8d_run_1.json - # val: 0.77.8, test: 0.86.8 #constrained_max_steps_12_138d9_run_1.json - - # REsume from the above, continue another 12 steps: val: 77.78% tets: 86.81% - # result from the above, use bootstrap 1 shot: test -> 88.19% 
#constrained_max_steps_12_2ffa7_run_4.json (with input) - # result from the above, use bootstrap 1 shot: no improvement, 86.81% #constrained_max_steps_12_2ffa7_run_5.json (with only rational and answers) - # result from above, use bootstrap 2 shots: use input:no improvement - # bootstrap is not helpful - # 40 shots, 1 bootstrap, continue from last best, 86.1 val, 90.28% tes - # 40 shots, resume, no improvment - # continue from last best, 3 bootstrap, 83.3 val, 86.1 test (only rational) - # continue from last best, 3 bootstrap, (both input and rational)86.1 val, 82.64 test (not really better) - # NOTE: - # continue from last best, 1 bootstrap, (both input and rational)86.1 val, 86.1 test (not really better) - # TrecClassifierAdal/constrained_max_steps_12_2ffa7_run_2.json - - -# theory: all few-shots demo or instruction, all so that the llm can reason better. Once it reches to its limits, no more shots can help or further instruction can. -# there might be a saturation point!!! + seed=2025, + tg=use_tg, + max_proposals_per_step=max_proposals_per_step, + ) # val 0.694 -> 0.833, #test 0.8472 -> 0.833, adding more shots does not help + + if set_output_path: + with open(set_output_path, "w") as f: + json.dump({"ckpt": ckpt}, f) + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") diff --git a/use_cases/classification/train_string_output.py b/use_cases/classification/train_string_output.py index 9ecdef27..45fe5bcf 100644 --- a/use_cases/classification/train_string_output.py +++ b/use_cases/classification/train_string_output.py @@ -7,7 +7,7 @@ from use_cases.classification.data import load_datasets, TRECExtendedData from adalflow.eval.answer_match_acc import AnswerMatchAcc -from LightRAG.use_cases.config import ( +from use_cases.config import ( gpt_3_model, gpt_4o_model, ) diff --git a/use_cases/classification/trec_task_structured_output.py b/use_cases/classification/trec_task_structured_output.py index 
eb5333cd..56014cc6 100644 --- a/use_cases/classification/trec_task_structured_output.py +++ b/use_cases/classification/trec_task_structured_output.py @@ -60,7 +60,7 @@ def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict): # data="You are a classifier. Given a question, classify it into one of the following classes based on what the question is seeking:\n\nFormat: class_index. class_name, class_description\n\n0. ABBR, Abbreviation\n1. ENTY, Entity\n2. DESC, Description and abstract concept\n3. HUM, Human being\n4. LOC, Location\n5. NUM, Numeric value\n\nPay special attention to questions about entities versus descriptions, as well as those asking for specific terms or people. Do not try to answer the question:", # best # data="You are a classifier. For each question given, classify it into one of the following classes:\n\nFormat: class_index. class_name, class_description\n\n0. ABBR, Abbreviation (includes initials)\n1. ENTY, Entity (includes products, languages, objects, etc.)\n2. DESC, Description and abstract concept (includes explanations)\n3. HUM, Human being (includes individuals, groups, etc.)\n4. LOC, Location (includes addresses, places, etc.)\n5. 
NUM, Numeric value (includes distances, dates, ages, etc.)\n\n- Focus on identifying the primary subject of the question and classifying based on what is being explicitly asked for.", role_desc="Task description", - requires_opt=False, + requires_opt=True, param_type=adal.ParameterType.PROMPT, ), "output_format_str": adal.Parameter( @@ -70,12 +70,12 @@ def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict): param_type=adal.ParameterType.PROMPT, ), # NOTE: 88.19% - "few_shot_demos": adal.Parameter( - data=None, - requires_opt=True, - role_desc="Few shot examples to help the model", - param_type=adal.ParameterType.DEMOS, - ), + # "few_shot_demos": adal.Parameter( + # data=None, + # requires_opt=True, + # role_desc="Few shot examples to help the model", + # param_type=adal.ParameterType.DEMOS, + # ), } self.llm = adal.Generator( @@ -96,7 +96,7 @@ def _prepare_input(self, question: str): prompt_kwargs = { "input_str": adal.Parameter( data=input_str, - requires_opt=True, + requires_opt=False, role_desc="input to the LLM", param_type=adal.ParameterType.INPUT, ) @@ -108,6 +108,8 @@ def call( ) -> Union[adal.GeneratorOutput, adal.Parameter]: prompt_kwargs = self._prepare_input(question) output = self.llm(prompt_kwargs=prompt_kwargs, id=id) + if isinstance(output, adal.Parameter): + output.data_in_prompt = lambda x: x.data.raw_response return output diff --git a/use_cases/config.py b/use_cases/config.py index 895ed097..440965a4 100644 --- a/use_cases/config.py +++ b/use_cases/config.py @@ -15,7 +15,20 @@ gpt_3_model = { "model_client": OpenAIClient(input_type="text"), "model_kwargs": { - "model": "gpt-3.5-turbo", + "model": "gpt-3.5-turbo-0125", + "max_tokens": 2000, + "temperature": 0.0, + "top_p": 0.99, + "frequency_penalty": 0, + "presence_penalty": 0, + "stop": None, + }, +} + +gpt_3_1106_model = { + "model_client": OpenAIClient(input_type="text"), + "model_kwargs": { + "model": "gpt-3.5-turbo-1106", "max_tokens": 2000, "temperature": 0.0, "top_p": 0.99, 
@@ -38,10 +51,32 @@ }, } +gpt_4_model = { + "model_client": OpenAIClient(), + "model_kwargs": { + "model": "gpt-4-turbo", + "temperature": 1, + "top_p": 0.99, + "max_tokens": 1000, + # "frequency_penalty": 1, # high for not repeating prompt + }, +} + gpt_4o_model = { "model_client": OpenAIClient(), "model_kwargs": { - "model": "gpt-4o-mini", + "model": "gpt-4o", # gpt-4o-realtime-preview-2024-12-17 + "temperature": 1, + "top_p": 0.99, + # "max_tokens": 1000, + # "frequency_penalty": 0.8, # high for not repeating prompt + }, +} + +gpt_4o1_model = { + "model_client": OpenAIClient(), + "model_kwargs": { + "model": "o1-preview", "temperature": 1, "top_p": 0.99, "max_tokens": 1000, diff --git a/use_cases/question_answering/bbh/data.py b/use_cases/question_answering/bbh/data.py index d1fc3709..910b7e00 100644 --- a/use_cases/question_answering/bbh/data.py +++ b/use_cases/question_answering/bbh/data.py @@ -3,11 +3,11 @@ import re from dataclasses import dataclass, field -import adalflow as adal from adalflow.core import DataClass from adalflow.datasets.big_bench_hard import BigBenchHard from adalflow.utils.data import subset_dataset +from adalflow.core import func_to_parser @dataclass @@ -64,7 +64,7 @@ class QuestionAnswer(DataClass): ) # score can be used as weight for demo, weight = score (the higher the more likely to be sampled) -@adal.fun_to_component +@func_to_parser def parse_integer_answer(answer: str): """A function that parses the last integer from a string using regular expressions.""" try: @@ -81,7 +81,7 @@ def parse_integer_answer(answer: str): return answer -@adal.fun_to_component +@func_to_parser def extract_answer(answer: str) -> str: try: pattern = re.compile(r"Answer:\s*(.*)", re.DOTALL) diff --git a/use_cases/question_answering/bbh/object_count/task.py b/use_cases/question_answering/bbh/object_count/task.py index 6f5571f8..4892fe0f 100644 --- a/use_cases/question_answering/bbh/object_count/task.py +++ 
b/use_cases/question_answering/bbh/object_count/task.py @@ -37,12 +37,12 @@ def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict): param_type=ParameterType.PROMPT, instruction_to_optimizer="You can try to show examples to see if it helps.", ) - few_shot_demos = adal.Parameter( - data=None, - role_desc="To provide few shot demos to the language model", - requires_opt=False, - param_type=ParameterType.DEMOS, - ) + # few_shot_demos = adal.Parameter( + # data=None, + # role_desc="To provide few shot demos to the language model", + # requires_opt=True, + # param_type=ParameterType.DEMOS, + # ) self.llm_counter = adal.Generator( model_client=model_client, @@ -50,19 +50,19 @@ def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict): template=few_shot_template, prompt_kwargs={ "system_prompt": system_prompt, - "few_shot_demos": few_shot_demos, + # "few_shot_demos": few_shot_demos, }, output_processors=parse_integer_answer, use_cache=True, ) - def call( + def bicall( self, question: str, id: str = None ) -> Union[adal.GeneratorOutput, adal.Parameter]: output = self.llm_counter(prompt_kwargs={"input_str": question}, id=id) - print(f"output: {output}, training: {self.training}") + # print(f"output: {output}, training: {self.training}") if self.training: - if output.full_response.error and "429" in output.full_response.error: + if output.data.error and "429" in output.data.error: raise ValueError("Rate limit exceeded") else: if output.error and "429" in output.error: @@ -85,8 +85,9 @@ def test_object_count_task(): task_pipeline.train() answer: adal.Parameter = task_pipeline(question, id="1") print(answer) - print(f"full_response: {answer.full_response}") + print(f"data: {answer.data}") answer.draw_graph() + print(f"prompt_data: {answer.get_prompt_data()}") if __name__ == "__main__": diff --git a/use_cases/question_answering/bbh/object_count/train_new.py b/use_cases/question_answering/bbh/object_count/train_new.py index 48309aa7..bb21ec3b 100644 
--- a/use_cases/question_answering/bbh/object_count/train_new.py +++ b/use_cases/question_answering/bbh/object_count/train_new.py @@ -58,8 +58,8 @@ def prepare_loss( eval_input=sample.answer, requires_opt=False, ) - pred.eval_input = pred.full_response.data - return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}} + pred.eval_input = pred.data.data + return self.loss_fn, {"kwargs": {"y": pred, "y_gt": y_gt}, "id": sample.id} # TODO: make the train diagnose on the student model and the teacher model automatcally @@ -95,6 +95,9 @@ def train_diagnose_teacher( # You will answer a reasoning question. Think step by step and double-check each calculation you make. Pay close attention to any numerical quantities in the text, converting written numbers into their numerical equivalents. Additionally, re-verify your final answer before concluding. The last line of your response should be of the following format: 'Answer: $VALUE' where VALUE is a numerical value. # 0.98 val, 0.91 test +from adalflow.core.generator import BackwardPassSetup + + def train( train_batch_size=4, # larger batch size is not that effective, probably because of llm's lost in the middle raw_shots: int = 0, @@ -106,6 +109,9 @@ def train( debug=False, resume_from_ckpt=None, exclude_input_fields_from_bootstrap_demos=False, + seed=None, + tg: bool = False, + max_proposals_per_step: int = 5, ): adal_component = ObjectCountAdalComponent( **gpt_3_model, @@ -114,6 +120,13 @@ def train( backward_engine_model_config=gpt_4o_model, ) print(adal_component) + backward_pass_setup = None + if tg: + backward_pass_setup = BackwardPassSetup( + all_pred_at_once=False, + compute_grad_for_errors_only=False, + ) + trainer = adal.Trainer( train_batch_size=train_batch_size, adaltask=adal_component, @@ -123,43 +136,71 @@ def train( raw_shots=raw_shots, bootstrap_shots=bootstrap_shots, debug=debug, - weighted_sampling=True, + weighted_sampling=False, optimization_order=optimization_order, 
exclude_input_fields_from_bootstrap_demos=exclude_input_fields_from_bootstrap_demos, + max_proposals_per_step=max_proposals_per_step, ) + trainer.set_random_seed(seed) print(trainer) train_dataset, val_dataset, test_dataset = load_datasets() + # train_dataset = train_dataset[:4] + # val_dataset = val_dataset[:4] + # test_dataset = test_dataset[:4] + ckpt, _ = trainer.fit( train_dataset=train_dataset, val_dataset=val_dataset, test_dataset=test_dataset, resume_from_ckpt=resume_from_ckpt, + backward_pass_setup=backward_pass_setup, ) return ckpt if __name__ == "__main__": - import sys import json + import random + + random.seed(2025) + # np.random.seed(2025) # Set NumPy random seed + + # make the strategy configurable in the script + import argparse + + parser = argparse.ArgumentParser() + + parser.add_argument("--strategy", type=str, default="constrained") + parser.add_argument("--use_tg", action="store_true") + parser.add_argument("--max_proposals_per_step", type=int, default=5) + parser.add_argument( + "output_path", nargs="?", help="File path to save the checkpoint" + ) + + args = parser.parse_args() + + set_strategy = args.strategy + set_output_path = args.output_path + use_tg = args.use_tg + max_proposals_per_step = args.max_proposals_per_step + ckpt = train( - debug=False, + debug=True, max_steps=12, - strategy="constrained", + strategy=set_strategy, exclude_input_fields_from_bootstrap_demos=True, + seed=2025, # pass the numpy seed + tg=use_tg, + max_proposals_per_step=max_proposals_per_step, + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_dc778_run_1.json", + # resume_from_ckpt="/Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_18e8d_run_1.json", ) print(f"ckpt: {ckpt}") - # Save ckpt to a file passed as an argument - if len(sys.argv) > 1: # Check if a file path is provided - with open(sys.argv[1], "w") as f: + if set_output_path: + with open(set_output_path, "w") as f: 
json.dump({"ckpt": ckpt}, f) - - # train_diagnose(**gpt_3_model) - # train_diagnose_teacher(**gpt_4o_model) # 4omini works well as an optimizer too - # /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_49c63_run_1.json - # 0.72 -> 0.9 val - # 0.79 -> 0.92 test - # 0.86->0.94 val, 0.79 -> 0.93 with only negative gradients /Users/liyin/.adalflow/ckpt/ObjectCountAdalComponent/constrained_max_steps_12_7a649_run_1.json - - # without gradients -> 0.9 on tests + print(f"Checkpoint saved to {set_output_path}") + else: + print("No file path provided for saving the checkpoint.") diff --git a/use_cases/question_answering/bbh/word_sorting/train.py b/use_cases/question_answering/bbh/word_sorting/train.py index 12518206..c2a20da1 100644 --- a/use_cases/question_answering/bbh/word_sorting/train.py +++ b/use_cases/question_answering/bbh/word_sorting/train.py @@ -76,7 +76,7 @@ def prepare_loss(self, sample: Example, pred: adal.Parameter): eval_input=sample.answer, requires_opt=False, ) - pred.eval_input = pred.full_response.data # processed + pred.eval_input = pred.data.data # processed question_param = adal.Parameter( name="question", data=sample.question, @@ -89,7 +89,8 @@ def prepare_loss(self, sample: Example, pred: adal.Parameter): "pred_answer": pred, "gt_answer": y_gt, "question": question_param, - } + }, + "id": sample.id, } diff --git a/use_cases/text_grad_2.0_train.py b/use_cases/text_grad_2.0_train.py index 37ff320d..90b9051e 100644 --- a/use_cases/text_grad_2.0_train.py +++ b/use_cases/text_grad_2.0_train.py @@ -1,21 +1,58 @@ import subprocess import tempfile import json +import numpy as np +import argparse +num_runs = 4 # List of experiments to run object_count = "use_cases/question_answering/bbh/object_count/train_new.py" +trec_6_classification = "use_cases/classification/train.py" hotpot_qa_multi_hop_rag = "benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag.py" +hotpot_qa_multi_hop_rag_cycle = ( + 
"benchmarks/hotpot_qa/adal_exp/train_multi_hop_rag_cycle.py" +) +hotpot_qa_vanilla = "benchmarks/hotpot_qa/adal_exp/train_vanilla.py" +hotpot_qa_vanilla_rag = "benchmarks/hotpot_qa/adal_exp/train_vanilla_rag.py" + ckpt_values = [] experiments = [ - object_count, + # object_count, + # trec_6_classification, + # hotpot_qa_vanilla_rag, # hotpot_qa_multi_hop_rag, + hotpot_qa_multi_hop_rag_cycle, ] +# set up the strategy for each experiment + +argparser = argparse.ArgumentParser() +argparser.add_argument("--strategy", type=str, default="constrained") +argparser.add_argument("--use_tg", action="store_true") +argparser.add_argument("--max_proposals_per_step", type=int, default=5) + +args = argparser.parse_args() + +strategy = args.strategy +use_tg = args.use_tg +max_proposals_per_step = args.max_proposals_per_step + # Optional: Arguments for each experiment (if needed) + +setup_str = f"--strategy {strategy}" + +if use_tg: + setup_str += " --use_tg" + +setup_str += f" --max_proposals_per_step {max_proposals_per_step}" + + experiment_args = { - object_count: "", - # hotpot_qa_multi_hop_rag: "", + object_count: setup_str, + trec_6_classification: setup_str, + hotpot_qa_vanilla_rag: setup_str, + hotpot_qa_multi_hop_rag: setup_str, } ckpt_values = {} @@ -47,12 +84,133 @@ def run_experiment(script, args): if __name__ == "__main__": + + result_file = "text_grad_2_results" + # add important run information in the naming of the file + import uuid + + result_file = f"{result_file}_{num_runs}_runs_{uuid.uuid4()}.json" + for experiment in experiments: args = experiment_args.get(experiment, "") - ckpt = run_experiment(experiment, args) - if ckpt: - ckpt_values[experiment] = ckpt + for i in range(num_runs): + print(f"\nRun {i + 1}/{num_runs}") + ckpt = run_experiment(experiment, args) + ckpt_index = f"{experiment}_{i + 1}" + if ckpt: + ckpt_values[ckpt_index] = ckpt + # load all json files using the ckpt paths + highest_test_score, last_test_score, mean_test_score, standard_deviation = 
( + 0, + 0, + 0, + 0, + ) + last_test_scores = [] + highest_val_scores = [] + total_passes = ( + [] + ) # each is the number of unique val scores in the highest val scores + total_prompts = [] # how many prompts tried in total + + past_highest_val_scores = [] + # # average pass rate, average pass prompts + # average_pass_rate_list = [] + # average_pass_prompts_list = [] + # average_total_prompts = [] + # highest_test_score_json_file = None + total_steps = [] + training_times = [] + subset_pass_rate = [] + valset_pass_rate = [] + for experiment_index, ckpt in ckpt_values.items(): + with open(ckpt, "r") as f: + data = json.load(f) + print(f"Experiment: {experiment_index}") + print(f"Data: {data}") + _high_val_score = max(data["val_scores"]) + _unique_val_scores = len(set(data["val_scores"])) - 1 + _last_test_score = data["test_score"] + # read the effective measures + effective_measures = data.get("effective_measure", {}) + + _total_prompts = effective_measures.get("subset", {}).get( + "pass", 0 + ) + effective_measures.get("subset", {}).get("fail", 0) + if _total_prompts == 0: + _total_prompts = effective_measures.get("valset", {}).get( + "pass", 0 + ) + effective_measures.get("valset", {}).get("fail", 0) + _total_steps = len(data["steps"]) - 1 + _training_time = data.get("total_time", 0) + _subset_pass = effective_measures.get("subset", {}).get("pass", 0) + _subset_fail = effective_measures.get("subset", {}).get("fail", 0) + _valset_pass = effective_measures.get("valset", {}).get("pass", 0) + _valset_fail = effective_measures.get("valset", {}).get("fail", 0) + subset_pass_rate.append(_subset_pass / (_subset_pass + _subset_fail)) + valset_pass_rate.append(_valset_pass / (_valset_pass + _valset_fail)) + # save the results in the lists + past_highest_val_scores.append(_high_val_score) + total_passes.append(_unique_val_scores) + total_prompts.append(_total_prompts) + last_test_scores.append(_last_test_score) + total_steps.append(_total_steps) + 
training_times.append(_training_time) + + # ensure all steps are the same + assert all( + [step == total_steps[0] for step in total_steps] + ), "All steps should be the same" + + # compute the metrics + mean_test_score = np.mean(last_test_scores) + std_test_score = np.std(last_test_scores) + + # val scores + mean_val_score = np.mean(past_highest_val_scores) + std_val_score = np.std(past_highest_val_scores) + + # pass rate total_passes / steps + average_pass_rate = np.mean(total_passes) / total_steps[0] + + # average total prompts + average_total_prompts = np.mean(total_prompts) + + # average training time + average_training_time = np.mean(training_times) + + # subset pass rate + average_subset_pass_rate = np.mean(subset_pass_rate) + + # valset pass rate + average_valset_pass_rate = np.mean(valset_pass_rate) + + # add these numbers in the ckpt_values + index = f"{experiment}_summary" + ckpt_values[index] = { + "config": { + "num_runs": num_runs, + "args": args, + }, + "metrics": { + "mean_test_score": mean_test_score, + "std_test_score": std_test_score, + "mean_val_score": mean_val_score, + "std_val_score": std_val_score, + "average_pass_rate": average_pass_rate, + "average_total_prompts": average_total_prompts, + "average_training_time": average_training_time, + "average_subset_pass_rate": average_subset_pass_rate, + "average_valset_pass_rate": average_valset_pass_rate, + }, + } print("\nAll Checkpoints:") for experiment, ckpt in ckpt_values.items(): print(f"{experiment}: {ckpt}") + + # Save the results to a file + with open(result_file, "w") as f: + json.dump(ckpt_values, f, indent=4) + + print(f"\nResults saved to {result_file}")