
[WIP] New planner and plan following for Gaia #99

Closed. Wants to merge 65 commits.

Commits (65):
89672cf
new detailed planned, answer formalisation node
ollmer Nov 14, 2024
bc06b4b
explicit links to previous steps
ollmer Nov 14, 2024
170d6c6
support calling subagent from the mononode
ollmer Nov 15, 2024
62d2e4a
make plain text node common
ollmer Nov 15, 2024
238e2ff
controllable plan execution, full outline with node contents
ollmer Nov 15, 2024
83ef3e0
implement gaia new nodes, use free form thought for reflection everyw…
ollmer Nov 15, 2024
6f3e293
better debugging
ollmer Nov 15, 2024
a44ec5c
fix
ollmer Nov 15, 2024
85916dd
consistent node names, camel case
ollmer Nov 15, 2024
37d32f7
limit subagents scope using tape view stack
ollmer Nov 15, 2024
e8dff83
use free form reasoning for some thoughts
ollmer Nov 15, 2024
9921457
better subagent calls
ollmer Nov 15, 2024
f2495f4
proper return of subagent results
ollmer Nov 15, 2024
f092eda
pass any step positions as call args
ollmer Nov 15, 2024
173cb92
fix
ollmer Nov 15, 2024
f201acf
allow browser to show steps with missed prompts
ollmer Nov 18, 2024
1ebe86d
make formalize part of ThinkingNode, pass relevant facts to executor …
ollmer Nov 18, 2024
1c04dda
appendable web cache file
ollmer Nov 18, 2024
e91b5d5
update prompts, fix facts ledger update
ollmer Nov 18, 2024
4931d63
improve facts ledger
ollmer Nov 18, 2024
21834a4
pass subagent args using reference node and respond from subagent usi…
ollmer Nov 18, 2024
32459f3
simplify executor call, fix ledger update
ollmer Nov 18, 2024
440c648
remove print task in debug
ollmer Nov 18, 2024
aa022f1
adjust render
ollmer Nov 19, 2024
94e4304
mononode v2
ollmer Nov 19, 2024
c7b18ae
better debug
ollmer Nov 19, 2024
e9ed2e3
remove n_attempts and subtasks from old gaia
ollmer Nov 19, 2024
bffd013
reasoner subagent, use old gaia agent as executor
ollmer Nov 19, 2024
ca3c753
first eval of v2
ollmer Nov 19, 2024
a3f4e39
fix reasoner call
ollmer Nov 19, 2024
12f4b5a
update prompts, more fact format options
ollmer Nov 20, 2024
0d54e11
concurrent llm cache
ollmer Nov 20, 2024
7235004
allow load tapes with legacy steps in browser and results aggregation
ollmer Nov 20, 2024
575b69c
prompt adjust
ollmer Nov 20, 2024
3474156
fix
ollmer Nov 20, 2024
9092779
gaia node v2
ollmer Nov 20, 2024
7e53c5a
code sandbox
ollmer Nov 20, 2024
20f3fb3
fix v2 agent and test
ollmer Nov 21, 2024
79eadd5
fixes
ollmer Nov 21, 2024
48fb5b3
pass subtask result to the dependent task, looped reasoner with reading
ollmer Nov 21, 2024
5178f84
fix code execution
ollmer Nov 21, 2024
2668be6
reasonser start guidance
ollmer Nov 21, 2024
17dfb8a
coder agent, stop replan after 3 attempts
ollmer Nov 21, 2024
3c4f6ad
fix
ollmer Nov 21, 2024
be109b4
llm info and cost tracking
rizar Nov 21, 2024
a0b151c
actually log llm info
rizar Nov 21, 2024
0c9d187
better replan, guess when failed
ollmer Nov 22, 2024
ff35365
record loop termination on the tape
ollmer Nov 22, 2024
22cc688
show terminations in tape browser
ollmer Nov 22, 2024
0e83540
more loops for new arch
ollmer Nov 22, 2024
48e68d9
fix
ollmer Nov 22, 2024
442ada4
fix guess node
ollmer Nov 22, 2024
d9820ce
fix most tests
rizar Nov 22, 2024
cf24342
Merge branch 'gaia_planner' into llm_costs_and_info
rizar Nov 22, 2024
fc20b08
rm comment
rizar Nov 22, 2024
3791205
don't crash for local models because of LLM cost
rizar Nov 22, 2024
4e89b87
Merge pull request #108 from ServiceNow/llm_costs_and_info
rizar Nov 22, 2024
508ee8d
Merge branch 'main' into gaia_planner
ollmer Dec 2, 2024
dd49555
fix
ollmer Dec 2, 2024
3c5712f
fix test
ollmer Dec 2, 2024
7eb7fa5
update workarena agent, better llm replay errors
ollmer Dec 2, 2024
2eb1342
fix old agent
ollmer Dec 2, 2024
9c42610
update gaia test
ollmer Dec 2, 2024
31b5a41
update intro test
ollmer Dec 2, 2024
4ce872c
Merge branch 'main' into gaia_planner
ollmer Dec 2, 2024
3 changes: 1 addition & 2 deletions conf/gaia_llama.yaml
@@ -8,7 +8,6 @@ split: validation
 n_attempts: 1
 agent:
   planning_mode: simple
-  subtasks: false

env:
safe_calculator: false
@@ -17,4 +16,4 @@ env:

 hydra:
   run:
-    dir: outputs/gaia/runs/${exp_name}
+    dir: ${exp_path}
5 changes: 2 additions & 3 deletions conf/gaia_openai.yaml
@@ -2,14 +2,13 @@ defaults:
   - llm: gpt4o_mini
   - _self_

-exp_name: gpt4o_mini_val_batch32_6
+exp_name: gpt4o_mini_val_l1test1
 exp_path: outputs/gaia/runs/${exp_name}
 split: validation
 n_attempts: 1
 batch: 32
 agent:
   planning_mode: simple
-  subtasks: false

env:
safe_calculator: false
@@ -18,4 +17,4 @@ env:

 hydra:
   run:
-    dir: outputs/gaia/runs/${exp_name}
+    dir: ${exp_path}
27 changes: 9 additions & 18 deletions examples/gaia_agent/agent.py
@@ -13,14 +13,12 @@
 from .steps import (
     ActionExecutionFailure,
     CalculationResultObservation,
-    FinishSubtask,
     GaiaAgentStep,
     GaiaQuestion,
     ListOfFactsThought,
     PageObservation,
     PlanThought,
     PreviousFactsObservation,
-    PythonCodeAction,
     SearchResultsObservation,
     SourcesThought,
     UseCalculatorAction,
@@ -42,14 +40,14 @@ class PlanningMode(str, Enum):
 class GaiaNode(MonoNode):
     system_prompt: str = PromptRegistry.system_prompt
     steps_prompt: str = PromptRegistry.allowed_steps
-    agent_step_cls: Any = Field(exclude=True, default=GaiaAgentStep)
+    output_cls: Any = Field(exclude=True, default=GaiaAgentStep)

-    def get_steps_description(self, tape: GaiaTape, agent: Any) -> str:
+    def get_steps_description(self, tape: GaiaTape) -> str:
         """
         Allow different subset of steps based on the agent's configuration
         """
         plan_thoughts = not tape.has_fact_schemas()
-        allowed_steps = get_allowed_steps(agent.subtasks, plan_thoughts)
+        allowed_steps = get_allowed_steps(plan_thoughts)
         return self.steps_prompt.format(allowed_steps=allowed_steps)

     def prepare_tape(self, tape: GaiaTape, max_chars: int = 200) -> GaiaTape:
@@ -75,7 +73,7 @@ def postprocess_step(self, tape: GaiaTape, new_steps: list[Step], step: Step) ->
         if isinstance(step, ListOfFactsThought):
             # remove empty facts produced by the model
             step.given_facts = [fact for fact in step.given_facts if fact.value is not None and fact.value != ""]
-        elif isinstance(step, (UseCalculatorAction, PythonCodeAction)):
+        elif isinstance(step, (UseCalculatorAction)):
             # if calculator or code action is used, add the facts to the action call
             step.facts = tape.model_copy(update=dict(steps=tape.steps + new_steps)).facts()
         return step
@@ -84,9 +82,7 @@ def trim_tape(self, tape: GaiaTape) -> GaiaTape:
         """
         Make tape shorter to fit llm context size limits
         """
-        finish_subtask_positions = [i for i, step in enumerate(tape) if isinstance(step, FinishSubtask)]
-        # trim either after last finished subtask or at 2/3 of the tape
-        summarization_border = (finish_subtask_positions[-1] + 1) if finish_subtask_positions else int(len(tape) * 0.66)
+        summarization_border = int(len(tape) * 0.66)  # trim at 2/3 of the tape
         short_tape = tape.model_copy(update=dict(steps=[]))
         pre_tape: GaiaTape = tape[:summarization_border]  # type: ignore
         for step in pre_tape.steps:
@@ -100,22 +96,19 @@ def trim_tape(self, tape: GaiaTape) -> GaiaTape:


 class GaiaAgent(Agent):
-    subtasks: bool
-
     @classmethod
     def create(
         cls,
         llm: LLM,
         planning_mode: PlanningMode = PlanningMode.simple,
-        subtasks: bool = False,
     ):
-        nodes = cls.prepare_guidance(planning_mode, subtasks)
-        return super().create(llm, nodes=nodes, max_iterations=2, subtasks=subtasks)
+        nodes = cls.prepare_guidance(planning_mode)
+        return super().create(llm, nodes=nodes, max_iterations=2)

     @classmethod
-    def prepare_guidance(cls, planning_mode: PlanningMode, subtasks: bool) -> list[GaiaNode]:
+    def prepare_guidance(cls, planning_mode: PlanningMode) -> list[GaiaNode]:
         """
-        Prepare mononodes based on the planning mode and subtasks flag
+        Prepare mononodes based on the planning mode
         """
         guidance_nodes = []
         if planning_mode == PlanningMode.simple:
@@ -179,6 +172,4 @@ def prepare_guidance(cls, planning_mode: PlanningMode, subtasks: bool) -> list[G
             ]
         else:
             raise ValueError(f"Unknown planning mode: {planning_mode}")
-        if subtasks:
-            guidance_nodes.append(GaiaNode(name="check_subtask_finished", guidance=PromptRegistry.is_subtask_finished))
         return guidance_nodes
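The `trim_tape` change above replaces the subtask-aware summarization border with a fixed fraction of the tape length. A standalone sketch of the new rule (the helper name here is mine, not from the PR):

```python
def summarization_border(tape_len: int) -> int:
    # With FinishSubtask steps removed, the border is always at
    # roughly 2/3 of the tape length (int() truncates toward zero).
    return int(tape_len * 0.66)
```

In the diff, steps before this border are filtered down to the essentials while everything after it is kept verbatim, so the agent retains its recent context in full.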
45 changes: 40 additions & 5 deletions examples/gaia_agent/environment.py
@@ -2,6 +2,7 @@

 from tapeagents.environment import Environment
 from tapeagents.tools.calculator import calculate
+from tapeagents.tools.container_executor import CodeBlock, ContainerExecutor
 from tapeagents.tools.python_interpreter import python_calculate, run_python_code
 from tapeagents.tools.simple_browser import SimpleTextBrowser
 from tapeagents.utils import FatalError
@@ -27,8 +28,14 @@


 class GaiaEnvironment(Environment):
-    def __init__(self, safe_calculator: bool = True, **kwargs) -> None:
+    def __init__(
+        self,
+        safe_calculator: bool = True,
+        code_sandbox: ContainerExecutor | None = None,
+        **kwargs,
+    ) -> None:
         super().__init__()
+        self.code_sandbox = code_sandbox
         self.browser = SimpleTextBrowser(**kwargs)
         self.calculate = calculate if safe_calculator else python_calculate

@@ -76,10 +83,28 @@ def react(self, tape: GaiaTape) -> GaiaTape:
                     result = self.calculate(action.expression, action.facts or {})
                     tape = tape.append(CalculationResultObservation(name=action.fact_name, result=result))
                 case PythonCodeAction():
-                    result, stdout, stderr = run_python_code(action.code, action.facts or {})
-                    tape = tape.append(
-                        CodeResultObservation(name=action.fact_name, result=result, stdout=stdout, stderr=stderr)
-                    )
+                    if self.code_sandbox is not None:
+                        result = self.code_sandbox.execute_code_blocks(
+                            [CodeBlock(code=print_last_line(action.code), language="python")]
+                        )
+                        obs = CodeResultObservation(
+                            result=result.output.strip(),
+                            stdout=f"Exit code: {result.exit_code}",
+                            stderr="",
+                        )
+                    else:
+                        # TODO: remove this option and permutations crutch
+                        logger.warning(f"Code sandbox is not provided, running code locally!\n{action.code}")
+                        if "permutations" in action.code:
+                            result, stdout, stderr = "", "", "Execution timeout"
+                        else:
+                            result, stdout, stderr = run_python_code(action.code, {})
+                        obs = CodeResultObservation(
+                            result=result,
+                            stdout=stdout,
+                            stderr=stderr,
+                        )
+                    tape = tape.append(obs)
                 case LLMOutputParsingFailureAction():
                     pass
                 case _:
@@ -91,3 +116,13 @@ def react(self, tape: GaiaTape) -> GaiaTape:
                 tape = tape.append(ActionExecutionFailure(error=str(e)))
                 break
         return tape
+
+
+def print_last_line(python_code: str) -> str:
+    lines = python_code.splitlines()
+    if " = " in lines[-1]:
+        name = lines[-1].split("=")[0].strip()
+        lines.append(f"print({name})")
+    else:
+        lines[-1] = f"print({lines[-1]})"
+    return "\n".join(lines)
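The `print_last_line` helper added at the end of `environment.py` rewrites a code snippet so that its final value is printed, which lets the container executor capture the result from stdout. A standalone copy with comments, matching the behavior in the diff:

```python
def print_last_line(python_code: str) -> str:
    lines = python_code.splitlines()
    if " = " in lines[-1]:
        # Last line is an assignment: append a print of the assigned name.
        name = lines[-1].split("=")[0].strip()
        lines.append(f"print({name})")
    else:
        # Last line is a bare expression: wrap it in print().
        lines[-1] = f"print({lines[-1]})"
    return "\n".join(lines)
```

Note the helper assumes the last line is either a simple single-target assignment or a bare expression; compound statements or multi-target assignments on the final line would need more careful handling.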
79 changes: 28 additions & 51 deletions examples/gaia_agent/eval.py
@@ -9,14 +9,15 @@

 from huggingface_hub import snapshot_download
 from termcolor import colored

+from tapeagents.core import TerminationStep
 from tapeagents.io import load_tapes, save_json_tape
-from tapeagents.orchestrator import main_loop
+from tapeagents.orchestrator import MainLoopStatus, main_loop
 from tapeagents.renderers import step_view

 from .agent import GaiaAgent
 from .environment import GaiaEnvironment
 from .scorer import question_scorer
-from .steps import GaiaAnswer, GaiaQuestion, PlanThought
+from .steps import GaiaQuestion
 from .tape import GaiaMetadata, GaiaTape

 logger = logging.getLogger(__name__)
@@ -96,56 +97,32 @@ def load_dataset(split: str):
     return tasks


-def solve_task(task: dict, agent: GaiaAgent, env: GaiaEnvironment, n_attempts: int = 1) -> GaiaTape:
+def solve_task(task: dict, agent: GaiaAgent, env: GaiaEnvironment, level: int, tries: int = 1) -> GaiaTape:
     question = task_to_question_step(task, env)
-    tapes: list[GaiaTape] = []
-    results: list[Any] = []
-    previous_plans: list[str] = []
-    while len(tapes) < n_attempts:
-        predicted = None
-        tries = 3
-        while not predicted and tries:
-            tape = GaiaTape(steps=[question])
-            logger.info(colored(f"Attempt {len(tapes)+1}", "green"))
-            discard_attempt = False
-            planned = False
-            step = None
-            try:
-                for event in main_loop(agent, tape, env, max_loops=30):
-                    if event.agent_event and event.agent_event.step:
-                        step = event.agent_event.step
-                        tape = tape.append(step)  # type: ignore
-                        if isinstance(step, PlanThought) and not planned:
-                            plan_dump = "\n".join(step.plan)
-                            if plan_dump in previous_plans:
-                                logger.info("Plan already been used, discard attempt")
-                                discard_attempt = True
-                                break
-                            else:
-                                planned = True
-                                previous_plans.append(plan_dump)
-                    if event.observation:
-                        tape = tape.append(event.observation)  # type: ignore
-                if discard_attempt:
-                    continue
-            except Exception as e:
-                tape.metadata.error = str(e)
-                logger.exception(f"Failed to solve task: {e}")
-                break
-            predicted = step.answer if isinstance(step, GaiaAnswer) else None
-            tries -= 1
-        predicted = str(predicted)
-        tapes.append(tape)
-        results.append(predicted)
-        logger.info(f"Expected: {task['Final answer']}, Agent produced: {predicted}")
-    logger.info(f"Produced {len(tapes)} tapes, vote")
-    best = majority_vote(results)
-    logger.info(f"Majority vote best non-empty result: {best}, out of {results}")
-    best_tape = tapes[best]
-    best_tape.metadata = GaiaMetadata.model_validate(
-        best_tape.metadata.model_dump() | {"task": task, "result": results[best]}
-    )
-    return best_tape
+    result = None
+    tape = GaiaTape(steps=[question])
+    while not result and tries:
+        tape = GaiaTape(steps=[question])
+        try:
+            for event in main_loop(agent, tape, env, max_loops=60):
+                if event.agent_event and event.agent_event.step:
+                    tape = tape.append(event.agent_event.step)  # type: ignore
+                if event.observation:
+                    tape = tape.append(event.observation)  # type: ignore
+                if event.status == MainLoopStatus.TERMINATED:
+                    tape = tape.append(TerminationStep())
+                    tape.metadata.terminated = True
+        except Exception as e:
+            tape.metadata.error = str(e)
+            logger.exception(f"Fatal Error. Failed to solve task: {e}")
+            break
+        result = getattr(tape[-1], "answer", None)
+        tries -= 1
+    result = getattr(tape[-1], "answer", None)
+    logger.info(f"Expected: {task['Final answer']}, Agent produced: {result}")
+    tape.metadata = GaiaMetadata.model_validate(tape.metadata.model_dump() | {"task": task, "result": str(result)})
+    tape.metadata.level = level
+    return tape


def task_to_question_step(task: dict, env: GaiaEnvironment, max_doc_length: int = 8000) -> GaiaQuestion:
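The new `solve_task` drops the majority-vote-over-attempts logic in favor of a simple retry: rebuild the tape and rerun until an answer appears or the try budget runs out. A minimal sketch of that control flow (function and parameter names here are mine, not from the PR):

```python
def retry_until_result(run_once, tries: int = 1):
    # Rerun until a truthy result appears or the budget is exhausted,
    # mirroring the `while not result and tries` loop in solve_task.
    result = None
    while not result and tries:
        result = run_once()
        tries -= 1
    return result
```

Each iteration starts from a fresh tape, so a failed attempt leaves no state behind; the last tape produced is returned regardless of whether it contains an answer.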