Skip to content

Commit

Permalink
Merge pull request #147 from ServiceNow/browsergym_gaia
Browse files Browse the repository at this point in the history
Use full browser in gaia
  • Loading branch information
ollmer authored Jan 28, 2025
2 parents 9918a41 + 5703644 commit 529e463
Show file tree
Hide file tree
Showing 118 changed files with 4,048 additions and 10,796 deletions.
10 changes: 3 additions & 7 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -108,14 +108,10 @@ cython_debug/

*.jsonl
!/tests/**/*.jsonl

*.sqlite
!tests/**/**.sqlite

*.zip

*.log
*.sqlite
*.zip

# Weights & Biases
/wandb/
/temp/
/temp/
2 changes: 0 additions & 2 deletions conf/gaia_llama.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@ agent:
subtasks: false

env:
attachment_dir: ${exp_path}/attachments/
image_observations: false
use_web_cache: true

hydra:
Expand Down
25 changes: 18 additions & 7 deletions conf/gaia_openai.yaml
Original file line number Diff line number Diff line change
@@ -1,18 +1,29 @@
defaults:
- llm: gpt4o
- llm: gpt4o_mini
- _self_

exp_name: gpt4o_val_search1

exp_name: gpt4o_mini_val_axtree2

exp_path: outputs/gaia/runs/${exp_name}
split: validation
batch: 1
batch: 16
retry_unsolved: true

only_tasks: [] # list of (level, task_num)
# - [1, 0]
# - [1, 1]
# - [1, 2]
# - [1, 3]
# - [1, 4]
# - [1, 5]
# - [1, 6]
# - [1, 7]

agent:
plain_code: false

env:
attachment_dir: ${exp_path}/attachments/
image_observations: true
simple_browser: false
use_web_cache: true

studio:
Expand All @@ -21,4 +32,4 @@ studio:

hydra:
run:
dir: ${exp_path}
dir: ${exp_path}
1 change: 0 additions & 1 deletion conf/workarena_demo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ exp_path: ../workarena/runs/${exp_name}
agent: baseline
env:
exp_path: ${exp_path}
baseline_obs: True
headless: False
seeds: [42]

Expand Down
1 change: 0 additions & 1 deletion conf/workarena_openai.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ exp_path: ../workarena/runs/${exp_name}
agent: guided
env:
exp_path: ${exp_path}
baseline_obs: False
headless: True
seeds: [0, 42, 1337, 900, 103]

Expand Down
132 changes: 23 additions & 109 deletions examples/gaia_agent/agent.py
Original file line number Diff line number Diff line change
@@ -1,136 +1,50 @@
import logging
from enum import Enum
from typing import Any

from pydantic import Field

from tapeagents.agent import Agent
from tapeagents.environment import CodeExecutionResult, ExecuteCode
from tapeagents.core import Step
from tapeagents.llms import LLM
from tapeagents.nodes import MonoNode
from tapeagents.steps import VideoObservation
from tapeagents.tools.container_executor import extract_code_blocks
from tapeagents.steps import ActionExecutionFailure, VideoObservation
from tapeagents.tools.simple_browser import PageObservation

from .prompts import PromptRegistry
from .steps import (
ActionExecutionFailure,
CalculationResultObservation,
CodeResultObservation,
FinishSubtask,
GaiaAgentStep,
GaiaQuestion,
ListOfFactsThought,
NewFactThought,
PageObservation,
PlanThought,
SourcesThought,
all_steps,
nocode_steps,
plan_steps,
)
from .steps import THOUGHTS, FactsSurvey, Plan
from .tape import GaiaTape

logger = logging.getLogger(__name__)


class PlanningMode(str, Enum):
simple = "simple"
facts_and_sources = "facts_and_sources"
multiplan = "multiplan"
replan_after_sources = "replan_after_sources"
reflect = "reflect"


class GaiaNode(MonoNode):
system_prompt: str = PromptRegistry.system_prompt
steps_prompt: str = PromptRegistry.allowed_steps
agent_step_cls: Any = Field(exclude=True, default=GaiaAgentStep)
allowed_steps: str

def get_steps_description(self, tape: GaiaTape, agent: Any) -> str:
"""
Allow different subset of steps based on the agent's configuration
"""
return self.steps_prompt.format(allowed_steps=self.allowed_steps)

def prepare_tape(self, tape: GaiaTape, max_chars: int = 200) -> GaiaTape:
"""
Trim long observations except for the last 3 steps
"""
tape = super().prepare_tape(tape) # type: ignore
steps = []
for step in tape.steps[:-3]:
if isinstance(step, PageObservation):
short_text = f"{step.text[:max_chars]}\n..." if len(step.text) > max_chars else step.text
new_step = step.model_copy(update=dict(text=short_text))
elif isinstance(step, ActionExecutionFailure):
short_error = f"{step.error[:max_chars]}\n..." if len(step.error) > max_chars else step.error
new_step = step.model_copy(update=dict(error=short_error))
steps_border = -3
for step in tape.steps[:steps_border]:
if isinstance(step, PageObservation) and len(step.text) > max_chars:
trimmed_step = step.model_copy(update=dict(text=f"{step.text[:max_chars]}\n..."))
elif isinstance(step, ActionExecutionFailure) and len(step.error) > max_chars:
trimmed_step = step.model_copy(update=dict(error=f"{step.error[:max_chars]}\n..."))
elif isinstance(step, VideoObservation):
new_step = step.model_copy(update=dict(video_contact_sheet_paths=None, subtitle_text=None))
trimmed_step = step.model_copy(update=dict(video_contact_sheet_paths=None, subtitle_text=None))
else:
new_step = step
steps.append(new_step)
trimmed_tape = tape.model_copy(update=dict(steps=steps + tape.steps[-3:]))
return trimmed_tape

def trim_tape(self, tape: GaiaTape) -> GaiaTape:
"""
Make tape shorter to fit llm context size limits
"""
finish_subtask_positions = [i for i, step in enumerate(tape) if isinstance(step, FinishSubtask)]
# trim either after last finished subtask or at 2/3 of the tape
summarization_border = (finish_subtask_positions[-1] + 1) if finish_subtask_positions else int(len(tape) * 0.66)
short_tape = tape.model_copy(update=dict(steps=[]))
pre_tape: GaiaTape = tape[:summarization_border] # type: ignore
for step in pre_tape.steps:
if isinstance(
step,
(
GaiaQuestion,
PlanThought,
SourcesThought,
ListOfFactsThought,
NewFactThought,
CalculationResultObservation,
CodeResultObservation,
CodeExecutionResult,
),
):
short_tape.steps.append(step)
for step in tape.steps[summarization_border:]:
short_tape.steps.append(step)
logger.info(f"Tape reduced from {len(tape)} to {len(short_tape)} steps")
return short_tape

def parse_completion(self, llm_output: str, prompt_id: str):
if llm_output.strip().startswith("```"):
code_blocks = extract_code_blocks(llm_output)
yield ExecuteCode(code=code_blocks)
else:
for step in super().parse_completion(llm_output, prompt_id):
yield step
trimmed_step = step
steps.append(trimmed_step)
return tape.model_copy(update=dict(steps=steps + tape.steps[steps_border:]))


class GaiaAgent(Agent):
plain_code: bool
name: str = "gaia_agent_v3"

@classmethod
def create(cls, llm: LLM, plain_code: bool = False, **kwargs):
def create(cls, llm: LLM, actions: tuple[Step, ...], plain_code: bool = False, **kwargs):
steps_prompt = PromptRegistry.allowed_steps_code if plain_code else PromptRegistry.allowed_steps
steps = actions + THOUGHTS
nodes = [
GaiaNode(name="plan", guidance=PromptRegistry.plan, allowed_steps=plan_steps),
GaiaNode(name="facts_survey", guidance=PromptRegistry.facts_survey, allowed_steps=plan_steps),
GaiaNode(
name="start_execution",
guidance=PromptRegistry.start_execution,
steps_prompt=PromptRegistry.allowed_steps_code if plain_code else PromptRegistry.allowed_steps,
allowed_steps=nocode_steps if plain_code else all_steps,
),
GaiaNode(
name="act",
steps_prompt=PromptRegistry.allowed_steps_code if plain_code else PromptRegistry.allowed_steps,
allowed_steps=nocode_steps if plain_code else all_steps,
next_node="act",
),
GaiaNode(name="plan", guidance=PromptRegistry.plan, agent_steps=Plan),
GaiaNode(name="facts_survey", guidance=PromptRegistry.facts_survey, agent_steps=FactsSurvey),
GaiaNode(name="start", guidance=PromptRegistry.start, steps_prompt=steps_prompt, agent_steps=steps),
GaiaNode(name="act", steps_prompt=steps_prompt, agent_steps=steps, next_node="act"),
]
return super().create(llm, nodes=nodes, max_iterations=2, plain_code=plain_code, **kwargs)
return super().create(llm, nodes=nodes, max_iterations=2, **kwargs)
Loading

0 comments on commit 529e463

Please sign in to comment.