ServiceNow · ollmer · Sep 25, 2024 · Sep 24, 2024 · Sep 24, 2024 · Sep 24, 2024
diff --git a/examples/tape_improver.py b/examples/tape_improver.py
@@ -5,7 +5,6 @@
 
 from tapeagents.agent import Agent
 from tapeagents.chain import Chain
-from tapeagents.team import TeamTape
 from tapeagents.core import (
     Action,
     AgentStep,
@@ -18,6 +17,7 @@
 from tapeagents.llms import LLM, LiteLLM, LLMStream
 from tapeagents.observe import observe_tape
 from tapeagents.rendering import PrettyRenderer
+from tapeagents.team import TeamTape
 from tapeagents.utils import run_in_tmp_dir_to_make_test_data
 from tapeagents.view import Call, Respond
 
@@ -122,7 +122,7 @@ def improver_tape_view(tape: Tape) -> str:
         data.append(step.llm_dict())
         data[-1]["index"] = index
         if isinstance(step, AgentStep):
-            data[-1]["by"] = step.by
+            data[-1]["by"] = step._metadata.by
     return json.dumps(data, indent=2)
 
 

diff --git a/examples/workarena/environment.py b/examples/workarena/environment.py
@@ -69,8 +69,8 @@ def start_task(
             text=text,
             current_page=self.browser.current_viewport,
             total_pages=self.browser.n_viewports,
-            screenshot_path=screen,
         )
+        obs._metadata.other["screenshot_path"] = screen
         tape = WorkArenaTape(steps=[obs, WorkArenaTask(task=info["goal"])])
         return tape, info
 
@@ -121,14 +121,15 @@ def react(self, tape: WorkArenaTape) -> WorkArenaTape:
 
     def perform_browser_action(self, action: str) -> PageObservation:
         text, screen, last_action_error, finished = self.browser.perform_action(action, self.baseline_obs)
-        return PageObservation(
+        obs = PageObservation(  # model fields only; bookkeeping values are attached to the metadata below
             text=text,
             current_page=self.browser.current_viewport,
             total_pages=self.browser.n_viewports,
-            screenshot_path=screen,
-            env_finished=finished,
             last_action_error=last_action_error,
         )
+        obs._metadata.other["screenshot_path"] = screen  # stored in step metadata instead of a model field
+        obs._metadata.other["env_finished"] = finished  # stored in step metadata instead of a model field
+        return obs
 
     def goto_page(self, action: GotoPageAction) -> PageObservation:
         return self.perform_browser_action(f"goto('{action.url}')")
@@ -156,7 +157,6 @@ def scroll(self, action: ScrollAction) -> PageObservation:
             text=self.browser.scroll(action.direction),
             current_page=self.browser.current_viewport,
             total_pages=self.browser.n_viewports,
-            screenshot_path="",
         )
 
     def tab_focus(self, action: TabFocusAction) -> PageObservation:

diff --git a/examples/workarena/steps.py b/examples/workarena/steps.py
@@ -34,13 +34,8 @@ class PageObservation(WorkArenaObservation):
     text: str
     current_page: int
     total_pages: int
-    screenshot_path: str = ""
-    env_finished: bool = False
     last_action_error: str = ""
 
-    def llm_dict(self) -> dict[str, Any]:
-        return self.model_dump(exclude={"prompt_id", "screenshot_path", "env_finished"}, exclude_none=True)
-
 
 class ReasoningThought(WorkArenaThought):
     """

diff --git a/examples/workarena/tape_browser.py b/examples/workarena/tape_browser.py
@@ -138,8 +138,18 @@ def __init__(self, root_folder: str) -> None:
     def render_step(self, step: Step | dict, folded: bool = True, **kwargs) -> str:
+        """Render a step as HTML and wrap it with its screenshot when a screenshot path is recorded."""
         step_dict = step.model_dump() if isinstance(step, Step) else step
         html = super().render_step(step, folded, **kwargs)
+        # The screenshot file name may be a legacy top-level field or an entry in the step metadata.
+        screenshot_path = None
         if "screenshot_path" in step_dict:
-            screenshot_url = os.path.join("static", kwargs["tape_dir"], "screenshots", step_dict["screenshot_path"])
+            screenshot_path = step_dict["screenshot_path"]
+        if isinstance(step, Step):
+            # Underscore-prefixed model attributes are not part of model_dump(), so read them from the object.
+            screenshot_path = step._metadata.other.get("screenshot_path", screenshot_path)
+        elif "screenshot_path" in step_dict.get("_metadata", {}).get("other", {}):
+            screenshot_path = step_dict["_metadata"]["other"]["screenshot_path"]
+        if screenshot_path:
+            screenshot_url = os.path.join("static", kwargs["tape_dir"], "screenshots", screenshot_path)
             html = f"<div class='basic-renderer-box' style='background-color:#baffc9;'><div><img src='{screenshot_url}' style='max-width: 100%;'></div>{html}</div>"
         return html
 

diff --git a/tapeagents/agent.py b/tapeagents/agent.py
@@ -308,7 +308,7 @@ def run_iteration(
             llm_stream = self.llm.generate(prompt) if prompt else LLMStream(None, prompt)
         for item in self.generate_steps(tape, llm_stream):
             if isinstance(item, AgentStep):
-                item.prompt_id = llm_stream.prompt.id
+                item._metadata.prompt_id = llm_stream.prompt.id
                 yield item
                 new_steps.append(item)
             else:
@@ -326,13 +326,13 @@ def _run_implementation():
             stop = False
             while n_iterations < self.max_iterations and not stop:
                 current_subagent = self.delegate(tape)
-                for item in current_subagent.run_iteration(tape):
-                    if isinstance(item, PartialStep):
-                        yield AgentEvent(partial_step=item)
-                    elif isinstance(item, AgentStep):
-                        item.by = current_subagent.full_name
-                        tape = tape.append(item)
-                        yield AgentEvent(step=item, partial_tape=tape)
+                for step in current_subagent.run_iteration(tape):
+                    if isinstance(step, PartialStep):
+                        yield AgentEvent(partial_step=step)
+                    elif isinstance(step, AgentStep):
+                        step._metadata.by = current_subagent.full_name
+                        tape = tape.append(step)
+                        yield AgentEvent(step=step, partial_tape=tape)
                         if self.should_stop(tape):
                             stop = True
                     else:

diff --git a/tapeagents/core.py b/tapeagents/core.py
@@ -27,10 +27,24 @@ def completion_str(self) -> str:
         return self.text[-self.n_predicted :]
 
 
+class StepMetadata(BaseModel):
+    """
+    Bookkeeping attached to a step (identity, origin, extras); Step.llm_dict() leaves it out of the LLM view
+    """
+
+    id: str = Field(default_factory=lambda: str(uuid4()))  # random UUID string, generated per StepMetadata instance
+    prompt_id: str = ""  # id of the prompt whose LLM stream produced the step (assigned in run_iteration)
+    task: str = ""  # task being solved when the step was produced, set through AgentStep.task()
+    by: str = ""  # full name of the agent that produced the step
+    other: dict[str, Any] = Field(default_factory=dict)  # free-form extras, e.g. "screenshot_path", "env_finished"
+
+
 class Step(BaseModel):
+    _metadata: StepMetadata = StepMetadata()  # NOTE(review): presumably a pydantic private attribute (leading underscore); this default instance, and its id, is built once at class definition - confirm every step still gets its own metadata/id
+
     def llm_dict(self) -> dict[str, Any]:
         """Dump step data only, drop the metadata"""
-        return self.model_dump(exclude_none=True)
+        return self.model_dump(exclude_none=True, exclude={"_metadata"})  # explicitly keep metadata out of the LLM-facing dict
 
     def llm_view(self, indent: int = 2) -> str:
         return json.dumps(self.llm_dict(), indent=indent, ensure_ascii=False)
@@ -51,12 +65,14 @@ class Error(Observation):
 
 
 class AgentStep(Step):
-    prompt_id: str = ""
-    task: str = ""
-    by: str = ""
+    # Prompt id, task and author are no longer model fields here; they are kept in `_metadata`.
 
-    def llm_dict(self) -> dict:
-        return self.model_dump(exclude={"prompt_id", "task", "by"}, exclude_none=True)
+    def task(self, task: str) -> Self:
+        """
+        Record the task being solved when the step is produced and return the step itself, so the call can be chained
+        """
+        self._metadata.task = task
+        return self
 
 
 class Thought(AgentStep):

diff --git a/tapeagents/guided_agent.py b/tapeagents/guided_agent.py
@@ -125,5 +125,5 @@ def parse_completion(self, completion: str, prompt_id: str) -> Generator[Step, N
             )
             return
         for step in steps:
-            step.prompt_id = prompt_id
+            step._metadata.prompt_id = prompt_id
             yield step
diff --git a/tapeagents/llms.py b/tapeagents/llms.py
@@ -487,6 +487,7 @@ def _implementation():
                 known_prompts = list(self.completions.keys())
                 closest, score = closest_prompt(prompt_key, known_prompts)
                 if score >= 0.7:
+                    logger.warning(f"New prompt:\n{prompt_key}")  # logged, not printed: keeps library output under logging control
                     logger.warning(f"Closest prompt score {score:.3f}:\n{diff_strings(prompt_key, closest)}")
                 raise FatalError("prompt not found")
             yield LLMEvent(completion=LLMMessage(content=completion))

diff --git a/tapeagents/rendering.py b/tapeagents/rendering.py
@@ -2,24 +2,24 @@
 import re
 from typing import Any, Type
 
-from pydantic import BaseModel
 import yaml
+from pydantic import BaseModel
 
 from .agent import Agent
-from .team import CodeExecutionResult, ExecuteCode
-from .observe import LLMCall, retrieve_tape_llm_calls
 from .container_executor import CodeBlock
-from .view import Call, Respond
 from .core import Action, Episode, Observation, Prompt, Step, Tape, Thought
 from .dialog_tape import (
     AssistantStep,
-    DialogTape,
     DialogContext,
+    DialogTape,
     SystemStep,
     ToolCalls,
     ToolResult,
     UserStep,
 )
+from .observe import LLMCall, retrieve_tape_llm_calls
+from .team import CodeExecutionResult, ExecuteCode
+from .view import Call, Respond
 
 
 def render_dialog_plain_text(tape: DialogTape) -> str:
@@ -226,11 +226,11 @@ def render_step(self, step: Step, index: int, **kwargs):
             class_ = "observation"
         elif isinstance(step, Call):
             role = ""
-            title = f"{step.by.split('/')[-1]} calls {step.agent_name}"
+            title = f"{step._metadata.by.split('/')[-1]} calls {step.agent_name}"
             class_ = "call"
         elif isinstance(step, Respond):
             role = ""
-            parts = step.by.split("/")
+            parts = step._metadata.by.split("/")
             title = f"{parts[-1]} responds to {parts[-2]}"
             class_ = "return"
         elif isinstance(step, Thought):
@@ -433,4 +433,4 @@ def render_subagents(agent, indent=4):
 
 def render_tape_with_prompts(tape: Tape, renderer: BasicRenderer):
     llm_calls = retrieve_tape_llm_calls(tape)
-    return renderer.style + renderer.render_tape(tape, llm_calls)
+    return renderer.style + renderer.render_tape(tape, llm_calls)
diff --git a/tapeagents/team.py b/tapeagents/team.py
@@ -8,7 +8,7 @@
 from tapeagents.agent import DEFAULT, Agent, AgentStep, Node
 from tapeagents.autogen_prompts import SELECT_SPEAKER_MESSAGE_AFTER_TEMPLATE, SELECT_SPEAKER_MESSAGE_BEFORE_TEMPLATE
 from tapeagents.container_executor import extract_code_blocks
-from tapeagents.core import FinalStep, Jump, Pass, Prompt, Tape
+from tapeagents.core import FinalStep, Jump, Pass, Prompt, StepMetadata, Tape
 from tapeagents.environment import CodeExecutionResult, ExecuteCode
 from tapeagents.llms import LLM, LLMStream
 from tapeagents.view import Broadcast, Call, Respond, TapeViewStack
@@ -124,7 +124,7 @@ def create_chat_initiator(
             },
             llms={DEFAULT: llm} if llm else {},
             subagents=[teammate],
-            flow=([ExecuteCodeNode()] if execute_code else []) + [CallNode(), TerminateOrRepeatNode()],
+            flow=([ExecuteCodeNode()] if execute_code else []) + [CallNode(), TerminateOrRepeatNode()],  # type: ignore
             max_calls=max_calls,
             init_message=init_message,
         )
@@ -139,19 +139,18 @@ def generate_steps(
         view = ActiveTeamAgentView(agent, tape)
         recipients = agent.get_subagent_names()
         last = view.messages[-1]
-        from_ = last.by.split("/")[-1]
+        from_ = last._metadata.by.split("/")[-1]
         match last:
             case Call():
-                yield Broadcast(task=self.name, content=last.content, from_=from_, to=list(recipients))
+                yield Broadcast(content=last.content, from_=from_, to=list(recipients)).task(self.name)
             case Respond():
-                recipients = [name for name in recipients if name != last.by.split("/")[-1]]
+                recipients = [name for name in recipients if name != last._metadata.by.split("/")[-1]]
                 yield Broadcast(
-                    task=self.name,
                     content=view.messages[-1].content,
                     from_=from_,
                     to=list(recipients),
-                )
-            case Broadcast(task=self.name):
+                ).task(self.name)
+            case Broadcast(_metadata=StepMetadata(task=self.name)):
                 pass
             case _:
                 assert False
@@ -175,12 +174,12 @@ def generate_steps(
         # if last node
         (other,) = agent.subagents
         if view.should_generate_message:
-            yield Call(task=self.name, agent_name=other.name, content=llm_stream.get_text())
+            yield Call(agent_name=other.name, content=llm_stream.get_text()).task(self.name)
         elif view.exec_result:
-            yield Call(task=self.name, agent_name=other.name, content=_exec_result_message(agent, tape))
+            yield Call(agent_name=other.name, content=_exec_result_message(agent, tape)).task(self.name)
         else:
             assert agent.init_message and not view.messages
-            yield Call(task=self.name, agent_name=other.name, content=agent.init_message)
+            yield Call(agent_name=other.name, content=agent.init_message).task(self.name)
 
 
 class SelectAndCallNode(Node):
@@ -208,21 +207,21 @@ def generate_steps(
         callee_name = llm_stream.get_text()
         # check if the callee is an existing subagent
         _ = agent.find_subagent(callee_name)
-        yield Call(task=self.name, agent_name=callee_name)
+        yield Call(agent_name=callee_name).task(self.name)
 
 
 class ExecuteCodeNode(Node):
     name: str = "execute_code"
 
-    def generate_steps(self, agent: logging.Any, tape: Tape, llm_stream: LLMStream) -> Generator[AgentStep, None, None]:
+    def generate_steps(self, agent: TeamAgent, tape: Tape, llm_stream: LLMStream) -> Generator[AgentStep, None, None]:
         assert not llm_stream
         view = ActiveTeamAgentView(agent, tape)
         if view.last_non_empty_message is None:
-            yield Pass(task=self.name)
+            yield Pass().task(self.name)  # no non-empty message yet, nothing to execute; node name recorded as the task
         elif code := extract_code_blocks(view.last_non_empty_message.content):
-            yield ExecuteCode(task=self.name, code=code)
+            yield ExecuteCode(code=code).task(self.name)  # execute the code blocks extracted from the last non-empty message
         else:
-            yield Pass(task=self.name)
+            yield Pass().task(self.name)  # no code blocks were extracted from the last non-empty message
 
 
 class RespondNode(Node):
@@ -241,15 +240,15 @@ def generate_steps(
     ) -> Generator[AgentStep, None, None]:
         view = ActiveTeamAgentView(agent, tape)
         if view.should_generate_message:
-            yield Respond(task=self.name, content=llm_stream.get_text())
+            yield Respond(content=llm_stream.get_text()).task(self.name)
         elif view.exec_result:
-            yield Respond(task=self.name, content=_exec_result_message(agent, tape))
+            yield Respond(content=_exec_result_message(agent, tape)).task(self.name)
         else:
             logger.info(
                 f"Agent {agent.full_name} had to respond with an empty message."
                 f" You might want to optimize your orchestration logic."
             )
-            yield Respond(task=self.name)
+            yield Respond().task(self.name)
 
 
 class TerminateOrRepeatNode(Node):
@@ -261,9 +260,9 @@ def generate_steps(
         assert not llm_stream
         view = ActiveTeamAgentView(agent, tape)
         if view.should_stop:
-            yield FinalStep(task=self.name, reason="Termination message received")
+            yield FinalStep(reason="Termination message received").task(self.name)
         else:
-            yield Jump(task=self.name, next_node=0)
+            yield Jump(next_node=0).task(self.name)
 
 
 class RespondOrRepeatNode(Node):
@@ -274,9 +273,9 @@ def generate_steps(
     ) -> Generator[AgentStep, None, None]:
         view = ActiveTeamAgentView(agent, tape)
         if view.should_stop:
-            yield Respond(task=self.name)
+            yield Respond().task(self.name)
         else:
-            yield Jump(task=self.name, next_node=0)
+            yield Jump(next_node=0).task(self.name)
 
 
 def _exec_result_message(agent: TeamAgent, tape: TeamTape) -> str:
@@ -297,7 +296,7 @@ def _llm_messages_from_tape(agent: TeamAgent, tape: TeamTape) -> list[dict[str,
         match step:
             # When we make the LLM messages, we use "kind" == "user" for messages
             # originating from other agents, and "kind" == "assistant" for messages by this agent.
-            case Call() if step.by == agent.full_name:
+            case Call() if step._metadata.by == agent.full_name:
                 # I called someone
                 llm_messages.append({"role": "assistant", "content": step.content})
             case Call():
@@ -309,15 +308,15 @@ def _llm_messages_from_tape(agent: TeamAgent, tape: TeamTape) -> list[dict[str,
                     {
                         "role": "user",
                         "content": step.content,
-                        "name": step.by.split("/")[-1],
+                        "name": step._metadata.by.split("/")[-1],
                     }
                 )
-            case Respond() if step.by == agent.full_name:
+            case Respond() if step._metadata.by == agent.full_name:
                 # I responded to someone
                 llm_messages.append({"role": "assistant", "content": step.content})
             case Respond():
                 # someone responded to me
-                who_returned = step.by.split("/")[-1]
+                who_returned = step._metadata.by.split("/")[-1]
                 llm_messages.append({"role": "user", "content": step.content, "name": who_returned})
             case Broadcast():
                 llm_messages.append({"role": "user", "content": step.content, "name": step.from_})