Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP ] New planner and plan following for Gaia #99

Closed
wants to merge 65 commits into from
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
89672cf
new detailed planned, answer formalisation node
ollmer Nov 14, 2024
bc06b4b
explicit links to previous steps
ollmer Nov 14, 2024
170d6c6
support calling subagent from the mononode
ollmer Nov 15, 2024
62d2e4a
make plain text node common
ollmer Nov 15, 2024
238e2ff
controllable plan execution, full outline with node contents
ollmer Nov 15, 2024
83ef3e0
implement gaia new nodes, use free form thought for reflection everyw…
ollmer Nov 15, 2024
6f3e293
better debugging
ollmer Nov 15, 2024
a44ec5c
fix
ollmer Nov 15, 2024
85916dd
consistent node names, camel case
ollmer Nov 15, 2024
37d32f7
limit subagents scope using tape view stack
ollmer Nov 15, 2024
e8dff83
use free form reasoning for some thoughts
ollmer Nov 15, 2024
9921457
better subagent calls
ollmer Nov 15, 2024
f2495f4
proper return of subagent results
ollmer Nov 15, 2024
f092eda
pass any step positions as call args
ollmer Nov 15, 2024
173cb92
fix
ollmer Nov 15, 2024
f201acf
allow browser to show steps with missed prompts
ollmer Nov 18, 2024
1ebe86d
make formalize part of ThinkingNode, pass relevant facts to executor …
ollmer Nov 18, 2024
1c04dda
appendable web cache file
ollmer Nov 18, 2024
e91b5d5
update prompts, fix facts ledger update
ollmer Nov 18, 2024
4931d63
improve facts ledger
ollmer Nov 18, 2024
21834a4
pass subagent args using reference node and respond from subagent usi…
ollmer Nov 18, 2024
32459f3
simplify executor call, fix ledger update
ollmer Nov 18, 2024
440c648
remove print task in debug
ollmer Nov 18, 2024
aa022f1
adjust render
ollmer Nov 19, 2024
94e4304
mononode v2
ollmer Nov 19, 2024
c7b18ae
better debug
ollmer Nov 19, 2024
e9ed2e3
remove n_attempts and subtasks from old gaia
ollmer Nov 19, 2024
bffd013
reasoner subagent, use old gaia agent as executor
ollmer Nov 19, 2024
ca3c753
first eval of v2
ollmer Nov 19, 2024
a3f4e39
fix reasoner call
ollmer Nov 19, 2024
12f4b5a
update prompts, more fact format options
ollmer Nov 20, 2024
0d54e11
concurrent llm cache
ollmer Nov 20, 2024
7235004
allow load tapes with legacy steps in browser and results aggregation
ollmer Nov 20, 2024
575b69c
prompt adjust
ollmer Nov 20, 2024
3474156
fix
ollmer Nov 20, 2024
9092779
gaia node v2
ollmer Nov 20, 2024
7e53c5a
code sandbox
ollmer Nov 20, 2024
20f3fb3
fix v2 agent and test
ollmer Nov 21, 2024
79eadd5
fixes
ollmer Nov 21, 2024
48fb5b3
pass subtask result to the dependent task, looped reasoner with reading
ollmer Nov 21, 2024
5178f84
fix code execution
ollmer Nov 21, 2024
2668be6
reasonser start guidance
ollmer Nov 21, 2024
17dfb8a
coder agent, stop replan after 3 attempts
ollmer Nov 21, 2024
3c4f6ad
fix
ollmer Nov 21, 2024
be109b4
llm info and cost tracking
rizar Nov 21, 2024
a0b151c
actually log llm info
rizar Nov 21, 2024
0c9d187
better replan, guess when failed
ollmer Nov 22, 2024
ff35365
record loop termination on the tape
ollmer Nov 22, 2024
22cc688
show terminations in tape browser
ollmer Nov 22, 2024
0e83540
more loops for new arch
ollmer Nov 22, 2024
48e68d9
fix
ollmer Nov 22, 2024
442ada4
fix guess node
ollmer Nov 22, 2024
d9820ce
fix most tests
rizar Nov 22, 2024
cf24342
Merge branch 'gaia_planner' into llm_costs_and_info
rizar Nov 22, 2024
fc20b08
rm comment
rizar Nov 22, 2024
3791205
don't crash for local models because of LLM cost
rizar Nov 22, 2024
4e89b87
Merge pull request #108 from ServiceNow/llm_costs_and_info
rizar Nov 22, 2024
508ee8d
Merge branch 'main' into gaia_planner
ollmer Dec 2, 2024
dd49555
fix
ollmer Dec 2, 2024
3c5712f
fix test
ollmer Dec 2, 2024
7eb7fa5
update workarena agent, better llm replay errors
ollmer Dec 2, 2024
2eb1342
fix old agent
ollmer Dec 2, 2024
9c42610
update gaia test
ollmer Dec 2, 2024
31b5a41
update intro test
ollmer Dec 2, 2024
4ce872c
Merge branch 'main' into gaia_planner
ollmer Dec 2, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 173 additions & 2 deletions examples/gaia_agent/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
{prompt}
"""

FACTS_SURVEY = f"""Before we begin executing the plan, please answer the following pre-survey to the best of your ability.
FACTS_SURVEY = f"""Before we begin executing the plan, please answer the following pre-survey to the best of your ability.
Keep in mind that you are Ken Jennings-level with trivia, and Mensa-level with puzzles, so there should be a deep well to draw from.
For each fact provide the description, expected json-compatible format and, if possible, measurement unit.
For each fact provide the description, expected json-compatible format and, if possible, measurement unit.
The fact name should be short and in lowercase. The description should be detailed, self-sustained and informative.
Here is the pre-survey:

Expand All @@ -54,6 +54,159 @@
If the objective has not been achieved, produce the next step.
"""

# V2 facts pre-survey prompts. The system message frames the task request;
# the guidance message then asks the model to classify what it knows into
# four fixed fact categories (given / to look up / to derive / guesses).
# The four headings below are parsed downstream — presumably by the facts
# ledger code — so the prompt insists the model emits exactly these headings.
FACTS_SURVEY_V2_SYSTEM = """Below I will present you a request. Before we begin addressing the request, please answer the following pre-survey to the best of your ability. Keep in mind that you are Ken Jennings-level with trivia, and Mensa-level with puzzles, so there should be a deep well to draw from.

Here is the request:"""

FACTS_SURVEY_V2_GUIDANCE = """
Here is the pre-survey:

1. Please list any specific facts or figures that are GIVEN in the request itself. It is possible that there are none.
2. Please list any facts that may need to be looked up, and WHERE SPECIFICALLY they might be found. In some cases, authoritative sources are mentioned in the request itself.
3. Please list any facts that may need to be derived (e.g., via logical deduction, simulation, or computation)
4. Please list any facts that are recalled from memory, hunches, well-reasoned guesses, etc.

When answering this survey, keep in mind that "facts" will typically be specific names, dates, statistics, etc. Your answer should use headings:

1. GIVEN OR VERIFIED FACTS
2. FACTS TO LOOK UP
3. FACTS TO DERIVE
4. EDUCATED GUESSES

DO NOT include any other headings or sections in your response. DO NOT list next steps or plans until asked to do so.
"""

# Template constraining the model to emit only JSON matching a step schema.
# {schema} is a str.format placeholder filled by the caller with the JSON
# schema of the allowed step types.
ALLOWED_STEPS_V2 = """
Produce answer with the following json schema:
{schema}
Do not reproduce schema when producing the step, use it only as a reference!
DO NOT OUTPUT ANYTHING BESIDES THE JSON. It will break the system that processes the output.
"""

# Description of the available sub-agents ("team members") and what each is
# good at; written in a form suitable for pasting into a planning prompt.
TEAM = """
WebSurfer: A helpful assistant with access to a web browser. Ask them to perform web searches, open pages, and interact with content (e.g., clicking links, scrolling the viewport, etc., filling in form fields, etc.) It can also summarize the entire page, or answer questions based on the content of the page. It can also be asked to sleep and wait for pages to load, in cases where the pages seem to be taking a while to load.
Coder: A helpful and general-purpose AI programmer that has strong language skills, Python skills, and Linux command line skills. Avoid using for reading common file formats, as FileSurfer is more proficient at this. Can be helpful to process previously extracted numerical facts.
FileSurfer: An agent that can handle reading of local files only, could be helpful to read documents attached to the task or downloaded from the web by WebSurfer. Proficient with PDF, DOCX, XLSX, CSV, PPTX and other common formats.
"""

# V2 planning prompt: asks the model to first draft, then expand into a
# bullet-point plan where every step lists a description, tools/expected
# artifacts, and prerequisites. No format placeholders — sent verbatim.
PLAN_V2 = """To address this request we have following tools:

Web search, web surfing, python code execution, reading local files, reasoning.

Based on the available tools and known and unknown facts, please reason step by step and produce short draft of the plan to solve this task.
After initial reasoning and drafting, produce more detailed bullet-point plan for addressing the original request. For each step of the plan, provide the following:
- detailed description of things to do
- list of tools and expected outcomes like concrete artifacts at the end of the step: facts, files, documents, or data to be produced
- prerequisites, a list of the results of the previous steps, or known facts needed to start working on this step.
"""

# Prompts for the "formalize" step: converting a free-text model answer into
# a structured JSON step. System prompt sets the role; FORMALIZE_INPUT wraps
# the plain text ({content} is a str.format placeholder); guidance triggers
# the conversion; FORMALIZE_FORMAT supplies the target JSON schema ({schema}).
FORMALIZE_SYSTEM_PROMPT = """
You are an expert AI Agent trained to produce complex json structure from the plain text input.
"""

FORMALIZE_INPUT = """
Plain text input to be converted into json structure:
{content}
"""

FORMALIZE_GUIDANCE = """
Please produce the json structure from the plain text input in the previous message.
"""

FORMALIZE_FORMAT = """
Produce step using the following json schema:
{schema}
Do not reproduce schema fields when producing the step, use it only as a reference!
DO NOT OUTPUT ANYTHING BESIDES THE JSON. It will break the system that processes the output.
"""

# Kick-off prompt for the execution phase: asks for a brief recap of the
# required steps before acting. Sent verbatim (no placeholders).
START_EXECUTION_V2 = """
Let's start executing given task, using allowed steps described earlier.
Briefly describe required steps.
After that think what to do next.
"""

# Minimal nudge used between steps to elicit the next action.
TODO_NEXT = """
Let's think what to do next.
"""

# Reflection prompt applied after a subagent finishes one plan step:
# compare the result with the expected outcome, or analyze the failure.
REFLECT_PLAN_STEP_RESULT = """
Reflect on the results of executing the subtask solving the plan step:
- If the subtask was successfully completed, compare the result with the expected outcome. If the expected outcome was not achieved, reflect on the reasons for the discrepancy.
- If the subtask was failed, reflect on the reasons for the failure. Think out loud how should we change the plan to be able to complete the task anyway.
"""

# Reflection prompt applied to the plan as a whole, once all (or some)
# steps have been attempted.
REFLECT_PLAN_STATUS = """
Reflect on the current state of the plan execution:
- If all the steps are completed, reflect on the overall success of the plan.
- If some steps are failed, reflect on the reasons for the failure. Think out loud how should we change the plan to be able to complete the task anyway.

"""

# Progress summary template; {total_steps}, {completed_steps} and
# {remaining_steps} are str.format placeholders filled by the caller.
PLAN_STATUS = """
Current plan status:
Total steps: {total_steps}
Completed steps: {completed_steps}
Remaining steps: {remaining_steps}
"""

# Replanning prompt used after a failed attempt: {plan} is the previous
# plan text and {failure} its failure description. Explicitly requires the
# new plan to differ from the old one, in the same bullet-point format as
# PLAN_V2.
REPLAN = """Our previous attempt to solve task failed.
Our previous plan:
{plan}
Description of the failure:
{failure}


Please reason step by step and produce short draft of the new plan to solve this task. It should be different from the previous one.
Produce detailed bullet-point plan for addressing the original request. For each step of the plan, provide the following:
- detailed description of things to do
- list of tools and expected outcomes like concrete artifacts at the end of the step: facts, files, documents, or data to be produced
- prerequisites, a list of the results of the previous steps, or known facts needed to start working on this step.
"""

# Final-answer extraction prompt; {task} is a str.format placeholder holding
# the original question. The strict "FINAL ANSWER: ..." template and the
# formatting rules match the GAIA benchmark's exact-match scoring
# requirements (numbers as digits, no units/articles, comma-separated lists).
FINAL_ANSWER = """
Read the above messages and output a FINAL ANSWER to the question. The question is repeated here for convenience:

{task}

To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.
If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
If you are unable to determine the final answer, output empty result.
"""

# Reflection prompt applied after each new observation: did it match the
# expected outcome, was it sufficient/conclusive, and what went wrong if it
# contained an error. Sent verbatim (no placeholders).
# Fix: the last line read "If the was some error", a garbled sentence that
# could confuse the model; corrected to "If there was some error".
REFLECT_OBSERVATION = """
Reflect on the results of the recent observation.
Check if the expected outcome was achieved.
Check if the observation was sufficient or not.
Check if the observation was inconclusive.
If there was some error, reflect on the reasons for the error and its content.
"""

# Fact-sheet refresh prompt, used when progress stalls. str.format
# placeholders: {task} is the original task text; {given}, {lookup},
# {derive} and {guesses} are the current contents of the four fact
# categories produced by the facts pre-survey (see FACTS_SURVEY_V2_GUIDANCE
# for the matching headings).
FACTS_SURVEY_UPDATE = """
As a reminder, we are working to solve the following task:

{task}

It's clear we aren't making as much progress as we would like, but we may have learned something new. Please rewrite the following fact sheet, updating it to include anything new we have learned that may be helpful. Example edits can include (but are not limited to) adding new guesses, moving educated guesses to verified facts if appropriate, etc. Updates may be made to any section of the fact sheet, and more than one section of the fact sheet can be edited. This is an especially good time to update educated guesses, so please at least add or update one educated guess or hunch, and explain your reasoning.

Here is the old fact sheet:

1. GIVEN OR VERIFIED FACTS
{given}
2. FACTS TO LOOK UP
{lookup}
3. FACTS TO DERIVE
{derive}
4. EDUCATED GUESSES
{guesses}

Respond with updated facts sheet.
"""


class PromptRegistry:
system_prompt = SYSTEM_PROMPT
Expand All @@ -69,3 +222,21 @@ class PromptRegistry:
is_subtask_finished = IS_SUBTASK_FINISHED
think_after_observation = THINK_AFTER_OBSERVATION
think_after_calculation = THINK_AFTER_CALCULATION

facts_survey_v2_system = FACTS_SURVEY_V2_SYSTEM
facts_survey_v2 = FACTS_SURVEY_V2_GUIDANCE
allowed_steps_v2 = ALLOWED_STEPS_V2
plan_v2 = PLAN_V2
formalize_system_prompt = FORMALIZE_SYSTEM_PROMPT
formalize_guidance = FORMALIZE_GUIDANCE
formalize_input = FORMALIZE_INPUT
formalize_format = FORMALIZE_FORMAT
start_execution_v2 = START_EXECUTION_V2
todo_next = TODO_NEXT
reflect_plan_step_result = REFLECT_PLAN_STEP_RESULT
reflect_plan_status = REFLECT_PLAN_STATUS
plan_status = PLAN_STATUS
replan = REPLAN
final_answer = FINAL_ANSWER
reflect_observation = REFLECT_OBSERVATION
facts_survey_update = FACTS_SURVEY_UPDATE
52 changes: 52 additions & 0 deletions examples/gaia_agent/scripts/debug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import os
from pprint import pprint

from tapeagents.io import save_json_tape
from tapeagents.llms import LiteLLM
from tapeagents.observe import retrieve_llm_call
from tapeagents.orchestrator import main_loop

from ..environment import GaiaEnvironment
from ..eval import load_dataset, task_to_question_step
from ..tape import GaiaTape
from ..v2 import GaiaPlanner


def main():
    """Interactively debug the v2 Gaia planner on one validation task.

    Drives the planner/environment main loop step by step, printing every
    LLM prompt and the produced step, and waiting for Enter between steps.
    The accumulated tape is saved to disk even if the loop raises.
    """
    dataset = load_dataset("../gaia/dataset/validation/")
    exp_dir = "../gaia/runs/v2_debug/"
    tapes_dir = f"{exp_dir}/tapes"
    tape_name = "debug"
    # Route LLM call records to a per-experiment sqlite file.
    os.environ["TAPEAGENTS_SQLITE_DB"] = os.path.join(exp_dir, "tapedata.sqlite")

    task = dataset[2][0]  # first task of level-2 split
    pprint(task, width=140)

    llm = LiteLLM(model_name="gpt-4o-mini-2024-07-18", context_size=128000, parameters={"temperature": 0.2})
    environment = GaiaEnvironment(vision_lm=llm, safe_calculator=False)
    planner = GaiaPlanner.create(llm)
    tape = GaiaTape(steps=[task_to_question_step(task, environment)])

    # Keep a handle on the metadata object so it can be reattached before
    # saving, regardless of how the loop below rebinds `tape`.
    metadata = tape.metadata
    metadata.task = task
    metadata.level = 2
    try:
        for event in main_loop(planner, tape, environment, max_loops=30):
            if not (event.agent_event and event.agent_event.step):
                continue
            step = event.agent_event.step
            tape = tape.append(step)
            llm_call = retrieve_llm_call(step.metadata.prompt_id)
            assert llm_call
            print("PROMPT:")
            for i, m in enumerate(llm_call.prompt.messages):
                print(f"M{i+1}")
                pprint(m, width=140)
            print(f"{len(tape)} STEP: {step.llm_view()}")
            print("=" * 80)
            input("Press Enter to continue...")
    finally:
        tape.metadata = metadata
        save_json_tape(tape, tapes_dir, tape_name)
        print(f"Saved tape to {tapes_dir}/{tape_name}.json")


# Script entry point: run the interactive debug session when executed directly.
if __name__ == "__main__":
    main()
Loading
Loading