diff --git a/src/agentlab/analyze/error_analysis/base_idea.py b/src/agentlab/analyze/error_analysis/base_idea.py
new file mode 100644
index 00000000..5d4827d4
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/base_idea.py
@@ -0,0 +1,287 @@
+from dataclasses import dataclass
+
+from bgym import ExpResult, StepInfo
+
+CHANGE_SUMMARIZER_PROMPT = """
+You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website,
+you will receive the following pieces of information:
+
+1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'").
+2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries.
+3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet).
+4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'").
+5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available.
+
+YOUR TASK (each step):
+A) SUMMARIZE THE CHANGE
+   - Describe what visibly changed between the previous observation (or diff) and the current observation.
+     For example, did a new panel open, did the form reset, did nothing happen, etc.?
+
+B) ASSESS THE ACTION
+   - Decide whether the agent's action seems helpful or correct given the user's main goal,
+     or if it appears incorrect/unhelpful.
+   - Briefly explain why.
+
+OUTPUT FORMAT (per step):
+Return your analysis as a JSON-like structure, for example:
+
+{{
+    "changeSummary": "A new search results panel appeared on the right side.",
+    "actionAssessment": "Correct",
+    "explanation": "Clicking 'Search' was appropriate to display the results."
+}}
+
+Or for an incorrect action:
+
+{{
+    "changeSummary": "The page reloaded but the date fields were reset to defaults.",
+    "actionAssessment": "Incorrect",
+    "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.",
+    "suggestion": "Correct the date format or check for error messages."
+}}
+
+Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+
+Goal: {goal}
+
+LLM Plan: {plan}
+
+Previous Observation: {past_observation}
+
+Current Observation: {current_observation}
+
+Past summaries: {past_summaries}
+
+Action: {action}
+"""
+
+ERROR_CLASSIFICATION_PROMPT = """
+You are an expert evaluator that classifies web agent failures according to a predefined taxonomy.
+Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors),
+followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.),
+a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
+
+--------------------------------------------------------------------------------
+TAXONOMY DEFINITIONS
+--------------------------------------------------------------------------------
+
+1. AGENT ERRORS
+These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation.
+
+   - Navigation & Planning Errors
+     The agent cannot construct or execute a correct sequence of actions to reach its goal
+     (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
+
+   - Interaction Execution Errors
+     The agent enters data in the wrong format, forgets to click "Submit" after typing,
+     repeats the same failing action without adaptation, or loses track of the changing webpage state.
+
+   - Information Processing Errors
+     The agent misreads or misinterprets visible data (e.g., extracting the wrong field values),
+     misconstrues relationships between pieces of information, or fails to validate data against task requirements.
+
+   - Observation & Action Errors
+     The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
+     or misaligns its actions (clicks the wrong element or stale link).
+
+2. LANGUAGE MODEL ERRORS
+These errors result from the model's inability to correctly interpret or reason about the task at a higher level,
+independent of the low-level web interactions.
+
+   - Task Understanding Errors
+     The agent misreads or misunderstands the user's objective (goal interpretation),
+     loses crucial context (context loss), or performs actions beyond or short of the intended scope.
+
+   - Reasoning Failures
+     The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps,
+     or fails to prioritize important subtasks when handling complex goals.
+
+3. BENCHMARK & ENVIRONMENT ERRORS
+These errors are external to the agent's logic and the language model's reasoning,
+arising from flaws in the system, network, or evaluation framework itself.
+
+   - System Errors
+     Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts).
+
+   - Benchmark Design Errors
+     Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures),
+     or inflexible evaluation systems that fail to account for valid alternative solutions.
+
+--------------------------------------------------------------------------------
+INPUT DESCRIPTION
+--------------------------------------------------------------------------------
+
+You will receive the following for each scenario:
+1. User Goal
+   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
+
+2. Planning / Thought History
+   - The internal reasoning or plan the agent considered. May include branches of logic or key decision points.
+
+3. Current Observation (HTML / AX Tree Snippet)
+   - The webpage structure or state that the agent sees at a given point in time.
+
+4. Historical change summaries
+   - A list of summaries of changes in the observation that the agent has seen during the course of actions.
+
+5. Action History
+   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.)
+     along with immediate outcomes or errors.
+
+Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
+
+--------------------------------------------------------------------------------
+FEW-SHOT CLASSIFICATION EXAMPLES
+--------------------------------------------------------------------------------
+
+1) EXAMPLE A (Benchmark Error - Benchmark Design Error)
+   • Context: The agent correctly finds a cheaper product meeting the user's criteria,
+     but the benchmark expects a more expensive product and marks the solution as wrong.
+   • Classification: ["Benchmark Design Error"]
+   • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid
+     and does not allow an alternative correct solution.
+
+2) EXAMPLE B (Agent Error - Interaction Execution)
+   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format.
+     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
+   • Classification: ["Agent Error - Interaction Execution"]
+   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action
+     without adaptation ("Action Repetition").
+
+3) EXAMPLE C (Benchmark Error - Benchmark Design Error)
+   • Context: The user asks, "Where is the nearest In-N-Out to Upitts?"
+     The query is ambiguous because "Upitts" is not a standard location.
+     The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region.
+   • Classification: ["Benchmark Design Error"]
+   • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic),
+     leading the agent astray due to unclear context.
+
+4) EXAMPLE D (Language Model Error - Task Understanding)
+   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted'
+     that are older than 30 days and add a comment saying 'I can help fix this.'"
+     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue
+     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
+   • Classification: ["Language Model Error - Task Understanding"]
+   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues,
+     it focused on creating a new issue. This is a misinterpretation of the instructions,
+     not a mechanical error in clicking or input format.
+
+--------------------------------------------------------------------------------
+CLASSIFICATION TASK
+--------------------------------------------------------------------------------
+
+1. Read through:
+   - The planning and thought history
+   - The action history
+   - The current HTML or AX Tree observation
+   - The user goal
+
+2. Decide if the failure is:
+   - An Agent Error (which subcategory/subcategories),
+   - A Language Model Error (which subcategory/subcategories),
+   - A Benchmark/Environment Error (which subcategory/subcategories),
+   - Or a combination thereof (multi-label if needed).
+
+3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
+
+4. If the agent succeeds (no error), label the errorCategory accordingly as "Success".
+
+Output Format Example:
+{{
+    "errorCategory": ["Agent Error - Navigation & Planning"],
+    "explanation": "The agent opened the wrong GitLab page and never recovered..."
+}}
+
+Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+
+Overall goal: {goal}
+
+LLM Plan and thought history: {plan}
+
+Current Observation: {current_observation}
+
+Historical change summaries: {historical_summaries}
+
+Action history: {action_history}
+"""
+
+
+def _diff(past_obs, current_obs):
+    """TODO: Implement the diff function.
+
+    Returns a diff version of current_obs compared to past_obs, unless there are too many changes.
+ """ + raise ValueError("Not implemented yet.") + + +@dataclass +class ChangeSummarizer: + + llm: callable # language model + obs_formatter: callable + use_diff: bool = False + + def summarize( + self, past_obs: dict, action: str, current_obs: dict, past_summaries: list[str] + ) -> str: + """Produces, a summary of the effect of an action.""" + past_obs_message = self.obs_formatter(past_obs) + current_obs_message = self.obs_formatter(current_obs) + + goal = past_obs["goal"] # Use goal object from agentlab + # Outsource everything to formatter + plan = past_obs["plan"] + if self.use_diff: + current_obs_message = _diff(past_obs_message, current_obs_message) + + return self.llm( + self.make_prompt( + past_obs_message, action, current_obs_message, past_summaries, goal, plan + ) + ) + + def make_prompt( + self, past_obs_message, action, current_obs_message, past_summaries, goal, plan + ): + """TODO: Implement the prompt.""" + return CHANGE_SUMMARIZER_PROMPT.format( + goal=goal, + plan=plan, + past_observation=past_obs_message, + current_observation=current_obs_message, + past_summaries=past_summaries, + action=action, + ) + + +@dataclass +class EpisodeAnalysis: + analysis: str # complete analysis of the episode + summary: str # short summary of the analysis + categories: dict[str, float] # score for each category e.g. type of error or difficulty levels + + +@dataclass +class EpisodeSummarizer: + + change_summarizer: ChangeSummarizer = None + + def summarize(exp_results: list[ExpResult], change_summaries: list[str]) -> EpisodeAnalysis: + """Run Change Summarizer for every step in the episode or extract a pre-computed one.""" + pass + + +@dataclass +class EpisodeErrorSummarizer(EpisodeSummarizer): + + change_summarizer: ChangeSummarizer = None + + def make_prompt(self, current_observation, action_history, historical_summaries, goal, plan): + """TODO: Implement the prompt.""" + return ERROR_CLASSIFICATION_PROMPT.format( + goal=goal, + plan=plan, + current_observation=current_observation, + historical_summaries=historical_summaries, + action_history=action_history, + ) diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py index 4a961b76..305d00b4 100644 --- a/src/agentlab/analyze/error_analysis/pipeline.py +++ b/src/agentlab/analyze/error_analysis/pipeline.py @@ -23,7 +23,6 @@ def __call__(self, *args, **kwds): class ErrorAnalysisPipeline: exp_dir: Path filter: str = None - step_summarizer: ChangeSummarizer = None episode_summarizer: EpisodeSummarizer = None analyzer: Analyzer = None @@ -38,26 +37,10 @@ def run_analysis(self): filtered_results = self.filter_exp_results() for exp_result in filtered_results: - step_analysis = self.analyze_step(exp_result) - episode_analysis = self.analyze_episode(exp_result, step_analysis) - error_analysis = self.analyze_errors(exp_result, episode_analysis, step_analysis) + episode_summary = self.episode_summarizer(exp_result) + error_analysis = self.analyze_errors(exp_result, episode_summary) self.save_analysis(exp_result, error_analysis) - def analyze_step(self, exp_result: ExpResult) -> list[str]: - step_summaries = [] # type: list[str] - # this assumes that there is always an extra step at the end of the episode - # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info - # TODO:(thibault) make some checks - for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]): - step_summaries.append( - self.step_summarizer.summarize(step, 
step.action, next_step, step_summaries) - ) - return step_summaries - - def analyze_episode(self, exp_result: ExpResult, step_analysis: list[str]) -> str: - episode_summary = self.episode_summarizer.summarize(exp_result, step_analysis) - return episode_summary - def analyze_errors( self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str] ) -> str: @@ -82,10 +65,20 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T args = parser.parse_args() exp_dir = Path(args.exp_dir) + filter = args.filter + + import openai + + from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT + + llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"].make_model() + + step_summarizer = ChangeSummarizer(llm, lambda x: x) + episode_summarizer = EpisodeSummarizer() pipeline = ErrorAnalysisPipeline( exp_dir=exp_dir, - filter=None, + filter=filter, episode_summarizer=EpisodeSummarizer(), step_summarizer=ChangeSummarizer(), analyzer=Analyzer("prompt"), diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py index b3760216..7c5f9b03 100644 --- a/src/agentlab/analyze/error_analysis/summarizer.py +++ b/src/agentlab/analyze/error_analysis/summarizer.py @@ -1,209 +1,12 @@ from dataclasses import dataclass -from bgym import StepInfo +from bgym import ExpResult, StepInfo -CHANGE_SUMMARIZER_PROMPT = """ -You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, -you will receive the following pieces of information: - -1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'"). -2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries. -3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet). -4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'"). -5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available. - -YOUR TASK (each step): -A) SUMMARIZE THE CHANGE - - Describe what visibly changed between the previous observation (or diff) and the current observation. - For example, did a new panel open, did the form reset, did nothing happen, etc.? - -B) ASSESS THE ACTION - - Decide whether the agent's action seems helpful or correct given the user's main goal, - or if it appears incorrect/unhelpful. - - Briefly explain why. - -OUTPUT FORMAT (per step): -Return your analysis as a JSON-like structure, for example: - -{ - "changeSummary": "A new search results panel appeared on the right side.", - "actionAssessment": "Correct", - "explanation": "Clicking 'Search' was appropriate to display the results." -} - -Or for an incorrect action: - -{ - "changeSummary": "The page reloaded but the date fields were reset to defaults.", - "actionAssessment": "Incorrect", - "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.", - "suggestion": "Correct the date format or check for error messages." -} - -Please follow this structure at every step. Keep your responses concise and clear. Below are the details. - -Goal: {goal} - -LLM Plan: {plan} - -Previous Observation: {past_observation} - -Current Observation: {current_observation} - -Past summaries: {past_summaries} - -Action: {action} -""" - -ERROR_CLASSIFICATION_PROMPT = """ -You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
-Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors),
-followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.),
-a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
-
---------------------------------------------------------------------------------
-TAXONOMY DEFINITIONS
---------------------------------------------------------------------------------
-
-1. AGENT ERRORS
-These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation.
-
-   - Navigation & Planning Errors
-     The agent cannot construct or execute a correct sequence of actions to reach its goal
-     (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
-
-   - Interaction Execution Errors
-     The agent enters data in the wrong format, forgets to click "Submit" after typing,
-     repeats the same failing action without adaptation, or loses track of the changing webpage state.
-
-   - Information Processing Errors
-     The agent misreads or misinterprets visible data (e.g., extracting the wrong field values),
-     misconstrues relationships between pieces of information, or fails to validate data against task requirements.
-
-   - Observation & Action Errors
-     The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
-     or misaligns its actions (clicks the wrong element or stale link).
-
-2. LANGUAGE MODEL ERRORS
-These errors result from the model's inability to correctly interpret or reason about the task at a higher level,
-independent of the low-level web interactions.
-
-   - Task Understanding Errors
-     The agent misreads or misunderstands the user's objective (goal interpretation),
-     loses crucial context (context loss), or performs actions beyond or short of the intended scope.
-
-   - Reasoning Failures
-     The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps,
-     or fails to prioritize important subtasks when handling complex goals.
-
-3. BENCHMARK & ENVIRONMENT ERRORS
-These errors are external to the agent's logic and the language model's reasoning,
-arising from flaws in the system, network, or evaluation framework itself.
-
-   - System Errors
-     Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts).
-
-   - Benchmark Design Errors
-     Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures),
-     or inflexible evaluation systems that fail to account for valid alternative solutions.
-
---------------------------------------------------------------------------------
-INPUT DESCRIPTION
---------------------------------------------------------------------------------
-
-You will receive the following for each scenario:
-1. User Goal
-   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
-
-2. Planning / Thought History
-   - The internal reasoning or plan the agent considered. May include branches of logic or key decision points.
-
-3. Current Observation (HTML / AX Tree Snippet)
-   - The webpage structure or state that the agent sees at a given point in time.
-
-4. Historical change summaries
-   - A list of summaries of changes in the observation that the agent has seen during the course of actions.
-
-5. Action History
-   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.)
-     along with immediate outcomes or errors.
-
-Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
-
---------------------------------------------------------------------------------
-FEW-SHOT CLASSIFICATION EXAMPLES
---------------------------------------------------------------------------------
-
-1) EXAMPLE A (Benchmark Error - Benchmark Design Error)
-   • Context: The agent correctly finds a cheaper product meeting the user's criteria,
-     but the benchmark expects a more expensive product and marks the solution as wrong.
-   • Classification: ["Benchmark Design Error"]
-   • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid
-     and does not allow an alternative correct solution.
-
-2) EXAMPLE B (Agent Error - Interaction Execution)
-   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format.
-     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
-   • Classification: ["Agent Error - Interaction Execution"]
-   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action
-     without adaptation ("Action Repetition").
-
-3) EXAMPLE C (Benchmark Error - Benchmark Design Error)
-   • Context: The user asks, "Where is the nearest In-N-Out to Upitts?"
-     The query is ambiguous because "Upitts" is not a standard location.
-     The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region.
-   • Classification: ["Benchmark Design Error"]
-   • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic),
-     leading the agent astray due to unclear context.
-
-4) EXAMPLE D (Language Model Error - Task Understanding)
-   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted'
-     that are older than 30 days and add a comment saying 'I can help fix this.'"
-     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue
-     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
-   • Classification: ["Language Model Error - Task Understanding"]
-   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues,
-     it focused on creating a new issue. This is a misinterpretation of the instructions,
-     not a mechanical error in clicking or input format.
-
---------------------------------------------------------------------------------
-CLASSIFICATION TASK
---------------------------------------------------------------------------------
-
-1. Read through:
-   - The planning and thought history
-   - The action history
-   - The current HTML or AX Tree observation
-   - The user goal
-
-2. Decide if the failure is:
-   - An Agent Error (which subcategory/subcategories),
-   - A Language Model Error (which subcategory/subcategories),
-   - A Benchmark/Environment Error (which subcategory/subcategories),
-   - Or a combination thereof (multi-label if needed).
-
-3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
-
-4. If the agent succeeds (no error), label the errorCategory accordingly as "Success".
-
-Output Format Example:
-{
-    "errorCategory": ["Agent Error - Navigation & Planning"],
-    "explanation": "The agent opened the wrong GitLab page and never recovered..."
-}
-
-Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
-
-Overall goal: {goal}
-
-LLM Plan and thought history: {plan}
-
-Current Observation: {current_observation}
-
-Historical change summaries: {historical_summaries}
-
-Action history: {action_history}
-"""
+
+from agentlab.analyze.error_analysis.summarizer_prompts import (
+    CHANGE_SUMMARIZER_PROMPT,
+    ERROR_CLASSIFICATION_PROMPT,
+)
+from agentlab.analyze.inspect_results import summarize
 
 
 def _diff(past_obs, current_obs):
@@ -218,25 +21,31 @@ class ChangeSummarizer:
 
     llm: callable  # language model
-    obs_formatter: callable
+    obs_formatter: callable = lambda x: x.get("axtree_txt", "No AXTREE available")
     use_diff: bool = False
 
-    def summarize(
-        self, past_obs: dict, action: str, current_obs: dict, past_summaries: list[str]
-    ) -> str:
+    def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]) -> str:
         """Produces a summary of the effect of an action."""
-        past_obs_message = self.obs_formatter(past_obs)
-        current_obs_message = self.obs_formatter(current_obs)
+        obs_message = self.obs_formatter(obs.obs)
+        next_obs_message = self.obs_formatter(next_obs.obs)
 
-        goal = past_obs["goal"]  # Use goal object from agentlab
+        action = obs.action
+
+        goal = obs.obs["goal"]  # Use goal object from agentlab
+        # TODO(thibault): switch to 'goal_object'
         # Outsource everything to formatter
-        plan = past_obs["plan"]
+
         if self.use_diff:
-            current_obs_message = _diff(past_obs_message, current_obs_message)
+            next_obs_message = _diff(obs_message, next_obs_message)
 
         return self.llm(
             self.make_prompt(
-                past_obs_message, action, current_obs_message, past_summaries, goal, plan
+                obs_message,
+                action,
+                next_obs_message,
+                past_summaries,
+                goal,
+                obs.obs.get("plan", "No plan available"),
             )
         )
 
@@ -266,9 +75,20 @@ class EpisodeSummarizer:
 
     change_summarizer: ChangeSummarizer = None
 
-    def summarize(episode: list[StepInfo]) -> EpisodeAnalysis:
+    def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
+
+    def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
         """Run Change Summarizer for every step in the episode or extract a pre-computed one."""
-        pass
+        summaries = self.make_change_summaries(exp_results)
+        # TODO: aggregate the step summaries into an EpisodeAnalysis; return them as-is for now
+        return summaries
+
+    def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
+        summaries = []  # type: list[str]
+        # this assumes that there is always an extra step at the end of the episode
+        # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
+        # TODO(thibault): add sanity checks
+        for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
+            summaries.append(self.change_summarizer.summarize(step, next_step, summaries))
+        return summaries
 
 
 @dataclass
diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
new file mode 100644
index 00000000..382c2805
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
@@ -0,0 +1,202 @@
+CHANGE_SUMMARIZER_PROMPT = """
+You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website,
+you will receive the following pieces of information:
+
+1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'").
+2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries.
+3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet).
+4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'").
+5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available.
+
+YOUR TASK (each step):
+A) SUMMARIZE THE CHANGE
+   - Describe what visibly changed between the previous observation (or diff) and the current observation.
+     For example, did a new panel open, did the form reset, did nothing happen, etc.?
+
+B) ASSESS THE ACTION
+   - Decide whether the agent's action seems helpful or correct given the user's main goal,
+     or if it appears incorrect/unhelpful.
+   - Briefly explain why.
+
+OUTPUT FORMAT (per step):
+Return your analysis as a JSON-like structure, for example:
+
+{{
+    "changeSummary": "A new search results panel appeared on the right side.",
+    "actionAssessment": "Correct",
+    "explanation": "Clicking 'Search' was appropriate to display the results."
+}}
+
+Or for an incorrect action:
+
+{{
+    "changeSummary": "The page reloaded but the date fields were reset to defaults.",
+    "actionAssessment": "Incorrect",
+    "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.",
+    "suggestion": "Correct the date format or check for error messages."
+}}
+
+Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+
+Goal: {goal}
+
+LLM Plan: {plan}
+
+Previous Observation: {past_observation}
+
+Current Observation: {current_observation}
+
+Past summaries: {past_summaries}
+
+Action: {action}
+"""
+
+ERROR_CLASSIFICATION_PROMPT = """
+You are an expert evaluator that classifies web agent failures according to a predefined taxonomy.
+Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors),
+followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.),
+a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
+
+--------------------------------------------------------------------------------
+TAXONOMY DEFINITIONS
+--------------------------------------------------------------------------------
+
+1. AGENT ERRORS
+These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation.
+
+   - Navigation & Planning Errors
+     The agent cannot construct or execute a correct sequence of actions to reach its goal
+     (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
+
+   - Interaction Execution Errors
+     The agent enters data in the wrong format, forgets to click "Submit" after typing,
+     repeats the same failing action without adaptation, or loses track of the changing webpage state.
+
+   - Information Processing Errors
+     The agent misreads or misinterprets visible data (e.g., extracting the wrong field values),
+     misconstrues relationships between pieces of information, or fails to validate data against task requirements.
+
+   - Observation & Action Errors
+     The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
+     or misaligns its actions (clicks the wrong element or stale link).
+
+2. LANGUAGE MODEL ERRORS
+These errors result from the model's inability to correctly interpret or reason about the task at a higher level,
+independent of the low-level web interactions.
+
+   - Task Understanding Errors
+     The agent misreads or misunderstands the user's objective (goal interpretation),
+     loses crucial context (context loss), or performs actions beyond or short of the intended scope.
+
+   - Reasoning Failures
+     The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps,
+     or fails to prioritize important subtasks when handling complex goals.
+
+3. BENCHMARK & ENVIRONMENT ERRORS
+These errors are external to the agent's logic and the language model's reasoning,
+arising from flaws in the system, network, or evaluation framework itself.
+
+   - System Errors
+     Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts).
+
+   - Benchmark Design Errors
+     Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures),
+     or inflexible evaluation systems that fail to account for valid alternative solutions.
+
+--------------------------------------------------------------------------------
+INPUT DESCRIPTION
+--------------------------------------------------------------------------------
+
+You will receive the following for each scenario:
+1. User Goal
+   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
+
+2. Planning / Thought History
+   - The internal reasoning or plan the agent considered. May include branches of logic or key decision points.
+
+3. Current Observation (HTML / AX Tree Snippet)
+   - The webpage structure or state that the agent sees at a given point in time.
+
+4. Historical change summaries
+   - A list of summaries of changes in the observation that the agent has seen during the course of actions.
+
+5. Action History
+   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.)
+     along with immediate outcomes or errors.
+
+Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
+
+--------------------------------------------------------------------------------
+FEW-SHOT CLASSIFICATION EXAMPLES
+--------------------------------------------------------------------------------
+
+1) EXAMPLE A (Benchmark Error - Benchmark Design Error)
+   • Context: The agent correctly finds a cheaper product meeting the user's criteria,
+     but the benchmark expects a more expensive product and marks the solution as wrong.
+   • Classification: ["Benchmark Design Error"]
+   • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid
+     and does not allow an alternative correct solution.
+
+2) EXAMPLE B (Agent Error - Interaction Execution)
+   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format.
+     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
+   • Classification: ["Agent Error - Interaction Execution"]
+   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action
+     without adaptation ("Action Repetition").
+
+3) EXAMPLE C (Benchmark Error - Benchmark Design Error)
+   • Context: The user asks, "Where is the nearest In-N-Out to Upitts?"
+     The query is ambiguous because "Upitts" is not a standard location.
+     The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region.
+   • Classification: ["Benchmark Design Error"]
+   • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic),
+     leading the agent astray due to unclear context.
+
+4) EXAMPLE D (Language Model Error - Task Understanding)
+   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted'
+     that are older than 30 days and add a comment saying 'I can help fix this.'"
+     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue
+     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
+   • Classification: ["Language Model Error - Task Understanding"]
+   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues,
+     it focused on creating a new issue. This is a misinterpretation of the instructions,
+     not a mechanical error in clicking or input format.
+
+--------------------------------------------------------------------------------
+CLASSIFICATION TASK
+--------------------------------------------------------------------------------
+
+1. Read through:
+   - The planning and thought history
+   - The action history
+   - The current HTML or AX Tree observation
+   - The user goal
+
+2. Decide if the failure is:
+   - An Agent Error (which subcategory/subcategories),
+   - A Language Model Error (which subcategory/subcategories),
+   - A Benchmark/Environment Error (which subcategory/subcategories),
+   - Or a combination thereof (multi-label if needed).
+
+3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
+
+4. If the agent succeeds (no error), label the errorCategory accordingly as "Success".
+
+Output Format Example:
+{{
+    "errorCategory": ["Agent Error - Navigation & Planning"],
+    "explanation": "The agent opened the wrong GitLab page and never recovered..."
+}}
+
+Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+
+Overall goal: {goal}
+
+LLM Plan and thought history: {plan}
+
+Current Observation: {current_observation}
+
+Historical change summaries: {historical_summaries}
+
+Action history: {action_history}
+"""
diff --git a/tests/analyze/error_analysis/test_summarizer.py b/tests/analyze/error_analysis/test_summarizer.py
new file mode 100644
index 00000000..e9fe0ecc
--- /dev/null
+++ b/tests/analyze/error_analysis/test_summarizer.py
@@ -0,0 +1,22 @@
+from pathlib import Path
+
+import pytest
+from bgym import ExpResult
+
+from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer
+from agentlab.analyze.inspect_results import yield_all_exp_results
+
+
+@pytest.fixture(scope="module")
+def exp_results() -> list[ExpResult]:
+    exp_dir = Path(__file__).parent.parent.parent / "data/error_analysis"
+    return list(yield_all_exp_results(exp_dir))
+
+
+def test_change_summarizer(exp_results: list[ExpResult]):
+    summarizer = ChangeSummarizer(llm=lambda x: x)
+    step = exp_results[0].steps_info[0]
+    next_step = exp_results[0].steps_info[1]
+    past_summaries = []
+    summary = summarizer.summarize(step, next_step, past_summaries)
+    assert isinstance(summary, str)
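Reviewer note on the `_diff` helper: it is still a stub in both base_idea.py and summarizer.py, so `use_diff=True` currently raises. Below is a minimal sketch of one possible implementation using only the standard library's difflib. The `max_change_ratio` cutoff of 0.5 and the fall-back-to-full-observation behavior are illustrative assumptions, not part of this PR:

import difflib


def _diff(past_obs: str, current_obs: str, max_change_ratio: float = 0.5) -> str:
    """Return only the changed lines of current_obs relative to past_obs.

    Assumption: when most lines changed, a diff is no shorter or clearer than
    the observation itself, so the full current observation is returned instead.
    """
    past_lines = past_obs.splitlines()
    current_lines = current_obs.splitlines()
    changed = [
        line
        for line in difflib.unified_diff(past_lines, current_lines, lineterm="")
        # keep added/removed lines only, skipping the '---'/'+++' file headers
        if line.startswith(("+", "-")) and not line.startswith(("+++", "---"))
    ]
    if len(changed) > max_change_ratio * max(len(past_lines) + len(current_lines), 1):
        return current_obs
    return "\n".join(changed)

For example, _diff("a\nb\nc", "a\nb\nd") returns "-c\n+d", while two mostly different AX trees fall back to the raw observation; either way the change-summarizer prompt stays bounded by the size of the current observation.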