diff --git a/src/agentlab/benchmarks/abstract_env.py b/src/agentlab/benchmarks/abstract_env.py
new file mode 100644
index 00000000..0529a128
--- /dev/null
+++ b/src/agentlab/benchmarks/abstract_env.py
@@ -0,0 +1,59 @@
+import gym
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class AbstractEnvArgs(ABC):
+    """Easily serializable class to store the arguments of an environment."""
+
+    @abstractmethod
+    def make_env(self, action_mapping, exp_dir, exp_task_kwargs) -> "AbstractEnv":
+        """Create an instance of the environment with the arguments stored in this object.
+
+        Args:
+            action_mapping (dict[str, str]): mapping from the agent's action space to the
+                environment's action space; see AbstractActionSet.to_python_code from
+                BrowserGym for an example.
+            exp_dir (str): directory where the experiment is stored.
+            exp_task_kwargs (dict[str, Any]): additional arguments for the environment.
+
+        Returns:
+            env (AbstractEnv): instance of the environment.
+        """
+
+
+class AbstractEnv(gym.Env, ABC):
+
+    @abstractmethod
+    def reset(self, seed: int = None) -> tuple[dict[str, Any], dict[str, Any]]:
+        """Reset the environment to its initial state, ready for an agent to start a new episode.
+
+        Args:
+            seed (int): seed for the environment's random number generator. Some tasks may be
+                deterministic and not require a seed.
+
+        Returns:
+            obs (dict[str, Any]): dictionary containing the observations.
+            env_info (dict[str, Any]): additional information about the environment (see step's docstring).
+        """
+
+    @abstractmethod
+    def step(self, action: str):
+        """Execute the action in the environment and return the next observations.
+
+        Args:
+            action (str): action to be executed in the environment, as a string.
+
+        Returns:
+            obs (dict[str, Any]): dictionary containing the observations.
+            reward (float): reward obtained after executing the action.
+            terminated (bool): whether the episode is terminated, i.e. the MDP reached a terminal state.
+            truncated (bool): whether the episode was truncated due to external reasons.
+            env_info (dict[str, Any]): additional information about the environment.
+            task_info (str): potential debugging information about the task, not intended for the agent.
+            action_exec_start (float): time when the action execution started.
+            action_exec_stop (float): time when the action execution ended.
+            action_exec_timeout (float): TODO I don't remember exactly what this is
+        """
+
+    @abstractmethod
+    def close(self):
+        """Close any resources used by the environment."""
diff --git a/src/agentlab/benchmarks/tau_bench.py b/src/agentlab/benchmarks/tau_bench.py
new file mode 100644
index 00000000..41ad55f1
--- /dev/null
+++ b/src/agentlab/benchmarks/tau_bench.py
@@ -0,0 +1,82 @@
+from dataclasses import dataclass
+
+import bgym
+
+from agentlab.benchmarks.abstract_env import AbstractEnv, AbstractEnvArgs
+
+
+@dataclass
+class TauBenchEnvArgs(AbstractEnvArgs):
+    """All arguments parameterizing a task in tau-bench."""
+
+    task_name: str
+    task_seed: int  # are there any seeds, or are tau-bench tasks deterministic?
+
+    def make_env(self, action_mapping, exp_dir, exp_task_kwargs) -> "AbstractEnv":
+        # TODO: look at how bgym does it; you need to register tasks and call gym.make(task_name).
+        pass
+
+
+class TauBenchEnv(AbstractEnv):
+    def reset(self, seed=None):
+        pass
+
+    def step(self, action: str):
+        pass
+
+    def close(self):
+        pass
+
+
+@dataclass
+class TauBenchActionSetArgs:
+    """Holds hyperparameters for the TauBenchActionSet."""
+
+    def make_action_set(self):
+        return TauBenchActionSet()
+
+
+class TauBenchActionSet(bgym.AbstractActionSet):
+    # TODO: get inspiration from bgym's HighLevelActionSet, perhaps reusing code there, TBD
+
+    def describe(self, with_long_description: bool = True, with_examples: bool = True) -> str:
+        # TODO: implement this method
+        pass
+
+    def example_action(self, abstract: bool) -> str:
+        # TODO: implement this method
+        pass
+
+    def to_python_code(self, action) -> str:
+        # TODO: implement this method
+        pass
+
+
+def _make_env_args_list():
+    # TODO: generate all env_args for the benchmark; get inspiration from bgym's
+    # task_list_from_metadata and make_env_args_list_from_repeat_tasks.
+    return [TauBenchEnvArgs(task_name="placeholder-task", task_seed=0)]  # placeholder values
+
+
+def _task_metadata():
+    # Load a dataframe containing the configuration of all tasks.
+    pass
+
+
+def make_tau_benchmark():
+    return bgym.Benchmark(
+        name="tau-bench",
+        high_level_action_set_args=TauBenchActionSetArgs(),
+        is_multi_tab=False,
+        supports_parallel_seeds=True,
+        backends=[
+            "taubench"
+        ],  # TODO: this backend is not implemented yet, and bgym's backend setup (match/case) needs to be revised.
+        env_args_list=_make_env_args_list(),  # TODO adapt
+        task_metadata=_task_metadata(),  # TODO adapt
+    )
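
Note (not part of the diff): a minimal sketch of how a concrete environment could satisfy the AbstractEnv contract, useful for sanity-checking the reset/step signatures. EchoEnv, its observation keys, and the choice to pack task_info and the timing fields into env_info are hypothetical illustrations, not part of tau-bench or bgym.

import time

from agentlab.benchmarks.abstract_env import AbstractEnv


class EchoEnv(AbstractEnv):
    """Hypothetical toy environment that echoes the agent's action back as the observation."""

    def reset(self, seed=None):
        obs = {"message": "hello"}  # observation dict handed to the agent
        env_info = {"seed": seed}   # additional information about the environment
        return obs, env_info

    def step(self, action: str):
        action_exec_start = time.time()
        obs = {"message": action}      # echo the action back as the next observation
        terminated = action == "stop"  # the MDP reached a terminal state
        truncated = False              # no external truncation (e.g. step limit) in this toy task
        reward = 1.0 if terminated else 0.0
        action_exec_stop = time.time()
        env_info = {
            "task_info": "toy echo task",            # debugging info, not intended for the agent
            "action_exec_start": action_exec_start,  # time when action execution started
            "action_exec_stop": action_exec_stop,    # time when action execution ended
        }
        return obs, reward, terminated, truncated, env_info

    def close(self):
        pass  # no resources to release in this toy example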
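
Similarly, a sketch (under stated assumptions) of how _make_env_args_list could expand task metadata into one TauBenchEnvArgs per (task, seed) pair, in the spirit of bgym's make_env_args_list_from_repeat_tasks; the "task_name" column and the n_seeds default are assumptions, and _task_metadata() would be the function that returns this DataFrame.

import pandas as pd

from agentlab.benchmarks.tau_bench import TauBenchEnvArgs


def make_env_args_list_sketch(metadata: pd.DataFrame, n_seeds: int = 3) -> list[TauBenchEnvArgs]:
    """Expand task metadata into one TauBenchEnvArgs per (task, seed) pair."""
    env_args_list = []
    for task_name in metadata["task_name"]:  # column name is an assumption
        for seed in range(n_seeds):
            env_args_list.append(TauBenchEnvArgs(task_name=task_name, task_seed=seed))
    return env_args_list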