From 808453ca4be316f45d303d9c780f45c980c24ba4 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 13 Oct 2023 19:16:00 +0800 Subject: [PATCH 01/78] init v0.1.8 --- README.md | 2 +- README_zh.md | 2 +- openrl/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 741fdef2..3f855903 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ [![Embark](https://img.shields.io/badge/discord-OpenRL-%237289da.svg?logo=discord)](https://discord.gg/guvAS2up) [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://join.slack.com/t/openrlhq/shared_invite/zt-1tqwpvthd-Eeh0IxQ~DIaGqYXoW2IUQg) -OpenRL-v0.1.7 is updated on Sep 21, 2023 +OpenRL-v0.1.8 is updated on Oct 13, 2023 The main branch is the latest version of OpenRL, which is under active development. If you just want to have a try with OpenRL, you can switch to the stable branch. diff --git a/README_zh.md b/README_zh.md index c8fb4619..b6fd07b1 100644 --- a/README_zh.md +++ b/README_zh.md @@ -29,7 +29,7 @@ [![Embark](https://img.shields.io/badge/discord-OpenRL-%237289da.svg?logo=discord)](https://discord.gg/guvAS2up) [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://join.slack.com/t/openrlhq/shared_invite/zt-1tqwpvthd-Eeh0IxQ~DIaGqYXoW2IUQg) -OpenRL-v0.1.7 is updated on Sep 21, 2023 +OpenRL-v0.1.8 is updated on Oct 13, 2023 The main branch is the latest version of OpenRL, which is under active development. If you just want to have a try with OpenRL, you can switch to the stable branch. diff --git a/openrl/__init__.py b/openrl/__init__.py index 00bcaacf..254628c0 100644 --- a/openrl/__init__.py +++ b/openrl/__init__.py @@ -1,5 +1,5 @@ __TITLE__ = "openrl" -__VERSION__ = "v0.1.7" +__VERSION__ = "v0.1.8" __DESCRIPTION__ = "Distributed Deep RL Framework" __AUTHOR__ = "OpenRL Contributors" __EMAIL__ = "huangshiyu@4paradigm.com" From 75bf53153f2b9ff1acc4fcce11b1a1a5d9bae61a Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 17 Oct 2023 14:42:40 +0800 Subject: [PATCH 02/78] update test --- openrl/envs/toy_envs/bit_flipping_env.py | 4 +- openrl/envs/toy_envs/identity_env.py | 4 +- openrl/envs/toy_envs/multi_input_envs.py | 4 +- openrl/utils/type_aliases.py | 4 +- setup.py | 2 + tests/test_env/test_wrappers.py | 47 ++++++++++++++++++++++++ 6 files changed, 53 insertions(+), 12 deletions(-) create mode 100644 tests/test_env/test_wrappers.py diff --git a/openrl/envs/toy_envs/bit_flipping_env.py b/openrl/envs/toy_envs/bit_flipping_env.py index 0d77ebd4..5534ed37 100644 --- a/openrl/envs/toy_envs/bit_flipping_env.py +++ b/openrl/envs/toy_envs/bit_flipping_env.py @@ -5,8 +5,6 @@ from gymnasium import Env, spaces from gymnasium.envs.registration import EnvSpec -from openrl.utils.type_aliases import GymStepReturn - class BitFlippingEnv(Env): """ @@ -175,7 +173,7 @@ def reset( self.state = self.obs_space.sample() return self._get_obs(), {} - def step(self, action: Union[np.ndarray, int]) -> GymStepReturn: + def step(self, action: Union[np.ndarray, int]): if self.continuous: self.state[action > 0] = 1 - self.state[action > 0] else: diff --git a/openrl/envs/toy_envs/identity_env.py b/openrl/envs/toy_envs/identity_env.py index dd653626..21a9bb6a 100644 --- a/openrl/envs/toy_envs/identity_env.py +++ b/openrl/envs/toy_envs/identity_env.py @@ -6,8 +6,6 @@ from gymnasium.envs.registration import EnvSpec from gymnasium.utils import seeding -from openrl.utils.type_aliases import GymStepReturn - T = TypeVar("T", int, np.ndarray) @@ -256,7 +254,7 
@@ def reset(self) -> np.ndarray: self.current_step = 0 return self.observation_space.sample() - def step(self, action: Union[np.ndarray, int]) -> GymStepReturn: + def step(self, action: Union[np.ndarray, int]): reward = 0.0 self.current_step += 1 done = self.current_step >= self.ep_length diff --git a/openrl/envs/toy_envs/multi_input_envs.py b/openrl/envs/toy_envs/multi_input_envs.py index 952a5b04..eccb1f6f 100644 --- a/openrl/envs/toy_envs/multi_input_envs.py +++ b/openrl/envs/toy_envs/multi_input_envs.py @@ -4,8 +4,6 @@ import numpy as np from gymnasium import spaces -from openrl.utils.type_aliases import GymStepReturn - # Not Work Yet class SimpleMultiObsEnv(gym.Env): @@ -124,7 +122,7 @@ def init_possible_transitions(self) -> None: self.right_possible = [0, 1, 2, 12, 13, 14] self.up_possible = [4, 8, 12, 7, 11, 15] - def step(self, action: Union[float, np.ndarray]) -> GymStepReturn: + def step(self, action: Union[float, np.ndarray]): """ Run one timestep of the environment's dynamics. When end of episode is reached, you are responsible for calling `reset()` diff --git a/openrl/utils/type_aliases.py b/openrl/utils/type_aliases.py index 25991e24..d9012d7d 100644 --- a/openrl/utils/type_aliases.py +++ b/openrl/utils/type_aliases.py @@ -13,9 +13,7 @@ GymEnv = Union[gym.Env, vec_env.BaseVecEnv] GymObs = Union[Tuple, Dict[str, Any], np.ndarray, int] -GymStepReturn = Union[ - Tuple[GymObs, float, bool, Dict], Tuple[GymObs, float, bool, bool, Dict] -] + TensorDict = Dict[Union[str, int], th.Tensor] OptimizerStateDict = Dict[str, Any] MaybeCallback = Union[ diff --git a/setup.py b/setup.py index 494e4c4c..fa5d6d31 100644 --- a/setup.py +++ b/setup.py @@ -68,8 +68,10 @@ def get_extra_requires() -> dict: "selfplay": ["ray[default]", "ray[serve]", "pettingzoo[classic]", "trueskill"], "retro": ["gym-retro"], "super_mario": ["gym-super-mario-bros"], + "atari": ["gymnasium[atari]", "gymnasium[accept-rom-license]"], } req["test"].extend(req["selfplay"]) + req["test"].extend(req["atari"]) return req diff --git a/tests/test_env/test_wrappers.py b/tests/test_env/test_wrappers.py new file mode 100644 index 00000000..6042eccf --- /dev/null +++ b/tests/test_env/test_wrappers.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" +import os +import sys + +import pytest + + +@pytest.mark.unittest +def test_atari_wrappers(): + import gymnasium + + from openrl.envs.wrappers.atari_wrappers import ( + ClipRewardEnv, + EpisodicLifeEnv, + FireResetEnv, + NoopResetEnv, + WarpFrame, + ) + + env = gymnasium.make("ALE/Breakout-v5") + env = FireResetEnv(EpisodicLifeEnv(ClipRewardEnv(WarpFrame(NoopResetEnv(env))))) + env.reset(seed=0) + while True: + obs, reward, done, truncated, info = env.step(0) + if done: + break + env.close() + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 7e5b043a53844a84747c2fd7d2312fefdfb3570a Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 17 Oct 2023 14:54:14 +0800 Subject: [PATCH 03/78] update test --- .../test_vec_env/test_vec_wrappers.py | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tests/test_env/test_vec_env/test_vec_wrappers.py diff --git a/tests/test_env/test_vec_env/test_vec_wrappers.py b/tests/test_env/test_vec_env/test_vec_wrappers.py new file mode 100644 index 00000000..b71ce173 --- /dev/null +++ b/tests/test_env/test_vec_env/test_vec_wrappers.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" +import os +import sys + +import numpy as np +import pytest + +from openrl.envs.common import make +from openrl.envs.vec_env.wrappers.zero_reward_wrapper import ZeroRewardWrapper + + +@pytest.mark.unittest +def test_zero_reward_wrapper(): + env = make("IdentityEnv", env_num=1) + env = ZeroRewardWrapper(env) + env.reset(seed=0) + while True: + obs, reward, done, info = env.step(env.random_action()) + assert np.all(reward == 0), "reward should be zero" + if done: + break + env.close() + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 06453deed1f47f67eb01ea3ad58bb79d0cbb9337 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 17 Oct 2023 16:06:27 +0800 Subject: [PATCH 04/78] update test --- openrl/envs/snake/snake_3v3.py | 854 ------------------ tests/test_env/test_snake_env.py | 69 ++ .../test_vec_env/test_vec_wrappers.py | 45 + 3 files changed, 114 insertions(+), 854 deletions(-) delete mode 100644 openrl/envs/snake/snake_3v3.py create mode 100644 tests/test_env/test_snake_env.py diff --git a/openrl/envs/snake/snake_3v3.py b/openrl/envs/snake/snake_3v3.py deleted file mode 100644 index 78d787ef..00000000 --- a/openrl/envs/snake/snake_3v3.py +++ /dev/null @@ -1,854 +0,0 @@ -# -*- coding:utf-8 -*- -# 作者:zruizhi -# 创建时间: 2020/7/30 17:24 下午 -# 描述: -import copy -import itertools -import random -import time -from itertools import count - -import numpy as np -from gym import Env, spaces -from PIL import Image, ImageDraw, ImageFont - -from .common import Board, HiddenPrints, SnakePos # TODO: Snake类的重名问题 -from .discrete import Discrete -from .gridgame import GridGame -from .observation import * - - -class SnakeEatBeans(GridGame, GridObservation, DictObservation): - def __init__(self, all_args, env_id): - self.all_args = all_args - conf = { - "class_literal": "SnakeEatBeans", - "n_player": 6, - "board_width": 20, - "board_height": 10, - "channels": 15, - "cell_range": 8, - "n_beans": 5, - "max_step": 200, - "game_name": "snakes", - "is_obs_continuous": False, - "is_act_continuous": False, - "agent_nums": [3, 3], - "obs_type": ["dict", "dict"], - "save_interval": 100, - "save_path": "../../replay/snake_3v3/replay_{}.gif", - } - self.terminate_flg = False - colors = conf.get("colors", [(255, 255, 255), (255, 140, 0)]) - super(SnakeEatBeans, self).__init__(conf, colors) - # 0: 没有 1:食物 2-n_player+1:各玩家蛇身 - self.n_cell_type = self.n_player + 2 - self.step_cnt = 1 - self.n_beans = int(conf["n_beans"]) - # 方向[-2,2,-1,1]分别表示[上,下,左,右] - self.actions = [-2, 2, -1, 1] - self.actions_name = {-2: "up", 2: "down", -1: "left", 1: "right"} - self.snakes_position = {} - self.players = [] - self.cur_bean_num = 0 - self.beans_position = [] - # 1<= init_len <= 3 - self.init_len = 3 - self.current_state = self.init_state() - self.all_observes = self.get_all_observes() - if self.n_player * self.init_len > self.board_height * self.board_width: - raise Exception( - "玩家数量过多:%d,超出board范围:%d,%d" - % (self.n_player, self.board_width, self.board_height) - ) - - self.input_dimension = self.board_width * self.board_height - self.action_dim = self.get_action_dim() - self.channels = conf["channels"] - - self.num_agents = conf["agent_nums"][0] - self.num_enemys = conf["agent_nums"][1] - - self.observation_space = [ - spaces.Box( - low=-np.inf, - high=-np.inf, - shape=(self.channels, self.board_width, self.board_height), - dtype=np.float32, - ) - ] - self.share_observation_space = [] - self.share_observation_space = [ - spaces.Box( - low=-np.inf, - high=+np.inf, - 
shape=(self.channels, self.board_width, self.board_height), - dtype=np.float32, - ) - ] - self.action_space = [Discrete(4) for _ in range(self.n_player)] - self.save_interval = conf["save_interval"] - self.save_path = conf["save_path"] - self.episode = 0 - self.render = all_args.save_replay - self.img_list = [] - self.env_id = env_id - - def seed(self, seed=None): - if seed is None: - np.random.seed(1) - else: - np.random.seed(seed) - - def check_win(self): - flg = self.won.index(max(self.won)) + 2 - return flg - - def get_grid_observation(self, current_state, player_id, info_before): - return current_state - - def get_dict_observation(self, current_state, player_id, info_before): - key_info = {1: self.beans_position} - for i in range(self.n_player): - snake = self.players[i] - key_info[snake.player_id] = snake.segments - # key_info['state_map'] = current_state - key_info["board_width"] = self.board_width - key_info["board_height"] = self.board_height - key_info["last_direction"] = ( - info_before.get("directions") if isinstance(info_before, dict) else None - ) - key_info["controlled_snake_index"] = player_id - - return key_info - - def set_action_space(self): - action_space = [[Discrete(4)] for _ in range(self.n_player)] - return action_space - - def reset(self): - self.step_cnt = 1 - self.snakes_position = ( - {} - ) # 格式类似于{1: [[3, 1], [4, 3], [1, 2], [0, 6], [3, 3]], 2: [[3, 0], [3, 7], [3, 6]], 3: [[2, 7], [1, 7], [0, 7]]} - self.players = [] - self.cur_bean_num = 0 - self.beans_position = [] - self.current_state = self.init_state() - self.all_observes = self.get_all_observes() - self.terminate_flg = False - self.img_list = [] - self.episode += 1 - - # available actions - left_avail_actions = np.ones([self.num_agents, self.action_dim]) - right_avail_actions = np.ones([self.num_enemys, self.action_dim]) - avail_actions = np.concatenate([left_avail_actions, right_avail_actions], 0) - # process obs - board = [] - for i in range(self.n_player): - board.append([self.get_board(self.all_observes[i])]) - - board_ = np.concatenate(board) - obs = [] - for raw_obs in self.all_observes: - obs.append([self.raw2vec(raw_obs)]) - obs_ = np.concatenate(obs) - obs_ = np.concatenate((obs_, board_), axis=1) - - share_obs = np.repeat(np.expand_dims(obs_[0], axis=0), 6, 0) - - return obs_, share_obs, avail_actions # obs:(n_player, 288) - - # return self.all_observes - - def step(self, joint_action): - info_before = self.step_before_info() - joint_action = np.expand_dims(joint_action, 1) - all_observes, info_after = self.get_next_state(joint_action) - done = self.is_terminal() - reward = self.get_reward(joint_action) - left_avail_actions = np.ones([self.num_agents, self.action_dim]) - right_avail_actions = np.ones([self.num_enemys, self.action_dim]) - avail_actions = np.concatenate([left_avail_actions, right_avail_actions], 0) - - board = [] - for i in range(self.n_player): - board.append([self.get_board(all_observes[i])]) - - board_ = np.concatenate(board) - - obs = [] - - for raw_obs in all_observes: - obs.append([self.raw2vec(raw_obs)]) # obs:[[(14, 20, 10)], [], ..., []] - - obs_ = np.concatenate(obs) # (n_player, channels, width, height) - obs_ = np.concatenate((obs_, board_), axis=1) - - share_obs = np.repeat(np.expand_dims(obs_[0], axis=0), 6, 0) - - if done: - reward = self.get_final_reward(reward) - - rewards = np.expand_dims(np.array(reward), axis=1) - - dones = [done] * self.n_player - infos = [info_after] * self.n_player - - if self.render and self.episode % self.save_interval == 0 and 
self.env_id == 0: - img = self.render_board() - img_pil = Image.fromarray(img) - self.img_list.append(img_pil) - - if done: - self.img_list[0].save( - self.save_path.format(self.episode), - save_all=True, - append_images=self.img_list[1:], - duration=400, - ) - print("save replay gif to" + self.save_path.format(self.episode)) - - return obs_, share_obs, rewards, dones, infos, avail_actions - # return all_observes, reward, done, info_before, info_after - - # obs: 0 空白 1 豆子 2 我方蛇头 3 我方蛇身 4-5 友方蛇头 6-7 友方蛇身 8-10 敌方蛇头 11-13 敌方蛇身 - def raw2vec(self, raw_obs): - control_index = raw_obs["controlled_snake_index"] - width = raw_obs["board_width"] - height = raw_obs["board_height"] - beans = raw_obs[1] - pos = raw_obs[control_index] - - obs = np.zeros(width * height, dtype=int) - head_h, head_w = pos[0] - obs[head_h * width + head_w] = 2 - - for bean in beans: - h, w = bean - obs[h * width + w] = 1 - - for p in pos[1:]: - h, w = p - obs[h * width + w] = 3 - - if control_index == 2: - h1, w1 = raw_obs[3][0] - h2, w2 = raw_obs[4][0] - obs[h1 * width + w1] = 4 - obs[h2 * width + w2] = 5 - for p in raw_obs[3][1:]: - h, w = p - obs[h * width + w] = 6 - for p in raw_obs[4][1:]: - h, w = p - obs[h * width + w] = 7 - for i in range(self.num_agents + 2, self.n_player + 2): - h, w = raw_obs[i][0] - obs[h * width + w] = i + 3 - for p in raw_obs[i][1:]: - h, w = p - obs[h * width + w] = i + 6 - elif control_index == 3: - h1, w1 = raw_obs[2][0] - h2, w2 = raw_obs[4][0] - obs[h1 * width + w1] = 4 - obs[h2 * width + w2] = 5 - for p in raw_obs[2][1:]: - h, w = p - obs[h * width + w] = 6 - for p in raw_obs[4][1:]: - h, w = p - obs[h * width + w] = 7 - for i in range(self.num_agents + 2, self.n_player + 2): - h, w = raw_obs[i][0] - obs[h * width + w] = i + 3 - for p in raw_obs[i][1:]: - h, w = p - obs[h * width + w] = i + 6 - elif control_index == 4: - h1, w1 = raw_obs[2][0] - h2, w2 = raw_obs[3][0] - obs[h1 * width + w1] = 4 - obs[h2 * width + w2] = 5 - for p in raw_obs[2][1:]: - h, w = p - obs[h * width + w] = 6 - for p in raw_obs[3][1:]: - h, w = p - obs[h * width + w] = 7 - for i in range(self.num_agents + 2, self.n_player + 2): - h, w = raw_obs[i][0] - obs[h * width + w] = i + 3 - for p in raw_obs[i][1:]: - h, w = p - obs[h * width + w] = i + 6 - elif control_index == 5: - h1, w1 = raw_obs[6][0] - h2, w2 = raw_obs[7][0] - obs[h1 * width + w1] = 4 - obs[h2 * width + w2] = 5 - for p in raw_obs[6][1:]: - h, w = p - obs[h * width + w] = 6 - for p in raw_obs[7][1:]: - h, w = p - obs[h * width + w] = 7 - for i in range(2, self.num_agents + 2): - h, w = raw_obs[i][0] - obs[h * width + w] = i + 6 - for p in raw_obs[i][1:]: - h, w = p - obs[h * width + w] = i + 9 - elif control_index == 6: - h1, w1 = raw_obs[5][0] - h2, w2 = raw_obs[7][0] - obs[h1 * width + w1] = 4 - obs[h2 * width + w2] = 5 - for p in raw_obs[5][1:]: - h, w = p - obs[h * width + w] = 6 - for p in raw_obs[7][1:]: - h, w = p - obs[h * width + w] = 7 - for i in range(2, self.num_agents + 2): - h, w = raw_obs[i][0] - obs[h * width + w] = i + 6 - for p in raw_obs[i][1:]: - h, w = p - obs[h * width + w] = i + 9 - else: - h1, w1 = raw_obs[5][0] - h2, w2 = raw_obs[6][0] - obs[h1 * width + w1] = 4 - obs[h2 * width + w2] = 5 - for p in raw_obs[5][1:]: - h, w = p - obs[h * width + w] = 6 - for p in raw_obs[6][1:]: - h, w = p - obs[h * width + w] = 7 - for i in range(2, self.num_agents + 2): - h, w = raw_obs[i][0] - obs[h * width + w] = i + 6 - for p in raw_obs[i][1:]: - h, w = p - obs[h * width + w] = i + 9 - - obs_ = np.zeros(width * height * (self.channels - 1), 
dtype=int) - for i in range(width * height): - obs_[i * (self.channels - 1) + obs[i]] = ( - 1 # channels的最后一维是territory matrix, 此处不生成, 要减去 - ) - obs_ = obs_.reshape( - height, width, (self.channels - 1) - ) # (height, width, channels-1 ) - obs_ = obs_.transpose((2, 1, 0)) - - return obs_ - - def get_board(self, observation_list): - observation_len = len(observation_list.keys()) - teams = None - teams = [[0, 1, 2], [3, 4, 5]] # 3v3 - teams_count = len(teams) - snakes_count = sum([len(_) for _ in teams]) - - # read observation - obs = observation_list.copy() - board_height = obs["board_height"] # 10 - board_width = obs["board_width"] # 20 - # print("obs['controlled_snake_index'] is ", obs['controlled_snake_index']) - ctrl_agent_index = obs["controlled_snake_index"] - 2 # 0, 1, 2, 3, 4, 5 - # last_directions = obs['last_direction'] # ['up', 'left', 'down', 'left', 'left', 'left'] - beans_positions = obs[1] # e.g.[[7, 15], [4, 14], [5, 12], [4, 12], [5, 7]] - snakes = { - key - 2: SnakePos(obs[key], board_height, board_width, beans_positions) - for key in obs.keys() & {_ + 2 for _ in range(snakes_count)} - } # &: intersection - team_indexes = [_ for _ in teams if ctrl_agent_index in _][0] - - init_board = Board(board_height, board_width, snakes, beans_positions, teams) - bd = copy.deepcopy(init_board) - - with HiddenPrints(): - while not all( - _ == [] for _ in bd.open.values() - ): # loop until all values in open are empty list - bd.step() - - board = np.array(bd.board).transpose() - board = np.expand_dims(board, axis=0) - return board - - def init_state(self): - for i in range(self.n_player): - s = Snake(i + 2, self.board_width, self.board_height, self.init_len) - s_len = 1 - while s_len < self.init_len: - if s_len == 1 and i > 0: - origin_hit = self.is_hit(s.headPos, self.snakes_position) - else: - origin_hit = 0 - cur_head = s.move_and_add(self.snakes_position) - cur_hit = self.is_hit(cur_head, self.snakes_position) or self.is_hit( - cur_head, {i: s.segments[1:]} - ) - if origin_hit or cur_hit: - x = random.randrange(0, self.board_height) - y = random.randrange(0, self.board_width) - s.headPos = [x, y] - s.segments = [s.headPos] - s.direction = random.choice(self.actions) - s_len = 1 - else: - s_len += 1 - self.snakes_position[s.player_id] = s.segments - self.players.append(s) - - self.generate_beans() - self.init_info = { - "snakes_position": [ - list(v) - for k, v in sorted( - self.snakes_position.items(), key=lambda item: item[0] - ) - ], - "beans_position": list(self.beans_position), - } - directs = [] - for i in range(len(self.players)): - s = self.players[i] - directs.append(self.actions_name[s.direction]) - self.init_info["directions"] = directs - - return self.update_state() - - def update_state(self): - next_state = [ - [[0] * self.cell_dim for _ in range(self.board_width)] - for _ in range(self.board_height) - ] - for i in range(self.n_player): - snake = self.players[i] - for pos in snake.segments: - next_state[pos[0]][pos[1]][0] = i + 2 - - for pos in self.beans_position: - next_state[pos[0]][pos[1]][0] = 1 - - return next_state - - def step_before_info(self, info=""): - directs = [] - for i in range(len(self.players)): - s = self.players[i] - directs.append(self.actions_name[s.direction]) - info = {"directions": directs} - - return info - - def is_hit(self, cur_head, snakes_position): - is_hit = False - for k, v in snakes_position.items(): - for pos in v: - if cur_head == pos: - is_hit = True - # print("hit:", cur_head, snakes_position) - break - if is_hit: - break - - return 
is_hit - - def generate_beans(self): - all_valid_positions = set( - itertools.product(range(0, self.board_height), range(0, self.board_width)) - ) - all_valid_positions = all_valid_positions - set(map(tuple, self.beans_position)) - for positions in self.snakes_position.values(): - all_valid_positions = all_valid_positions - set(map(tuple, positions)) - - left_bean_num = self.n_beans - self.cur_bean_num - all_valid_positions = np.array(list(all_valid_positions)) - left_valid_positions = len(all_valid_positions) - - new_bean_num = ( - left_bean_num - if left_valid_positions > left_bean_num - else left_valid_positions - ) - - if left_valid_positions > 0: - new_bean_positions_idx = np.random.choice( - left_valid_positions, size=new_bean_num, replace=False - ) - new_bean_positions = all_valid_positions[new_bean_positions_idx] - else: - new_bean_positions = [] - - for new_bean_pos in new_bean_positions: - self.beans_position.append(list(new_bean_pos)) - self.cur_bean_num += 1 - - def get_all_observes(self, before_info=""): - self.all_observes = [] - for i in range(self.n_player): - each_obs = self.get_dict_observation(self.current_state, i + 2, before_info) - self.all_observes.append(each_obs) - - return self.all_observes - - def get_next_state(self, all_action): - before_info = self.step_before_info() - not_valid = self.is_not_valid_action(all_action) - if not not_valid: - # 各玩家行动 - # print("current_state", self.current_state) - eat_snakes = [0] * self.n_player - ally_reward = 0 - enemy_reward = 0 - for i in range(self.n_player): # 判断是否吃到了豆子 - snake = self.players[i] - act = self.actions[np.argmax(all_action[i][0])] - # print(snake.player_id, "此轮的动作为:", self.actions_name[act]) - snake.change_direction(act) - snake.move_and_add(self.snakes_position) # 更新snake.segment - if self.be_eaten(snake.headPos): # @yanxue - snake.snake_reward = 1 - eat_snakes[i] = 1 - else: - snake.snake_reward = 0 - snake.pop() - # print(snake.player_id, snake.segments) # @yanxue - snake_position = [[-1] * self.board_width for _ in range(self.board_height)] - re_generatelist = [0] * self.n_player - for i in range(self.n_player): # 判断是否相撞 - snake = self.players[i] - segment = snake.segments - for j in range(len(segment)): - x = segment[j][0] - y = segment[j][1] - if snake_position[x][y] != -1: - if j == 0: # 撞头 - re_generatelist[i] = 1 - compare_snake = self.players[snake_position[x][y]] - if [x, y] == compare_snake.segments[0]: # 两头相撞won - re_generatelist[snake_position[x][y]] = 1 - else: - snake_position[x][y] = i - for i in range(self.n_player): - snake = self.players[i] - if re_generatelist[i] == 1: - if eat_snakes[i] == 1: - snake.snake_reward = ( - self.init_len - len(snake.segments) + 1 - ) # 身体越长,惩罚越大 - else: - snake.snake_reward = self.init_len - len(snake.segments) - snake.segments = [] - - for i in range(self.num_agents): - ally_reward += self.players[i].snake_reward - for i in range(self.num_enemys): - enemy_reward += self.players[i + self.num_agents].snake_reward - alpha = 0.8 - for i in range(self.num_agents): - self.players[i].snake_reward = ( - self.players[i].snake_reward - enemy_reward / 3 - ) * alpha + ally_reward / 3 * (1 - alpha) - for i in range(self.num_agents, self.n_player): - self.players[i].snake_reward = ( - self.players[i].snake_reward - ally_reward / 3 - ) * alpha + enemy_reward / 3 * (1 - alpha) - - for i in range(self.n_player): - snake = self.players[i] - if re_generatelist[i] == 1: - snake = self.clear_or_regenerate(snake) - self.snakes_position[snake.player_id] = snake.segments - 
snake.score = snake.get_score() - # yanxue add - # 更新状态 - self.generate_beans() - - next_state = self.update_state() - self.current_state = next_state - self.step_cnt += 1 - - self.won = [0] * self.n_player - - for i in range(self.n_player): - s = self.players[i] - self.won[i] = s.score - info_after = {} - info_after["snakes_position"] = [ - list(v) - for k, v in sorted( - self.snakes_position.items(), key=lambda item: item[0] - ) - ] - info_after["beans_position"] = list(self.beans_position) - info_after["hit"] = re_generatelist - info_after["score"] = self.won - self.all_observes = self.get_all_observes(before_info) - - return self.all_observes, info_after - - def clear_or_regenerate(self, snake): - direct_x = [0, 1, -1, 0] - direct_y = [1, 0, 0, -1] - snake.segments = [] - snake.score = 0 - grid = self.get_render_data(self.update_state()) - - def can_regenerate(): - for x in range(self.board_height): - for y in range(self.board_width): - if grid[x][y] == 0: - q = [] - q.append([x, y]) - seg = [] - while q: - cur = q.pop(0) - if cur not in seg: - seg.append(cur) - for i in range(4): - nx = (direct_x[i] + cur[0]) % self.board_height - ny = (direct_y[i] + cur[1]) % self.board_width - # if nx < 0 or nx >= self.board_height or ny < 0 or ny >= self.board_width: - # continue - if grid[nx][ny] == 0 and [nx, ny] not in q: - grid[nx][ny] = 1 - q.append([nx, ny]) - if len(seg) == self.init_len: - # print("regenerate") - if len(seg) < 3: - snake.direction = random.choice(self.actions) - elif len(seg) == 3: - mid = ( - [seg[1][0], seg[2][1]], - [seg[2][0], seg[1][1]], - ) - if seg[0] in mid: - seg[0], seg[1] = seg[1], seg[0] - snake.segments = seg - snake.headPos = seg[0] - if seg[0][0] == seg[1][0]: - # 右 - if seg[0][1] > seg[1][1]: - snake.direction = 1 - # 左 - else: - snake.direction = -1 - elif seg[0][1] == seg[1][1]: - # 下 - if seg[0][0] > seg[1][0]: - snake.direction = 2 - # 上 - else: - snake.direction = -2 - # print("re head", snake.headPos) # 输出重新生成的蛇 - # print("re snakes segments", snake.segments) - return True - # print("clear") - return False - - flg = can_regenerate() - if not flg: - self.terminate_flg = True - # print(self.terminate_flg) - return snake - - def is_not_valid_action(self, all_action): - not_valid = 0 - if len(all_action) != self.n_player: - raise Exception("all action 维度不正确!", len(all_action)) - - for i in range(self.n_player): - if len(all_action[i][0]) != 4: - raise Exception("玩家%d joint action维度不正确!" 
% i, all_action[i]) - return not_valid - - def get_reward(self, all_action): - r = [0] * self.n_player - for i in range(self.n_player): - r[i] = self.players[i].snake_reward - self.n_return[i] += r[i] - # print("score:", self.won) - return r - - def get_final_reward(self, reward): - ally_reward = reward[0] + reward[1] + reward[2] - enemy_reward = reward[3] + reward[4] + reward[5] - if ally_reward > enemy_reward: - reward[0] += 10 - reward[1] += 10 - reward[2] += 10 - reward[3] -= 10 - reward[4] -= 10 - reward[5] -= 10 - elif ally_reward < enemy_reward: - reward[3] += 10 - reward[4] += 10 - reward[5] += 10 - reward[0] -= 10 - reward[1] -= 10 - reward[2] -= 10 - return reward - - def is_terminal(self): - all_member = self.n_beans - # all_member = len(self.beans_position) - for s in self.players: - all_member += len(s.segments) - is_done = ( - self.step_cnt > self.max_step - or all_member > self.board_height * self.board_width - ) - - return is_done or self.terminate_flg - - def encode(self, actions): - joint_action = self.init_action_space() - if len(actions) != self.n_player: - raise Exception("action输入维度不正确!", len(actions)) - for i in range(self.n_player): - joint_action[i][0][int(actions[i])] = 1 - return joint_action - - def get_terminal_actions(self): - print("请输入%d个玩家的动作方向[0-3](上下左右),空格隔开:" % self.n_player) - cur = input() - actions = cur.split(" ") - return self.encode(actions) - - def be_eaten(self, snake_pos): - for bean in self.beans_position: - if snake_pos[0] == bean[0] and snake_pos[1] == bean[1]: - self.beans_position.remove(bean) - self.cur_bean_num -= 1 - return True - return False - - def get_action_dim(self): - action_dim = 1 - for i in range(len(self.joint_action_space[0])): - action_dim *= self.joint_action_space[0][i].n - - return action_dim - - def draw_board(self): - cols = [chr(i) for i in range(65, 65 + self.board_width)] - s = ", ".join(cols) - print(" ", s) - for i in range(self.board_height): - # print(i) - print(chr(i + 65), self.current_state[i]) - - @staticmethod - def _render_board(state, board, colors, unit, fix, extra_info): - im = GridGame._render_board(state, board, colors, unit, fix) - draw = ImageDraw.Draw(im) - # fnt = ImageFont.truetype("Courier.dfont", 16) - fnt = ImageFont.load_default() - for i, pos in zip(count(1), extra_info): - x, y = pos - draw.text( - ((y + 1 / 4) * unit, (x + 1 / 4) * unit), - "#{}".format(i), - font=fnt, - fill=(0, 0, 0), - ) - - return im - - def render_board(self): - extra_info = [tuple(x.headPos) for x in self.players] - im_data = np.array( - SnakeEatBeans._render_board( - self.get_render_data(self.current_state), - self.grid, - self.colors, - self.grid_unit, - self.grid_unit_fix, - extra_info, - ) - ) - return im_data - - @staticmethod - def parse_extra_info(data): - # return eval(re.search(r'({.*})', data['info_after']).group(1)).values() - # d = (eval(eval(data)['snakes_position']).values()) - if isinstance(data, str): - d = eval(data)["snakes_position"] - else: - d = data["snakes_position"] - - return [i[0] for i in d] - - -class Snake: - def __init__(self, player_id, board_width, board_height, init_len): - self.actions = [-2, 2, -1, 1] - self.actions_name = {-2: "up", 2: "down", -1: "left", 1: "right"} - self.direction = random.choice(self.actions) # 方向[-2,2,-1,1]分别表示[上,下,左,右] - self.board_width = board_width - self.board_height = board_height - x = random.randrange(0, board_height) - y = random.randrange(0, board_width) - self.segments = [[x, y]] - self.headPos = self.segments[0] - self.player_id = player_id - 
self.score = 0 - self.snake_reward = 0 - self.init_len = init_len - - def get_score(self): - return len(self.segments) - self.init_len - - def change_direction(self, act): - if act + self.direction != 0: - self.direction = act - else: - n_direct = random.choice(self.actions) - while n_direct + self.direction == 0: - n_direct = random.choice(self.actions) - self.direction = n_direct - # print("方向不合法,重新生成") - # print("direction", self.actions_name[self.direction]) - - # 超过边界,可以穿越 - def update_position(self, position): - position[0] %= self.board_height - position[1] %= self.board_width - return position - - def move_and_add(self, snakes_position): - cur_head = list(self.headPos) - # 根据方向移动蛇头的坐标 - # 右 - if self.direction == 1: - cur_head[1] += 1 - # 左 - if self.direction == -1: - cur_head[1] -= 1 - # 上 - if self.direction == -2: - cur_head[0] -= 1 - # 下 - if self.direction == 2: - cur_head[0] += 1 - - cur_head = self.update_position(cur_head) - # print("cur head", cur_head) - # print("cur snakes positions", snakes_position) - - self.segments.insert(0, cur_head) - self.headPos = self.segments[0] - return cur_head - - def pop(self): - self.segments.pop() # 在蛇尾减去一格 diff --git a/tests/test_env/test_snake_env.py b/tests/test_env/test_snake_env.py new file mode 100644 index 00000000..8e558231 --- /dev/null +++ b/tests/test_env/test_snake_env.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" +import os +import sys + +import gymnasium as gym +import numpy as np +import pytest +from gymnasium import spaces + +from openrl.envs.common import make +from openrl.envs.wrappers.base_wrapper import BaseObservationWrapper +from openrl.selfplay.wrappers.random_opponent_wrapper import RandomOpponentWrapper + + +class ConvertObs(BaseObservationWrapper): + def __init__(self, env: gym.Env): + BaseObservationWrapper.__init__(self, env) + self.observation_space = spaces.Box( + low=-np.inf, high=np.inf, shape=(576,), dtype=np.float32 + ) + + def observation(self, observation): + new_obs = np.zeros((len(observation), 576), dtype=int) + return new_obs + + +@pytest.mark.unittest +def test_snake(): + env_num = 2 + for i in [1, 3]: + env = make( + f"snakes_{i}v{i}", + env_num=env_num, + asynchronous=False, + opponent_wrappers=[RandomOpponentWrapper], + env_wrappers=[ConvertObs], + auto_reset=False, + ) + ep_num = 3 + for ep_now in range(ep_num): + obs, info = env.reset() + done = False + step = 0 + + while not np.any(done): + obs, r, done, info = env.step(env.random_action()) + step += 1 + + env.close() + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_env/test_vec_env/test_vec_wrappers.py b/tests/test_env/test_vec_env/test_vec_wrappers.py index b71ce173..9b5735c9 100644 --- a/tests/test_env/test_vec_env/test_vec_wrappers.py +++ b/tests/test_env/test_vec_env/test_vec_wrappers.py @@ -16,13 +16,16 @@ """""" import os +import pickle import sys import numpy as np import pytest from openrl.envs.common import make +from openrl.envs.vec_env.wrappers.gen_data import GenDataWrapper, GenDataWrapper_v1 from openrl.envs.vec_env.wrappers.zero_reward_wrapper import ZeroRewardWrapper +from openrl.envs.wrappers.monitor import Monitor @pytest.mark.unittest @@ -38,5 +41,47 @@ def test_zero_reward_wrapper(): env.close() +@pytest.mark.unittest +def test_gen_data(tmp_path): + total_episode = 4 + env = make("IdentityEnv", env_wrappers=[Monitor], env_num=1) + data_save_path = tmp_path / "data.pkl" + env = GenDataWrapper( + env, data_save_path=str(data_save_path), total_episode=total_episode + ) + obs, info = env.reset(seed=0) + done = False + while not done: + obs, r, done, info = env.step(env.random_action()) + env.close() + + save_data = pickle.load(open(data_save_path, "rb")) + assert len(save_data["episode_lengths"]) == total_episode, ( + f"episode_lengths {len(save_data['episode_lengths'])} " + f"should be equal to total_episode {total_episode}" + ) + + +@pytest.mark.unittest +def test_gen_data_old(tmp_path): + total_episode = 4 + env = make("IdentityEnv", env_wrappers=[Monitor], env_num=1) + data_save_path = tmp_path / "data.pkl" + env = GenDataWrapper_v1( + env, data_save_path=str(data_save_path), total_episode=total_episode + ) + obs, info = env.reset(seed=0) + done = False + while not done: + obs, r, done, info = env.step(env.random_action()) + env.close() + + save_data = pickle.load(open(data_save_path, "rb")) + assert save_data["total_episode"] == total_episode, ( + f"episode_lengths {save_data['total_episode']} " + f"should be equal to total_episode {total_episode}" + ) + + if __name__ == "__main__": sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 9daedf1ba6156eff09e1bc83f101c6e2870d5072 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 17 Oct 2023 16:12:28 +0800 Subject: [PATCH 05/78] update test --- openrl/envs/mpe/rendering.py | 78 ---------------------------------- tests/test_env/test_mpe_env.py | 31 
++++++++++++-- 2 files changed, 27 insertions(+), 82 deletions(-) diff --git a/openrl/envs/mpe/rendering.py b/openrl/envs/mpe/rendering.py index ab1a47db..9eed52e1 100644 --- a/openrl/envs/mpe/rendering.py +++ b/openrl/envs/mpe/rendering.py @@ -320,28 +320,6 @@ def make_polyline(v): return PolyLine(v, False) -def make_capsule(length, width): - left, r, t, b = 0, length, width / 2, -width / 2 - box = make_polygon([(left, b), (left, t), (r, t), (r, b)]) - circ0 = make_circle(width / 2) - circ1 = make_circle(width / 2) - circ1.add_attr(Transform(translation=(length, 0))) - geom = Compound([box, circ0, circ1]) - return geom - - -class Compound(Geom): - def __init__(self, gs): - Geom.__init__(self) - self.gs = gs - for g in self.gs: - g.attrs = [a for a in g.attrs if not isinstance(a, Color)] - - def render1(self): - for g in self.gs: - g.render() - - class PolyLine(Geom): def __init__(self, v, close): Geom.__init__(self) @@ -373,59 +351,3 @@ def render1(self): glVertex2f(*self.start) glVertex2f(*self.end) glEnd() - - -class Image(Geom): - def __init__(self, fname, width, height): - Geom.__init__(self) - self.width = width - self.height = height - img = pyglet.image.load(fname) - self.img = img - self.flip = False - - def render1(self): - self.img.blit( - -self.width / 2, -self.height / 2, width=self.width, height=self.height - ) - - -# ================================================================ - - -class SimpleImageViewer(object): - def __init__(self, display=None): - self.window = None - self.isopen = False - self.display = display - - def imshow(self, arr): - if self.window is None: - height, width, channels = arr.shape - self.window = pyglet.window.Window( - width=width, height=height, display=self.display - ) - self.width = width - self.height = height - self.isopen = True - assert arr.shape == ( - self.height, - self.width, - 3, - ), "You passed in an image with the wrong number shape" - image = pyglet.image.ImageData( - self.width, self.height, "RGB", arr.tobytes(), pitch=self.width * -3 - ) - self.window.clear() - self.window.switch_to() - self.window.dispatch_events() - image.blit(0, 0) - self.window.flip() - - def close(self): - if self.isopen: - self.window.close() - self.isopen = False - - def __del__(self): - self.close() diff --git a/tests/test_env/test_mpe_env.py b/tests/test_env/test_mpe_env.py index 2dd664b6..0b555bb2 100644 --- a/tests/test_env/test_mpe_env.py +++ b/tests/test_env/test_mpe_env.py @@ -18,15 +18,16 @@ import os import sys +import numpy as np import pytest +from openrl.envs.common import make + @pytest.mark.unittest def test_mpe(): - from openrl.envs.common import make - - env_num = 6 - env = make("simple_spread", env_num=6) + env_num = 3 + env = make("simple_spread", env_num=env_num) obs, info = env.reset() obs, reward, done, info = env.step(env.random_action()) assert env.agent_num == 3 @@ -34,5 +35,27 @@ def test_mpe(): env.close() +@pytest.mark.unittest +def test_mpe_render(): + render_model = "human" + env_num = 2 + env = make( + "simple_spread", render_mode=render_model, env_num=env_num, asynchronous=False + ) + + env.reset(seed=0) + done = False + step = 0 + total_reward = 0 + while not np.any(done): + # Based on environmental observation input, predict next action. 
+ + obs, r, done, info = env.step(env.random_action()) + step += 1 + total_reward += np.mean(r) + + env.close() + + if __name__ == "__main__": sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From d1948605a4874b2a09c10353e1206162711853a0 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 17 Oct 2023 16:57:21 +0800 Subject: [PATCH 06/78] update test --- .github/workflows/unit_test.yml | 6 +++++- openrl/envs/mpe/rendering.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml index cac1ec21..4c516d0f 100644 --- a/.github/workflows/unit_test.yml +++ b/.github/workflows/unit_test.yml @@ -17,6 +17,10 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y xvfb libglu1-mesa-dev python-opengl - name: Upgrade pip run: | python -m pip install --upgrade pip setuptools wheel @@ -27,7 +31,7 @@ jobs: - name: do_unittest timeout-minutes: 40 run: | - python3 -m pytest tests --cov=openrl --cov-report=xml -m unittest --cov-report=term-missing --durations=0 -v --color=yes + xvfb-run -s "-screen 0 1400x900x24" python3 -m pytest tests --cov=openrl --cov-report=xml -m unittest --cov-report=term-missing --durations=0 -v --color=yes - name: Upload coverage reports to Codecov with GitHub Action uses: codecov/codecov-action@v3 with: diff --git a/openrl/envs/mpe/rendering.py b/openrl/envs/mpe/rendering.py index 9eed52e1..c66f2a0c 100644 --- a/openrl/envs/mpe/rendering.py +++ b/openrl/envs/mpe/rendering.py @@ -26,6 +26,7 @@ try: from pyglet.gl import * + except ImportError: print( "Error occured while running `from pyglet.gl import *`", From 93ce6f1accda2a3d6d5e99408f50c3de49f82e85 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 17 Oct 2023 17:06:01 +0800 Subject: [PATCH 07/78] update test --- .github/workflows/unit_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml index 4c516d0f..e327cdf5 100644 --- a/.github/workflows/unit_test.yml +++ b/.github/workflows/unit_test.yml @@ -20,7 +20,7 @@ jobs: - name: Install system dependencies run: | sudo apt-get update - sudo apt-get install -y xvfb libglu1-mesa-dev python-opengl + sudo apt-get install -y xvfb libglu1-mesa-dev python3-opengl - name: Upgrade pip run: | python -m pip install --upgrade pip setuptools wheel From 8e17ae19d82c9ec6a077a079636d6ea182b143eb Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Wed, 18 Oct 2023 12:59:44 +0800 Subject: [PATCH 08/78] fix winning rate output bug of smac --- examples/smac/custom_vecinfo.py | 8 ++++---- examples/smac/train_ppo.py | 3 ++- examples/smacv2/custom_vecinfo.py | 14 +++++++------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/examples/smac/custom_vecinfo.py b/examples/smac/custom_vecinfo.py index 52a2b5b2..ba39f6e1 100644 --- a/examples/smac/custom_vecinfo.py +++ b/examples/smac/custom_vecinfo.py @@ -41,10 +41,10 @@ def statistics(self, buffer: Any) -> Dict[str, Any]: assert ( "game_state" in singe_env_info["final_info"].keys() ), "game_state must be in info" - assert singe_env_info["final_info"]["game_state"] in [ - "win", - "lose", - ], "game_state in the final_info must be win or lose" + # assert singe_env_info["final_info"]["game_state"] in [ + # "win", + # "lose", + # ], "game_state in the final_info must be win or lose" self.win_history.append( singe_env_info["final_info"]["game_state"] 
== "win" ) diff --git a/examples/smac/train_ppo.py b/examples/smac/train_ppo.py index 32f8acff..4c03d295 100644 --- a/examples/smac/train_ppo.py +++ b/examples/smac/train_ppo.py @@ -25,7 +25,8 @@ def train(): # create environment env_num = 8 env = make( - "2s_vs_1sc", + "3m", + # "2s_vs_1sc", env_num=env_num, asynchronous=True, cfg=cfg, diff --git a/examples/smacv2/custom_vecinfo.py b/examples/smacv2/custom_vecinfo.py index 6dd90d00..48fc210d 100644 --- a/examples/smacv2/custom_vecinfo.py +++ b/examples/smacv2/custom_vecinfo.py @@ -33,21 +33,21 @@ def __init__(self, *args, **kwargs): def statistics(self, buffer: Any) -> Dict[str, Any]: info_dict = super().statistics(buffer) - """for step_info in self.infos: + for step_info in self.infos: for singe_env_info in step_info: assert isinstance(singe_env_info, dict), "singe_env_info must be dict" if "final_info" in singe_env_info.keys(): assert ( "game_state" in singe_env_info["final_info"].keys() - ), "game_state must be in info" - assert singe_env_info["final_info"]["game_state"] in [ - "win", - "lose", - ], "game_state in the final_info must be win or lose" + ), "win_state must be in info" + # assert singe_env_info["final_info"]["game_state"] in [ + # "win", + # "lose", + # ], "win_state in the final_info must be win or lose" self.win_history.append( singe_env_info["final_info"]["game_state"] == "win" - )""" + ) if len(self.win_history) > 0: info_dict["win_rate"] = np.mean(self.win_history) From 4703fc9ca9fea65574ae506ca57cd2ab850a63c5 Mon Sep 17 00:00:00 2001 From: Chen001117 Date: Wed, 18 Oct 2023 23:48:52 -0400 Subject: [PATCH 09/78] support deepspeed --- examples/nlp/nlp_ppo.yaml | 12 +- examples/nlp/train_ppo.py | 2 +- openrl/algorithms/ppo.py | 15 +- openrl/configs/config.py | 18 + openrl/envs/common/registration.py | 1 - openrl/envs/nlp/daily_dialog_env.py | 2 +- openrl/envs/nlp/rewards/intent.py | 43 +- openrl/envs/nlp/rewards/kl_penalty.py | 47 +- openrl/modules/networks/policy_network.py | 10 +- .../networks/policy_value_network_gpt.py | 20 +- .../modules/networks/utils/distributions.py | 2 +- .../modules/networks/utils/nlp/base_policy.py | 4 +- .../networks/utils/nlp/causal_policy.py | 43 +- .../networks/utils/nlp/hf_generation_utils.py | 4000 ----------------- openrl/modules/networks/value_network.py | 4 + openrl/modules/rl_module.py | 70 +- 16 files changed, 226 insertions(+), 4067 deletions(-) delete mode 100644 openrl/modules/networks/utils/nlp/hf_generation_utils.py diff --git a/examples/nlp/nlp_ppo.yaml b/examples/nlp/nlp_ppo.yaml index 47da5280..0b4e0f56 100644 --- a/examples/nlp/nlp_ppo.yaml +++ b/examples/nlp/nlp_ppo.yaml @@ -1,19 +1,19 @@ seed: 0 -lr: 1e-6 -critic_lr: 1e-6 +lr: 2e-7 +critic_lr: 2e-7 run_dir: ./run_results/ log_interval: 1 -use_recurrent_policy: true use_valuenorm: true use_adv_normalize: true wandb_entity: "openrl-lab" ppo_epoch: 5 -episode_length: 128 +episode_length: 112 num_mini_batch: 20 use_share_model: true -use_amp: true + hidden_size: 1 -data_chunk_length: 1 +use_deepspeed: true +use_fp16: true model_path: rajkumarrrk/gpt2-fine-tuned-on-daily-dialog env: diff --git a/examples/nlp/train_ppo.py b/examples/nlp/train_ppo.py index f549c122..e6d115c1 100644 --- a/examples/nlp/train_ppo.py +++ b/examples/nlp/train_ppo.py @@ -13,7 +13,7 @@ def train(): cfg_parser = create_config_parser() cfg = cfg_parser.parse_args() - env_num = 10 + env_num = 5 env = make( "daily_dialog", env_num=env_num, diff --git a/openrl/algorithms/ppo.py b/openrl/algorithms/ppo.py index 51400374..160fcfbe 100644 --- 
a/openrl/algorithms/ppo.py +++ b/openrl/algorithms/ppo.py @@ -41,6 +41,7 @@ def __init__( self.use_joint_action_loss = cfg.use_joint_action_loss super(PPOAlgorithm, self).__init__(cfg, init_module, agent_num, device) self.train_list = [self.train_ppo] + self.use_deepspeed = cfg.use_deepspeed def ppo_update(self, sample, turn_on=True): for optimizer in self.algo_module.optimizers.values(): @@ -108,8 +109,18 @@ def ppo_update(self, sample, turn_on=True): active_masks_batch, turn_on, ) - for loss in loss_list: - loss.backward() + if self.use_deepspeed: + if self._use_share_model: + for loss in loss_list: + self.algo_module.models["model"].backward(loss) + else: + actor_loss = loss_list[0] + critic_loss = loss_list[1] + self.algo_module.models["policy"].backward(actor_loss) + self.algo_module.models["critic"].backward(critic_loss) + else: + for loss in loss_list: + loss.backward() # else: if self._use_share_model: diff --git a/openrl/configs/config.py b/openrl/configs/config.py index 8c714b68..fc74e568 100644 --- a/openrl/configs/config.py +++ b/openrl/configs/config.py @@ -1214,5 +1214,23 @@ def create_config_parser(): type=float, help="newest_weight", ) + parser.add_argument( + "--use_deepspeed", + default=False, + type=bool, + help="whether to use deepspeed", + ) + parser.add_argument( + "--local_rank", + default=-1, + type=int, + help="local_rank", + ) + parser.add_argument( + "--use_fp16", + default=False, + type=bool, + help="whether to use fp16", + ) return parser diff --git a/openrl/envs/common/registration.py b/openrl/envs/common/registration.py index 4b852954..5ad99078 100644 --- a/openrl/envs/common/registration.py +++ b/openrl/envs/common/registration.py @@ -107,7 +107,6 @@ def make( id=id, env_num=env_num, render_mode=convert_render_mode, - cfg=cfg, **kwargs, ) elif id in openrl.envs.toy_all_envs: diff --git a/openrl/envs/nlp/daily_dialog_env.py b/openrl/envs/nlp/daily_dialog_env.py index 0c7a6ff7..fa838a06 100644 --- a/openrl/envs/nlp/daily_dialog_env.py +++ b/openrl/envs/nlp/daily_dialog_env.py @@ -40,7 +40,7 @@ def __init__( self.env_name = "daily_dialog" tokenizer_name = cfg.env.args["tokenizer_path"] - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True) + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token self.tokenizer.padding_side = "left" diff --git a/openrl/envs/nlp/rewards/intent.py b/openrl/envs/nlp/rewards/intent.py index 397ea810..10fa65d1 100644 --- a/openrl/envs/nlp/rewards/intent.py +++ b/openrl/envs/nlp/rewards/intent.py @@ -9,25 +9,52 @@ from openrl.supports.opengpu.manager import LocalGPUManager +def get_eval_ds_config(offload, stage=0): + device = "cpu" if offload else "none" + zero_opt_dict = { + "stage": stage, + "offload_param": { + "device": device + }, + } + return { + "train_batch_size": 28, + "train_micro_batch_size_per_gpu": 7, + "steps_per_print": 10, + "zero_optimization": zero_opt_dict, + "fp16": { + "enabled": True + }, + } + + class Intent: def __init__(self, intent_model: str, intent_coeff: float = 1.0) -> None: super().__init__() self._intent_coeff = intent_coeff + self.use_deepspeed = True # TODO model_path = data_abs_path(intent_model) self._tokenizer = AutoTokenizer.from_pretrained(intent_model) self._model = AutoModelForSequenceClassification.from_pretrained(model_path) - if torch.cuda.is_available(): - manager = LocalGPUManager() - manager.log_info() - self._device = f"cuda:{manager.get_gpu()}" + if 
self.use_deepspeed: + import deepspeed + self._model = self._model.to('cuda') + ds_config = get_eval_ds_config(offload=True, stage=0) + self._model, *_ = deepspeed.initialize(model=self._model, config=ds_config) + self._device = "cuda" else: - self._device = "cpu" - print("Intent Model choose to use device:{}".format(self._device)) - - self._model = self._model.to(self._device) + if torch.cuda.is_available(): + manager = LocalGPUManager() + manager.log_info() + self._device = f"cuda:{manager.get_gpu()}" + else: + self._device = "cpu" + print("Intent Model choose to use device:{}".format(self._device)) + + self._model = self._model.to(self._device) def __call__( self, diff --git a/openrl/envs/nlp/rewards/kl_penalty.py b/openrl/envs/nlp/rewards/kl_penalty.py index 039d82d5..bf5a074f 100644 --- a/openrl/envs/nlp/rewards/kl_penalty.py +++ b/openrl/envs/nlp/rewards/kl_penalty.py @@ -10,6 +10,25 @@ from openrl.envs.nlp.utils.distribution import CategoricalDistribution +def get_eval_ds_config(offload, stage=0): + device = "cpu" if offload else "none" + zero_opt_dict = { + "stage": stage, + "offload_param": { + "device": device + }, + } + return { + "train_batch_size": 28, # + "train_micro_batch_size_per_gpu": 7, + "steps_per_print": 10, + "zero_optimization": zero_opt_dict, + "fp16": { + "enabled": True + }, + } + + class KLPenalty(nn.Module): def __init__( self, @@ -18,16 +37,23 @@ def __init__( apply_model_parallel: bool = True, ): super().__init__() + self.use_deepspeed = True + self.use_fp16 = True # reference model self._apply_model_parallel = apply_model_parallel self._ref_net = AutoModelForCausalLM.from_pretrained(ref_model) self._ref_net = self._ref_net.eval() - if torch.cuda.is_available(): + if self.use_deepspeed: + import deepspeed + ds_config = get_eval_ds_config(offload=True, stage=0) + self._ref_engine, *_ = deepspeed.initialize(model=self, config=ds_config) + elif torch.cuda.is_available(): if self._apply_model_parallel and self._ref_net.is_parallelizable: self._ref_net.parallelize() else: # else defaults to data parallel self._ref_net = torch.nn.DataParallel(self._ref_net) + # alpha adjustment self._alpha = 0.2 @@ -61,11 +87,16 @@ def __call__( past_model_kwargs = { "attention_mask": attention_mask, } - model_inputs = self._prepare_inputs_for_model( self._ref_net, input_ids, past_model_kwargs ) + if self.use_fp16: + for key in ["input_ids", "position_ids"]: + model_inputs[key] = model_inputs[key].half().int() + for key in ["attention_mask"]: + model_inputs[key] = model_inputs[key].half() + with torch.no_grad(): output = self._ref_net(output_hidden_states=True, **model_inputs) output["past_key_values"] = None @@ -108,4 +139,16 @@ def _prepare_inputs_for_model( ) for key, value in model_inputs.items() } + + if self.use_deepspeed: + model_inputs = { + key: ( + value.to('cuda') + if isinstance(value, torch.Tensor) + else value + ) + for key, value in model_inputs.items() + } + + return model_inputs diff --git a/openrl/modules/networks/policy_network.py b/openrl/modules/networks/policy_network.py index 422eaa58..280adfc6 100644 --- a/openrl/modules/networks/policy_network.py +++ b/openrl/modules/networks/policy_network.py @@ -53,6 +53,7 @@ def __init__( self._influence_layer_N = cfg.influence_layer_N self._use_policy_vhead = cfg.use_policy_vhead self._recurrent_N = cfg.recurrent_N + self._use_fp16 = cfg.use_fp16 and cfg.use_deepspeed self.use_half = use_half self.tpdv = dict(dtype=torch.float32, device=device) @@ -135,8 +136,9 @@ def forward_original( policy_obs[key].half() else: 
policy_obs = check(policy_obs, self.use_half, self.tpdv) - # if self.use_half: - # obs = obs.half() + if self.use_half or self._use_fp16: + policy_obs = policy_obs.half() + rnn_states = check(rnn_states, self.use_half, self.tpdv) masks = check(masks, self.use_half, self.tpdv) @@ -165,6 +167,8 @@ def eval_actions( obs[key] = check(obs[key], self.use_half, self.tpdv) else: obs = check(obs, self.use_half, self.tpdv) + if self._use_fp16: + obs = obs.half() rnn_states = check(rnn_states, self.use_half, self.tpdv) action = check(action, self.use_half, self.tpdv) @@ -202,6 +206,8 @@ def get_policy_values(self, obs, rnn_states, masks): obs[key] = check(obs[key], self.use_half, self.tpdv) else: obs = check(obs).to(**self.tpdv) + if self.use_half or self._use_fp16: + obs = obs.half() rnn_states = check(rnn_states, self.use_half, self.tpdv) masks = check(masks, self.use_half, self.tpdv) diff --git a/openrl/modules/networks/policy_value_network_gpt.py b/openrl/modules/networks/policy_value_network_gpt.py index e87e146b..1549b5b9 100644 --- a/openrl/modules/networks/policy_value_network_gpt.py +++ b/openrl/modules/networks/policy_value_network_gpt.py @@ -43,6 +43,7 @@ def __init__( device=device, ) self.use_half = use_half + self._use_fp16 = cfg.use_fp16 and cfg.use_deepspeed self.tpdv = dict(dtype=torch.float32, device=device) def get_actor_para(self): @@ -66,6 +67,8 @@ def get_actions( ): for key in obs.keys(): obs[key] = check(obs[key], self.use_half, self.tpdv) + if self._use_fp16: + obs[key] = obs[key].half() rnn_states = check(rnn_states, self.use_half, self.tpdv) past_model_kwargs = None @@ -83,6 +86,8 @@ def eval_actions( ): for key in obs.keys(): obs[key] = check(obs[key], self.use_half, self.tpdv) + if self._use_fp16: + obs[key] = obs[key].half() action = check(action, self.use_half, self.tpdv).squeeze() eval_output = super().evaluate_actions(obs, action) @@ -95,20 +100,11 @@ def eval_actions( def get_values(self, obs, rnn_states, masks): for key in obs.keys(): obs[key] = check(obs[key], self.use_half, self.tpdv) + if self._use_fp16: + obs[key] = obs[key].half() rnn_states = check(rnn_states, self.use_half, self.tpdv) value_output = super().forward_value(obs) values = value_output.values - return values, rnn_states - - def get_log_probs_ref_model(self, obs, action): - for key in obs.keys(): - obs[key] = check(obs[key], self.use_half, self.tpdv) - action = check(action, self.use_half, self.tpdv) - action = action.squeeze(-1) - - policy_output = super().get_log_probs_ref_model(obs, action) - action_log_probs = policy_output.log_probs - - return action_log_probs.detach().cpu().numpy() + return values, rnn_states \ No newline at end of file diff --git a/openrl/modules/networks/utils/distributions.py b/openrl/modules/networks/utils/distributions.py index 340015a4..ebd7421f 100644 --- a/openrl/modules/networks/utils/distributions.py +++ b/openrl/modules/networks/utils/distributions.py @@ -68,7 +68,7 @@ def init_(m): def forward(self, x, action_masks=None): x = self.linear(x) if action_masks is not None: - x[action_masks == 0] = -1e10 + x[action_masks == 0] = -6e4 # fp16 return FixedCategorical(logits=x) diff --git a/openrl/modules/networks/utils/nlp/base_policy.py b/openrl/modules/networks/utils/nlp/base_policy.py index 9051b886..02d99086 100644 --- a/openrl/modules/networks/utils/nlp/base_policy.py +++ b/openrl/modules/networks/utils/nlp/base_policy.py @@ -130,7 +130,7 @@ def __init__( optimizer_kwargs: Dict[str, Any] = {}, weight_decay: float = 1e-6, use_sde: bool = None, - apply_model_parallel: 
bool = True, + apply_model_parallel: bool = False, # TODO optimizer_class: torch.optim.Optimizer = torch.optim.AdamW, generation_kwargs: Dict[str, Any] = {}, prompt_truncation_side: str = "left", @@ -152,10 +152,10 @@ def __init__( prompt_truncation_side (str, optional): truncation side for prompt text. Defaults to "left". """ super().__init__() + self._use_deepspeed = True # TODO self._action_space = action_space self._apply_model_parallel = apply_model_parallel self._build_model_heads(model_name, config, device) - self._setup_optimizer(optimizer_kwargs, weight_decay, optimizer_class) self._action_dist = CategoricalDistribution(self._action_space.n) self._generation_kwargs = generation_kwargs self._prompt_truncation_side = prompt_truncation_side diff --git a/openrl/modules/networks/utils/nlp/causal_policy.py b/openrl/modules/networks/utils/nlp/causal_policy.py index dedfc4aa..956d3471 100644 --- a/openrl/modules/networks/utils/nlp/causal_policy.py +++ b/openrl/modules/networks/utils/nlp/causal_policy.py @@ -15,10 +15,6 @@ PolicyType, ValueOutput, ) -from openrl.modules.networks.utils.nlp.hf_generation_utils import ( - override_generation_routines, - unwrap_generation_routines, -) from openrl.modules.utils.valuenorm import ValueNorm @@ -65,7 +61,6 @@ def load_from_dict(self, state_dict: dict = None): @property def policy(self): policy_model = self._policy_model - policy_model.__class__ = unwrap_generation_routines(type(policy_model)) return policy_model def _build_model_heads(self, model_name: str, config: str, device: str): @@ -81,10 +76,6 @@ def _build_model_heads(self, model_name: str, config: str, device: str): model_name, config=config ) - self._policy_model.__class__ = override_generation_routines( - type(self._policy_model) - ) - self._value_model = AutoModelForCausalLM.from_pretrained( model_name, config=config ) @@ -99,7 +90,16 @@ def _build_model_heads(self, model_name: str, config: str, device: str): torch.multiprocessing.set_sharing_strategy("file_system") # apply model parallel if torch.cuda.is_available(): - if self._apply_model_parallel and self._policy_model.is_parallelizable: + if self._use_deepspeed: + if self.value_normalizer is not None: + import deepspeed + para = self.value_normalizer.running_mean + deepspeed.zero.register_external_parameter(self, para) + para = self.value_normalizer.running_mean_sq + deepspeed.zero.register_external_parameter(self, para) + para = self.value_normalizer.debiasing_term + deepspeed.zero.register_external_parameter(self, para) + elif self._apply_model_parallel and self._policy_model.is_parallelizable: self._policy_model.parallelize() self._value_model.parallelize() self._value_head = self._value_head.to(self.device) @@ -126,17 +126,18 @@ def _prepare_inputs_for_model( input_ids, **model_kwargs ) - if self._apply_model_parallel and unwrap_model(model).is_parallelizable: - # if model is in parallel mode, move the tensors to the first device - model_inputs = { - key: ( - value.to(model.transformer.first_device) - if isinstance(value, torch.Tensor) - and hasattr(model.transformer, "first_device") - else value - ) - for key, value in model_inputs.items() - } + if not self._use_deepspeed: + if self._apply_model_parallel and unwrap_model(model).is_parallelizable: + # if model is in parallel mode, move the tensors to the first device + model_inputs = { + key: ( + value.to(model.transformer.first_device) + if isinstance(value, torch.Tensor) + and hasattr(model.transformer, "first_device") + else value + ) + for key, value in model_inputs.items() + 
} return model_inputs def forward_policy( diff --git a/openrl/modules/networks/utils/nlp/hf_generation_utils.py b/openrl/modules/networks/utils/nlp/hf_generation_utils.py deleted file mode 100644 index 37d80875..00000000 --- a/openrl/modules/networks/utils/nlp/hf_generation_utils.py +++ /dev/null @@ -1,4000 +0,0 @@ -# coding=utf-8 -# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import warnings -from dataclasses import dataclass -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union - -import torch -import torch.distributed as dist -from torch import nn -from transformers.generation_beam_constraints import ( - Constraint, - DisjunctiveConstraint, - PhrasalConstraint, -) -from transformers.generation_beam_search import ( - BeamScorer, - BeamSearchScorer, - ConstrainedBeamSearchScorer, -) -from transformers.generation_logits_process import ( - EncoderNoRepeatNGramLogitsProcessor, - ExponentialDecayLengthPenalty, - ForcedBOSTokenLogitsProcessor, - ForcedEOSTokenLogitsProcessor, - HammingDiversityLogitsProcessor, - InfNanRemoveLogitsProcessor, - LogitsProcessorList, - MinLengthLogitsProcessor, - NoBadWordsLogitsProcessor, - NoRepeatNGramLogitsProcessor, - PrefixConstrainedLogitsProcessor, - RepetitionPenaltyLogitsProcessor, - TemperatureLogitsWarper, - TopKLogitsWarper, - TopPLogitsWarper, - TypicalLogitsWarper, -) -from transformers.generation_stopping_criteria import ( - MaxLengthCriteria, - MaxTimeCriteria, - StoppingCriteria, - StoppingCriteriaList, - validate_stopping_criteria, -) -from transformers.generation_utils import GenerationMixin -from transformers.pytorch_utils import torch_int_div -from transformers.utils import ModelOutput, logging - -logger = logging.get_logger(__name__) - - -@dataclass -class GreedySearchDecoderOnlyOutput(ModelOutput): - """ - Base class for outputs of decoder-only generation models using greedy search. - - - Args: - sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor` with each - tensor of shape `(batch_size, config.vocab_size)`). 
- attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`. - """ - - sequences: torch.LongTensor = None - scores: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -@dataclass -class GreedySearchEncoderDecoderOutput(ModelOutput): - """ - Base class for outputs of encoder-decoder generation models using greedy search. Hidden states and attention - weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the - encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) - - - Args: - sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor of shape - `(batch_size, config.vocab_size)`). - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. - encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size, sequence_length, hidden_size)`. - decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. 
- decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`. - """ - - sequences: torch.LongTensor = None - scores: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -@dataclass -class SampleDecoderOnlyOutput(ModelOutput): - """ - Base class for outputs of decoder-only generation models using sampling. - - - Args: - sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor` with each - tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`). - attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length, - sequence_length)`. - hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(num_return_sequences*batch_size, generated_length, hidden_size)`. - """ - - sequences: torch.LongTensor = None - scores: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -@dataclass -class SampleEncoderDecoderOutput(ModelOutput): - """ - Base class for outputs of encoder-decoder generation models using sampling. Hidden states and attention weights of - the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states - attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) - - - Args: - sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) - at each generation step. 
`(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor of shape - `(batch_size*num_return_sequences, config.vocab_size)`). - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape - `(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`. - encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size*num_return_sequences, sequence_length, hidden_size)`. - decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_return_sequences, num_heads, generated_length, - sequence_length)`. - cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_return_sequences, generated_length, hidden_size)`. - """ - - sequences: torch.LongTensor = None - scores: Optional[Tuple[torch.FloatTensor]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -@dataclass -class BeamSearchDecoderOnlyOutput(ModelOutput): - """ - Base class for outputs of decoder-only generation models using beam search. - - Args: - sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Final beam scores of the generated `sequences`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting - of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. - `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor` with each tensor of shape - `(batch_size*num_beams*num_return_sequences, config.vocab_size)`). 
- beam_indices (`tuple(tuple(torch.LongTensor))`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Beam indices of generated token id at each generation step. `(batch_size*num_return_sequences)`-shaped - tuple of `(max_length-input_ids.shape[-1],)`-shaped tuples of scalar `torch.LongTensor` tensors. - attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. - hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`. - """ - - sequences: torch.LongTensor = None - sequences_scores: Optional[torch.FloatTensor] = None - scores: Optional[Tuple[torch.FloatTensor]] = None - beam_indices: Optional[Tuple[Tuple[torch.LongTensor]]] = None - attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -@dataclass -class BeamSearchEncoderDecoderOutput(ModelOutput): - """ - Base class for outputs of encoder-decoder generation models using beam search. Hidden states and attention weights - of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states - attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) - - Args: - sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Final beam scores of the generated `sequences`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting - of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. - `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor of shape `(batch_size*num_beams, - config.vocab_size)`). - beam_indices (`tuple(tuple(torch.LongTensor))`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Beam indices of generated token id at each generation step. `(batch_size*num_return_sequences)`-shaped - tuple of `(max_length-1,)`-shaped tuples of scalar `torch.LongTensor` tensors. 
- attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. - encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`. - decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length, - sequence_length)`. - cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`. - """ - - sequences: torch.LongTensor = None - sequences_scores: Optional[torch.FloatTensor] = None - scores: Optional[Tuple[torch.FloatTensor]] = None - beam_indices: Optional[Tuple[Tuple[torch.LongTensor]]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -@dataclass -class BeamSampleDecoderOnlyOutput(ModelOutput): - """ - Base class for outputs of decoder-only generation models using beam sample. - - Args: - sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - sequences_scores (`torch.FloatTensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Final beam scores of the generated `sequences`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting - of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. 
- `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor` with each tensor of shape - `(batch_size*num_beams*num_return_sequences, config.vocab_size)`). - beam_indices (`tuple(tuple(torch.LongTensor))`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Beam indices of generated token id at each generation step. `(batch_size*num_return_sequences)`-shaped - tuple of `(max_length-input_ids.shape[-1],)`-shaped tuples of scalar `torch.LongTensor` tensors. - attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. - hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`. - """ - - sequences: torch.LongTensor = None - sequences_scores: Optional[torch.FloatTensor] = None - scores: Optional[Tuple[torch.FloatTensor]] = None - beam_indices: Optional[Tuple[Tuple[torch.LongTensor]]] = None - attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -@dataclass -class BeamSampleEncoderDecoderOutput(ModelOutput): - """ - Base class for outputs of encoder-decoder generation models using beam sampling. Hidden states and attention - weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the - encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) - - Args: - sequences (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`): - The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter - if all batches finished early due to the `eos_token_id`. - sequences_scores (`torch.FloatTensor` of shape `(batch_size * num_return_sequence)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Final beam scores of the generated `sequences`. - scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting - of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam. - `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor of shape `(batch_size*num_beams, - config.vocab_size)`). - beam_indices (`tuple(tuple(torch.LongTensor))`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`): - Beam indices of generated token id at each generation step. `(batch_size*num_return_sequences)`-shaped - tuple of `(max_length-1,)`-shaped tuples of scalar `torch.LongTensor` tensors. 
- encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, - sequence_length, sequence_length)`. - encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of - shape `(batch_size*num_beams, sequence_length, hidden_size)`. - decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`. - cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`. - decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of - `torch.FloatTensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`. - """ - - sequences: torch.LongTensor = None - sequences_scores: Optional[torch.FloatTensor] = None - scores: Optional[Tuple[torch.FloatTensor]] = None - beam_indices: Optional[Tuple[Tuple[torch.LongTensor]]] = None - encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None - encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None - decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - cross_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - decoder_hidden_states: Optional[Tuple[Tuple[torch.FloatTensor]]] = None - - -GreedySearchOutput = Union[ - GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput -] -SampleOutput = Union[SampleEncoderDecoderOutput, SampleDecoderOnlyOutput] -BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput] -BeamSampleOutput = Union[BeamSampleEncoderDecoderOutput, BeamSampleDecoderOnlyOutput] - - -class GenerationMixinWithRawScores: - """ - A class containing all functions for auto-regressive text generation, to be used as a mixin in [`PreTrainedModel`]. - - The class exposes [`~generation_utils.GenerationMixin.generate`], which can be used for: - - *greedy decoding* by calling [`~generation_utils.GenerationMixin.greedy_search`] if `num_beams=1` and - `do_sample=False`. - - *multinomial sampling* by calling [`~generation_utils.GenerationMixin.sample`] if `num_beams=1` and - `do_sample=True`. - - *beam-search decoding* by calling [`~generation_utils.GenerationMixin.beam_search`] if `num_beams>1` and - `do_sample=False`. - - *beam-search multinomial sampling* by calling [`~generation_utils.GenerationMixin.beam_sample`] if - `num_beams>1` and `do_sample=True`. - - *diverse beam-search decoding* by calling [`~generation_utils.GenerationMixin.group_beam_search`], if - `num_beams>1` and `num_beam_groups>1`. 
- - *constrained beam-search decoding* by calling [`~generation_utils.GenerationMixin.constrained_beam_search`], - if `constraints!=None` or `force_words_ids!=None`. - """ - - def _prepare_model_inputs( - self, - inputs: Optional[torch.Tensor] = None, - bos_token_id: Optional[int] = None, - model_kwargs: Optional[Dict[str, torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, Optional[str], Dict[str, torch.Tensor]]: - """ - This function extracts the model-specific `inputs` for generation. - """ - # 1. retrieve all kwargs that are non-None or non-model input related. - # some encoder-decoder models have different names for model and encoder - if ( - self.config.is_encoder_decoder - and hasattr(self, "encoder") - and self.encoder.main_input_name != self.main_input_name - ): - input_name = self.encoder.main_input_name - else: - input_name = self.main_input_name - - model_kwargs = { - k: v for k, v in model_kwargs.items() if v is not None or k != input_name - } - - # 2. check whether model_input_name is passed as kwarg - # if yes and `inputs` is None use kwarg inputs - inputs_kwarg = model_kwargs.pop(input_name, None) - if inputs_kwarg is not None and inputs is not None: - raise ValueError( - f"`inputs`: {inputs}` were passed alongside " - f"{input_name} which is not allowed." - f"Make sure to either pass {inputs} or {input_name}=..." - ) - elif inputs_kwarg is not None: - inputs = inputs_kwarg - - # 3. models with `input_ids` can also make use of `inputs_embeds` - if self._can_retrieve_inputs_from_name(inputs, "inputs_embeds", model_kwargs): - inputs, input_name = model_kwargs["inputs_embeds"], "inputs_embeds" - - # 4. Only encoder-decoder models can have non `input_ids` input format - if not self.config.is_encoder_decoder and input_name != "input_ids": - raise ValueError( - f"If {input_name} is passed as model-specific keyword " - "input then model has to be an encoder-decoder and not a " - f"{self.__class__.__name__}." - ) - - # 5. if `inputs` is still None, try to create `input_ids` from BOS token - if inputs is None: - inputs = self._prepare_input_ids_for_generation( - bos_token_id, model_kwargs.get("encoder_outputs") - ) - - return inputs, input_name, model_kwargs - - def _can_retrieve_inputs_from_name( - self, - inputs: Optional[torch.Tensor], - name: str, - model_kwargs: Dict[str, torch.Tensor], - ) -> torch.Tensor: - """ - If `inputs` is None and `name` is in both forward function and keyword arguments, then inputs can be retrieved - from name - """ - can_retrieve_inputs = model_kwargs.get(name, None) is not None and name in set( - inspect.signature(self.forward).parameters.keys() - ) - - if can_retrieve_inputs and inputs is not None: - raise ValueError( - f"Cannot only pass one of {name} and {self.main_input_name}" - ) - - return can_retrieve_inputs - - def prepare_inputs_for_generation( - self, input_ids: torch.LongTensor, **kwargs - ) -> Dict[str, Any]: - """ - Implement in subclasses of [`PreTrainedModel`] for custom behavior to prepare inputs in the generate method. - """ - return {"input_ids": input_ids} - - def adjust_logits_during_generation( - self, logits: torch.FloatTensor, **kwargs - ) -> torch.FloatTensor: - """ - Implement in subclasses of [`PreTrainedModel`] for custom behavior to adjust the logits in the generate method. 
- """ - return logits - - def _prepare_input_ids_for_generation( - self, bos_token_id: Optional[int], encoder_outputs: Optional[ModelOutput] - ) -> torch.LongTensor: - if self.config.is_encoder_decoder and encoder_outputs is not None: - # make dummy input_ids with value -100, as a sanity check ensuring that they won't be used for encoding - shape = encoder_outputs.last_hidden_state.size()[:-1] - return torch.ones(shape, dtype=torch.long, device=self.device) * -100 - - if bos_token_id is None: - raise ValueError( - "`bos_token_id` has to be defined when no `input_ids` are provided." - ) - return torch.ones((1, 1), dtype=torch.long, device=self.device) * bos_token_id - - def _prepare_attention_mask_for_generation( - self, - inputs: torch.Tensor, - pad_token_id: int, - eos_token_id: int, - ) -> torch.LongTensor: - is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [ - torch.int, - torch.long, - ] - is_pad_token_in_inputs = (pad_token_id is not None) and (pad_token_id in inputs) - is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( - (eos_token_id is not None) and (pad_token_id != eos_token_id) - ) - # Check if input is input_ids and padded -> only then is attention_mask defined - if ( - is_input_ids - and is_pad_token_in_inputs - and is_pad_token_not_equal_to_eos_token_id - ): - return inputs.ne(pad_token_id).long() - else: - return torch.ones(inputs.shape[:2], dtype=torch.long, device=self.device) - - def _prepare_encoder_decoder_kwargs_for_generation( - self, - inputs_tensor: torch.Tensor, - model_kwargs, - model_input_name: Optional[str] = None, - ) -> Dict[str, Any]: - # 1. get encoder - encoder = self.get_encoder() - - # 2. prepare encoder args and encoder kwargs from model kwargs - irrelevant_prefix = ["decoder_", "cross_attn", "use_cache"] - encoder_kwargs = { - argument: value - for argument, value in model_kwargs.items() - if not any(argument.startswith(p) for p in irrelevant_prefix) - } - - # 3. 
make sure that encoder returns `ModelOutput` - model_input_name = ( - model_input_name if model_input_name is not None else self.main_input_name - ) - encoder_kwargs["return_dict"] = True - encoder_kwargs[model_input_name] = inputs_tensor - model_kwargs["encoder_outputs"]: ModelOutput = encoder(**encoder_kwargs) - - return model_kwargs - - def _prepare_decoder_input_ids_for_generation( - self, - batch_size: int, - decoder_start_token_id: int = None, - bos_token_id: int = None, - model_kwargs: Optional[Dict[str, torch.Tensor]] = None, - ) -> torch.LongTensor: - if model_kwargs is not None and "decoder_input_ids" in model_kwargs: - return model_kwargs.pop("decoder_input_ids") - else: - decoder_start_token_id = self._get_decoder_start_token_id( - decoder_start_token_id, bos_token_id - ) - return ( - torch.ones((batch_size, 1), dtype=torch.long, device=self.device) - * decoder_start_token_id - ) - - def _get_decoder_start_token_id( - self, decoder_start_token_id: int = None, bos_token_id: int = None - ) -> int: - decoder_start_token_id = ( - decoder_start_token_id - if decoder_start_token_id is not None - else self.config.decoder_start_token_id - ) - bos_token_id = ( - bos_token_id if bos_token_id is not None else self.config.bos_token_id - ) - - if decoder_start_token_id is not None: - return decoder_start_token_id - elif ( - hasattr(self.config, "decoder") - and hasattr(self.config.decoder, "decoder_start_token_id") - and self.config.decoder.decoder_start_token_id is not None - ): - return self.config.decoder.decoder_start_token_id - elif bos_token_id is not None: - return bos_token_id - elif ( - hasattr(self.config, "decoder") - and hasattr(self.config.decoder, "bos_token_id") - and self.config.decoder.bos_token_id is not None - ): - return self.config.decoder.bos_token_id - raise ValueError( - "`decoder_start_token_id` or `bos_token_id` has to be defined for" - " encoder-decoder generation." - ) - - @staticmethod - def _expand_inputs_for_generation( - input_ids: torch.LongTensor, - expand_size: int = 1, - is_encoder_decoder: bool = False, - attention_mask: Optional[torch.LongTensor] = None, - encoder_outputs: Optional[ModelOutput] = None, - **model_kwargs, - ) -> Tuple[torch.LongTensor, Dict[str, Any]]: - expanded_return_idx = ( - torch.arange(input_ids.shape[0]) - .view(-1, 1) - .repeat(1, expand_size) - .view(-1) - .to(input_ids.device) - ) - input_ids = input_ids.index_select(0, expanded_return_idx) - - if "token_type_ids" in model_kwargs: - token_type_ids = model_kwargs["token_type_ids"] - model_kwargs["token_type_ids"] = token_type_ids.index_select( - 0, expanded_return_idx - ) - - if attention_mask is not None: - model_kwargs["attention_mask"] = attention_mask.index_select( - 0, expanded_return_idx - ) - - if is_encoder_decoder: - if encoder_outputs is None: - raise ValueError( - "If `is_encoder_decoder` is True, make sure that `encoder_outputs`" - " is defined." 
- ) - encoder_outputs["last_hidden_state"] = ( - encoder_outputs.last_hidden_state.index_select( - 0, expanded_return_idx.to(encoder_outputs.last_hidden_state.device) - ) - ) - model_kwargs["encoder_outputs"] = encoder_outputs - return input_ids, model_kwargs - - @staticmethod - def _update_model_kwargs_for_generation( - outputs: ModelOutput, - model_kwargs: Dict[str, Any], - is_encoder_decoder: bool = False, - ) -> Dict[str, Any]: - # update past - if "past_key_values" in outputs: - model_kwargs["past"] = outputs.past_key_values - elif "mems" in outputs: - model_kwargs["past"] = outputs.mems - elif "past_buckets_states" in outputs: - model_kwargs["past"] = outputs.past_buckets_states - else: - model_kwargs["past"] = None - - # update token_type_ids with last value - if "token_type_ids" in model_kwargs: - token_type_ids = model_kwargs["token_type_ids"] - model_kwargs["token_type_ids"] = torch.cat( - [token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1 - ) - - # update attention mask - if not is_encoder_decoder: - if "attention_mask" in model_kwargs: - attention_mask = model_kwargs["attention_mask"] - model_kwargs["attention_mask"] = torch.cat( - [ - attention_mask, - attention_mask.new_ones((attention_mask.shape[0], 1)), - ], - dim=-1, - ) - - return model_kwargs - - def _reorder_cache(self, past, beam_idx): - raise NotImplementedError( - "Make sure that a `_reorder_cache` function is correctly implemented in" - f" {self.__class__.__module__} to enable beam search for {self.__class__}" - ) - - def _get_logits_warper( - self, - top_k: Optional[int] = None, - top_p: Optional[float] = None, - typical_p: Optional[float] = None, - temperature: Optional[float] = None, - num_beams: Optional[int] = None, - ) -> LogitsProcessorList: - """ - This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsWarper`] instances - used for multinomial sampling. 
- """ - - # init warp parameters - top_k = top_k if top_k is not None else self.config.top_k - top_p = top_p if top_p is not None else self.config.top_p - typical_p = typical_p if typical_p is not None else self.config.typical_p - temperature = ( - temperature if temperature is not None else self.config.temperature - ) - # instantiate warpers list - warpers = LogitsProcessorList() - - # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files - # all samplers can be found in `generation_utils_samplers.py` - if temperature is not None and temperature != 1.0: - warpers.append(TemperatureLogitsWarper(temperature)) - if top_k is not None and top_k != 0: - warpers.append( - TopKLogitsWarper( - top_k=top_k, min_tokens_to_keep=(2 if num_beams > 1 else 1) - ) - ) - if top_p is not None and top_p < 1.0: - warpers.append( - TopPLogitsWarper( - top_p=top_p, min_tokens_to_keep=(2 if num_beams > 1 else 1) - ) - ) - if typical_p is not None and typical_p < 1.0: - warpers.append( - TypicalLogitsWarper( - mass=typical_p, min_tokens_to_keep=(2 if num_beams > 1 else 1) - ) - ) - return warpers - - def _get_logits_processor( - self, - repetition_penalty: float, - no_repeat_ngram_size: int, - encoder_no_repeat_ngram_size: int, - input_ids_seq_length: int, - encoder_input_ids: torch.LongTensor, - bad_words_ids: List[List[int]], - min_length: int, - max_length: int, - eos_token_id: int, - forced_bos_token_id: int, - forced_eos_token_id: int, - prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], - num_beams: int, - num_beam_groups: int, - diversity_penalty: float, - remove_invalid_values: bool, - exponential_decay_length_penalty: Tuple, - logits_processor: Optional[LogitsProcessorList], - ) -> LogitsProcessorList: - """ - This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`] - instances used to modify the scores of the language model head. 
- """ - processors = LogitsProcessorList() - - # init warp parameters - repetition_penalty = ( - repetition_penalty - if repetition_penalty is not None - else self.config.repetition_penalty - ) - no_repeat_ngram_size = ( - no_repeat_ngram_size - if no_repeat_ngram_size is not None - else self.config.no_repeat_ngram_size - ) - encoder_no_repeat_ngram_size = ( - encoder_no_repeat_ngram_size - if encoder_no_repeat_ngram_size is not None - else self.config.encoder_no_repeat_ngram_size - ) - bad_words_ids = ( - bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids - ) - min_length = min_length if min_length is not None else self.config.min_length - eos_token_id = ( - eos_token_id if eos_token_id is not None else self.config.eos_token_id - ) - diversity_penalty = ( - diversity_penalty - if diversity_penalty is not None - else self.config.diversity_penalty - ) - forced_bos_token_id = ( - forced_bos_token_id - if forced_bos_token_id is not None - else self.config.forced_bos_token_id - ) - forced_eos_token_id = ( - forced_eos_token_id - if forced_eos_token_id is not None - else self.config.forced_eos_token_id - ) - remove_invalid_values = ( - remove_invalid_values - if remove_invalid_values is not None - else self.config.remove_invalid_values - ) - exponential_decay_length_penalty = ( - exponential_decay_length_penalty - if exponential_decay_length_penalty is not None - else self.config.exponential_decay_length_penalty - ) - # instantiate processors list - - # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files - # all samplers can be found in `generation_utils_samplers.py` - if diversity_penalty is not None and diversity_penalty > 0.0: - processors.append( - HammingDiversityLogitsProcessor( - diversity_penalty=diversity_penalty, - num_beams=num_beams, - num_beam_groups=num_beam_groups, - ) - ) - if repetition_penalty is not None and repetition_penalty != 1.0: - processors.append( - RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty) - ) - if no_repeat_ngram_size is not None and no_repeat_ngram_size > 0: - processors.append(NoRepeatNGramLogitsProcessor(no_repeat_ngram_size)) - if ( - encoder_no_repeat_ngram_size is not None - and encoder_no_repeat_ngram_size > 0 - ): - if self.config.is_encoder_decoder: - processors.append( - EncoderNoRepeatNGramLogitsProcessor( - encoder_no_repeat_ngram_size, encoder_input_ids - ) - ) - else: - raise ValueError( - "It's impossible to use `encoder_no_repeat_ngram_size` with" - " decoder-only architecture" - ) - if bad_words_ids is not None: - processors.append(NoBadWordsLogitsProcessor(bad_words_ids, eos_token_id)) - if min_length is not None and eos_token_id is not None and min_length > 0: - processors.append(MinLengthLogitsProcessor(min_length, eos_token_id)) - if prefix_allowed_tokens_fn is not None: - processors.append( - PrefixConstrainedLogitsProcessor( - prefix_allowed_tokens_fn, num_beams // num_beam_groups - ) - ) - if forced_bos_token_id is not None: - processors.append(ForcedBOSTokenLogitsProcessor(forced_bos_token_id)) - if forced_eos_token_id is not None: - processors.append( - ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id) - ) - if remove_invalid_values is True: - processors.append(InfNanRemoveLogitsProcessor()) - if exponential_decay_length_penalty is not None: - processors.append( - ExponentialDecayLengthPenalty( - exponential_decay_length_penalty, eos_token_id, input_ids_seq_length - ) - ) - processors = 
self._merge_criteria_processor_list(processors, logits_processor) - return processors - - def _get_stopping_criteria( - self, - max_length: Optional[int], - max_time: Optional[float], - stopping_criteria: Optional[StoppingCriteriaList], - ) -> StoppingCriteriaList: - criteria = StoppingCriteriaList() - if max_length is not None: - criteria.append(MaxLengthCriteria(max_length=max_length)) - if max_time is not None: - criteria.append(MaxTimeCriteria(max_time=max_time)) - criteria = self._merge_criteria_processor_list(criteria, stopping_criteria) - return criteria - - def _merge_criteria_processor_list( - self, - default_list: Union[LogitsProcessorList, StoppingCriteriaList], - custom_list: Union[LogitsProcessorList, StoppingCriteriaList], - ) -> Union[LogitsProcessorList, StoppingCriteriaList]: - if len(custom_list) == 0: - return default_list - for default in default_list: - for custom in custom_list: - if type(custom) is type(default): - object_type = ( - "stopping criteria" - if isinstance(custom, StoppingCriteria) - else "logits processor" - ) - raise ValueError( - f"A custom {object_type} of type {type(custom)} with values" - f" {custom} has been passed to `generate`, but it has already" - f" been created with the values {default}. {default} has been" - " created by passing the corresponding arguments to generate" - " or by the model's config default values. If you just want to" - f" change the default values of {object_type} consider passing" - " them as arguments to `generate` instead of using a custom" - f" {object_type}." - ) - default_list.extend(custom_list) - return default_list - - def compute_beam_search_raw_logits( - self, - sequences: torch.Tensor, - scores: Tuple[torch.Tensor], - beam_indices: torch.Tensor, - eos_token_id: int = None, - ): - """Compute raw logits for beam search""" - - if not self.config.is_encoder_decoder: - raise NotImplementedError( - "Beam Search raw logits code is implemented only for enoder-decoder" - " only models" - ) - - # since sequences can be shorter than scores (probably due to beam search finalization) - # we always have to generate raw_logits only for generated sequences - # cut off the start tokens from generated - sequences = sequences.clone() - sequences = sequences[:, 1:] - gen_steps = sequences.shape[1] - - # align scores and beam indices according to gen_steps - # scores(gen_steps x(batch_size * num_beams) x vocab_size) - scores = scores[:gen_steps] - scores = torch.stack(scores) - _, _, vocab_size = scores.shape - - beam_indices = torch.tensor(beam_indices).T.to(scores.device) - beam_indices = beam_indices[:gen_steps, :] - batch_size = beam_indices.shape[1] - - # gen_steps x batch_size x vocab_size - beam_indices = beam_indices.unsqueeze(-1).repeat(1, 1, vocab_size) - step_wise_logits = scores.gather(dim=1, index=beam_indices) - assert step_wise_logits.shape == torch.Size((gen_steps, batch_size, vocab_size)) - - # finally convert to tuples - step_wise_logits = [(step_wise_logits[t], None) for t in range(gen_steps)] - return step_wise_logits - - @torch.no_grad() - def generate( - self, - inputs: Optional[torch.Tensor] = None, - max_length: Optional[int] = None, - min_length: Optional[int] = None, - do_sample: Optional[bool] = None, - early_stopping: Optional[bool] = None, - num_beams: Optional[int] = None, - temperature: Optional[float] = None, - top_k: Optional[int] = None, - top_p: Optional[float] = None, - typical_p: Optional[float] = None, - repetition_penalty: Optional[float] = None, - bad_words_ids: Optional[Iterable[int]] = None, 
- force_words_ids: Optional[Union[Iterable[int], Iterable[Iterable[int]]]] = None, - bos_token_id: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[int] = None, - length_penalty: Optional[float] = None, - no_repeat_ngram_size: Optional[int] = None, - encoder_no_repeat_ngram_size: Optional[int] = None, - num_return_sequences: Optional[int] = None, - max_time: Optional[float] = None, - max_new_tokens: Optional[int] = None, - decoder_start_token_id: Optional[int] = None, - use_cache: Optional[bool] = None, - num_beam_groups: Optional[int] = None, - diversity_penalty: Optional[float] = None, - prefix_allowed_tokens_fn: Optional[ - Callable[[int, torch.Tensor], List[int]] - ] = None, - logits_processor: Optional[LogitsProcessorList] = LogitsProcessorList(), - stopping_criteria: Optional[StoppingCriteriaList] = StoppingCriteriaList(), - constraints: Optional[List[Constraint]] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - forced_bos_token_id: Optional[int] = None, - forced_eos_token_id: Optional[int] = None, - remove_invalid_values: Optional[bool] = None, - synced_gpus: Optional[bool] = False, - exponential_decay_length_penalty: Optional[Tuple[Union[int, float]]] = None, - **model_kwargs, - ) -> Union[ - GreedySearchOutput, - SampleOutput, - BeamSearchOutput, - BeamSampleOutput, - torch.LongTensor, - ]: - r""" - - Generates sequences of token ids for models with a language modeling head. The method supports the following - generation methods for text-decoder, text-to-text, speech-to-text, and vision-to-text models: - - - *greedy decoding* by calling [`~generation_utils.GenerationMixin.greedy_search`] if `num_beams=1` and - `do_sample=False`. - - *multinomial sampling* by calling [`~generation_utils.GenerationMixin.sample`] if `num_beams=1` and - `do_sample=True`. - - *beam-search decoding* by calling [`~generation_utils.GenerationMixin.beam_search`] if `num_beams>1` and - `do_sample=False`. - - *beam-search multinomial sampling* by calling [`~generation_utils.GenerationMixin.beam_sample`] if - `num_beams>1` and `do_sample=True`. - - *diverse beam-search decoding* by calling [`~generation_utils.GenerationMixin.group_beam_search`], if - `num_beams>1` and `num_beam_groups>1`. - - *constrained beam-search decoding* by calling - [`~generation_utils.GenerationMixin.constrained_beam_search`], if `constraints!=None` or - `force_words_ids!=None`. - - - - Apart from `inputs`, all the arguments below will default to the value of the attribute of the same name as - defined in the model's config (`config.json`) which in turn defaults to the - [`~modeling_utils.PretrainedConfig`] of the model. - - - - Most of these parameters are explained in more detail in [this blog - post](https://huggingface.co/blog/how-to-generate). - - Parameters: - inputs (`torch.Tensor` of varying shape depending on the modality, *optional*): - The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the - method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs` - should of in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of - `input_ids`, `input_values`, `input_features`, or `pixel_values`. - max_length (`int`, *optional*, defaults to `model.config.max_length`): - The maximum length of the sequence to be generated. 
- max_new_tokens (`int`, *optional*, defaults to None): - The maximum numbers of tokens to generate, ignore the current number of tokens. Use either - `max_new_tokens` or `max_length` but not both, they serve the same purpose. - min_length (`int`, *optional*, defaults to 10): - The minimum length of the sequence to be generated. - do_sample (`bool`, *optional*, defaults to `False`): - Whether or not to use sampling ; use greedy decoding otherwise. - early_stopping (`bool`, *optional*, defaults to `False`): - Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not. - num_beams (`int`, *optional*, defaults to 1): - Number of beams for beam search. 1 means no beam search. - temperature (`float`, *optional*, defaults to 1.0): - The value used to module the next token probabilities. - top_k (`int`, *optional*, defaults to 50): - The number of highest probability vocabulary tokens to keep for top-k-filtering. - top_p (`float`, *optional*, defaults to 1.0): - If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or higher - are kept for generation. - repetition_penalty (`float`, *optional*, defaults to 1.0): - The parameter for repetition penalty. 1.0 means no penalty. See [this - paper](https://arxiv.org/pdf/1909.05858.pdf) for more details. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - bos_token_id (`int`, *optional*): - The id of the *beginning-of-sequence* token. - eos_token_id (`int`, *optional*): - The id of the *end-of-sequence* token. - length_penalty (`float`, *optional*, defaults to 1.0): - Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the - model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer - sequences. - no_repeat_ngram_size (`int`, *optional*, defaults to 0): - If set to int > 0, all ngrams of that size can only occur once. - encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0): - If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the - `decoder_input_ids`. - bad_words_ids(`List[List[int]]`, *optional*): - List of token ids that are not allowed to be generated. In order to get the token ids of the words that - should not appear in the generated text, use `tokenizer(bad_words, add_prefix_space=True, - add_special_tokens=False).input_ids`. - force_words_ids(`List[List[int]]` or `List[List[List[int]]]`, *optional*): - List of token ids that must be generated. If given a `List[List[int]]`, this is treated as a simple - list of words that must be included, the opposite to `bad_words_ids`. If given `List[List[List[int]]]`, - this triggers a [disjunctive constraint](https://github.com/huggingface/transformers/issues/14081), - where one can allow different forms of each word. - num_return_sequences(`int`, *optional*, defaults to 1): - The number of independently computed returned sequences for each element in the batch. - max_time(`float`, *optional*, defaults to None): - The maximum amount of time you allow the computation to run for in seconds. generation will still - finish the current pass after allocated time has been passed. - attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, 1 for tokens - that are not masked, and 0 for masked tokens. 
If not provided, will default to a tensor the same shape - as `input_ids` that masks the pad token. [What are attention masks?](../glossary#attention-mask) - decoder_start_token_id (`int`, *optional*): - If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token. - use_cache: (`bool`, *optional*, defaults to `True`): - Whether or not the model should use the past last key/values attentions (if applicable to the model) to - speed up decoding. - num_beam_groups (`int`, *optional*, defaults to 1): - Number of groups to divide `num_beams` into in order to ensure diversity among different groups of - beams. [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details. - diversity_penalty (`float`, *optional*, defaults to 0.0): - This value is subtracted from a beam's score if it generates a token same as any beam from other group - at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is - enabled. - prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*): - If provided, this function constraints the beam search to allowed tokens only at each step. If not - provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and - `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned - on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful - for constrained generation conditioned on the prefix, as described in [Autoregressive Entity - Retrieval](https://arxiv.org/abs/2010.00904). - logits_processor (`LogitsProcessorList`, *optional*): - Custom logits processors that complement the default logits processors built from arguments and a - model's config. If a logit processor is passed that is already created with the arguments or a model's - config an error is thrown. This feature is intended for advanced users. - stopping_criteria (`StoppingCriteriaList`, *optional*): - Custom stopping criteria that complement the default stopping criteria built from arguments and a - model's config. If a stopping criteria is passed that is already created with the arguments or a - model's config an error is thrown. This feature is intended for advanced users. - constraints (`List[Constraint]`, *optional*): - Custom constraints that can be added to the generation to ensure that the output will contain the use - of certain tokens as defined by `Constraint` objects, in the most sensible way possible. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - forced_bos_token_id (`int`, *optional*): - The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful - for multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be - the target language token. 
- forced_eos_token_id (`int`, *optional*): - The id of the token to force as the last generated token when `max_length` is reached. - remove_invalid_values (`bool`, *optional*): - Whether to remove possible *nan* and *inf* outputs of the model to prevent the generation method to - crash. Note that using `remove_invalid_values` can slow down generation. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - exponential_decay_length_penalty (`tuple(int, float)`, *optional*): - This Tuple adds an exponentially increasing length penalty, after a certain amount of tokens have been - generated. The tuple shall consist of: `(start_index, decay_factor)` where `start_index` indicates - where penalty starts and `decay_factor` represents the factor of exponential decay - - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If the model - is an encoder-decoder model, encoder specific kwargs should not be prefixed and decoder specific kwargs - should be prefixed with *decoder_*. - - Return: - [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True` - or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. - - If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible - [`~utils.ModelOutput`] types are: - - - [`~generation_utils.GreedySearchDecoderOnlyOutput`], - - [`~generation_utils.SampleDecoderOnlyOutput`], - - [`~generation_utils.BeamSearchDecoderOnlyOutput`], - - [`~generation_utils.BeamSampleDecoderOnlyOutput`] - - If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible - [`~utils.ModelOutput`] types are: - - - [`~generation_utils.GreedySearchEncoderDecoderOutput`], - - [`~generation_utils.SampleEncoderDecoderOutput`], - - [`~generation_utils.BeamSearchEncoderDecoderOutput`], - - [`~generation_utils.BeamSampleEncoderDecoderOutput`] - - Examples: - - Greedy Decoding: - - ```python - >>> from transformers import AutoTokenizer, AutoModelForCausalLM - - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = AutoModelForCausalLM.from_pretrained("gpt2") - - >>> prompt = "Today I believe we can finally" - >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids - - >>> # generate up to 30 tokens - >>> outputs = model.generate(input_ids, do_sample=False, max_length=30) - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Today I believe we can finally get to the point where we can make a difference in the lives of the people of the United States of America.\n'] - ``` - - Multinomial Sampling: - - ```python - >>> from transformers import AutoTokenizer, AutoModelForCausalLM - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = AutoModelForCausalLM.from_pretrained("gpt2") - - >>> prompt = "Today I believe we can finally" - >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids - - >>> # sample up to 30 tokens - >>> torch.manual_seed(0) # doctest: +IGNORE_RESULT - >>> outputs = model.generate(input_ids, do_sample=True, max_length=30) - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Today I believe we can finally get rid of discrimination," said Rep. 
Mark Pocan (D-Wis.).\n\n"Just look at the'] - ``` - - Beam-search decoding: - - ```python - >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM - - >>> tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-de") - - >>> sentence = "Paris is one of the densest populated areas in Europe." - >>> input_ids = tokenizer(sentence, return_tensors="pt").input_ids - - >>> outputs = model.generate(input_ids) - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Paris ist eines der dichtesten besiedelten Gebiete Europas.'] - ```""" - # 1. Set generation parameters if not already defined - bos_token_id = ( - bos_token_id if bos_token_id is not None else self.config.bos_token_id - ) - num_beams = num_beams if num_beams is not None else self.config.num_beams - length_penalty = ( - length_penalty if length_penalty is not None else self.config.length_penalty - ) - early_stopping = ( - early_stopping if early_stopping is not None else self.config.early_stopping - ) - num_beam_groups = ( - num_beam_groups - if num_beam_groups is not None - else self.config.num_beam_groups - ) - do_sample = do_sample if do_sample is not None else self.config.do_sample - num_return_sequences = ( - num_return_sequences - if num_return_sequences is not None - else self.config.num_return_sequences - ) - - pad_token_id = ( - pad_token_id if pad_token_id is not None else self.config.pad_token_id - ) - eos_token_id = ( - eos_token_id if eos_token_id is not None else self.config.eos_token_id - ) - - if eos_token_id is None and hasattr(self.config, "decoder"): - eos_token_id = self.config.decoder.eos_token_id - - if pad_token_id is None and eos_token_id is not None: - # special case if pad_token_id is not defined - # logger.warning( - # f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.") - - pad_token_id = eos_token_id - - output_scores = ( - output_scores if output_scores is not None else self.config.output_scores - ) - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.config.return_dict_in_generate - ) - - # 2. Define model inputs - # inputs_tensor has to be defined - # model_input_name is defined if model-specific keyword input is passed - # otherwise model_input_name is None - # all model-specific keyword inputs are removed from `model_kwargs` - inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs( - inputs, bos_token_id, model_kwargs - ) - batch_size = inputs_tensor.shape[0] - - # 3. 
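Every argument in the resolution block that follows the examples is handled the same way: an explicitly passed value wins, otherwise the model config supplies the default. The helper below merely names that pattern and is not part of the code:

```python
def resolve(value, config_value):
    # explicit argument wins, otherwise fall back to the model config
    return value if value is not None else config_value

num_beams = resolve(None, 4)       # nothing passed, so the config's 4 is used
do_sample = resolve(True, False)   # the caller's True overrides the config's False
assert (num_beams, do_sample) == (4, True)
```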
Define other model kwargs - model_kwargs["output_attentions"] = output_attentions - model_kwargs["output_hidden_states"] = output_hidden_states - model_kwargs["use_cache"] = use_cache - - accepts_attention_mask = "attention_mask" in set( - inspect.signature(self.forward).parameters.keys() - ) - requires_attention_mask = "encoder_outputs" not in model_kwargs - - if ( - model_kwargs.get("attention_mask", None) is None - and requires_attention_mask - and accepts_attention_mask - ): - model_kwargs["attention_mask"] = ( - self._prepare_attention_mask_for_generation( - inputs_tensor, pad_token_id, eos_token_id - ) - ) - - if self.config.is_encoder_decoder and "encoder_outputs" not in model_kwargs: - # if model is encoder decoder encoder_outputs are created - # and added to `model_kwargs` - model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation( - inputs_tensor, model_kwargs, model_input_name - ) - - # 4. Prepare `input_ids` which will be used for auto-regressive generation - if self.config.is_encoder_decoder: - input_ids = self._prepare_decoder_input_ids_for_generation( - batch_size, - decoder_start_token_id=decoder_start_token_id, - bos_token_id=bos_token_id, - model_kwargs=model_kwargs, - ) - else: - # if decoder-only then inputs_tensor has to be `input_ids` - input_ids = inputs_tensor - - input_ids_seq_length = input_ids.shape[-1] - - # 5. Prepare `max_length` depending on other stopping criteria - # if `max_new_tokens` is passed, but not `max_length` -> set `max_length = max_new_tokens` - if max_length is None and max_new_tokens is not None: - max_length = max_new_tokens + input_ids_seq_length - elif max_length is not None and max_new_tokens is not None: - # Both are set, this is odd, raise a warning - warnings.warn( - ( - "Both `max_length` and `max_new_tokens` have been set " - f"but they serve the same purpose. `max_length` {max_length} " - f"will take priority over `max_new_tokens` {max_new_tokens}." - ), - UserWarning, - ) - # default to config if still None - max_length = max_length if max_length is not None else self.config.max_length - - if input_ids_seq_length >= max_length: - input_ids_string = ( - "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" - ) - logger.warning( - f"Input length of {input_ids_string} is {input_ids_seq_length}, but" - f" ``max_length`` is set to {max_length}. This can lead to unexpected" - " behavior. You should consider increasing ``config.max_length`` or" - " ``max_length``." - ) - - # 6. determine generation mode - is_constraint_gen_mode = constraints is not None or force_words_ids is not None - is_greedy_gen_mode = ( - (num_beams == 1) - and (num_beam_groups == 1) - and do_sample is False - and not is_constraint_gen_mode - ) - is_sample_gen_mode = ( - (num_beams == 1) - and (num_beam_groups == 1) - and do_sample is True - and not is_constraint_gen_mode - ) - is_beam_gen_mode = ( - (num_beams > 1) - and (num_beam_groups == 1) - and do_sample is False - and not is_constraint_gen_mode - ) - is_beam_sample_gen_mode = ( - (num_beams > 1) - and (num_beam_groups == 1) - and do_sample is True - and not is_constraint_gen_mode - ) - is_group_beam_gen_mode = ( - (num_beams > 1) and (num_beam_groups > 1) and not is_constraint_gen_mode - ) - - if num_beam_groups > num_beams: - raise ValueError( - "`num_beam_groups` has to be smaller or equal to `num_beams`" - ) - if is_group_beam_gen_mode and do_sample is True: - raise ValueError( - "Diverse beam search cannot be used in sampling mode. 
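The mode flags computed above reduce to a small decision table over `num_beams`, `num_beam_groups`, `do_sample` and the presence of constraints. A paraphrase as a standalone function (a sketch only; invalid combinations such as `num_beam_groups > num_beams` or sampling with group beam search are rejected by the checks above before any mode runs):

```python
def pick_mode(num_beams: int, num_beam_groups: int, do_sample: bool, constrained: bool) -> str:
    if constrained:                  # `constraints` or `force_words_ids` was given
        return "constrained_beam_search"
    if num_beam_groups > 1:          # diverse (group) beam search
        return "group_beam_search"
    if num_beams == 1:
        return "sample" if do_sample else "greedy_search"
    return "beam_sample" if do_sample else "beam_search"

assert pick_mode(1, 1, False, False) == "greedy_search"
assert pick_mode(4, 1, True, False) == "beam_sample"
assert pick_mode(4, 2, False, False) == "group_beam_search"
```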
Make sure that" - " `do_sample` is set to `False`." - ) - - # 7. prepare distribution pre_processing samplers - logits_processor = self._get_logits_processor( - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size, - input_ids_seq_length=input_ids_seq_length, - encoder_input_ids=inputs_tensor, - bad_words_ids=bad_words_ids, - min_length=min_length, - max_length=max_length, - eos_token_id=eos_token_id, - forced_bos_token_id=forced_bos_token_id, - forced_eos_token_id=forced_eos_token_id, - prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, - num_beams=num_beams, - num_beam_groups=num_beam_groups, - diversity_penalty=diversity_penalty, - remove_invalid_values=remove_invalid_values, - exponential_decay_length_penalty=exponential_decay_length_penalty, - logits_processor=logits_processor, - ) - - # 8. prepare stopping criteria - stopping_criteria = self._get_stopping_criteria( - max_length=max_length, - max_time=max_time, - stopping_criteria=stopping_criteria, - ) - - # 9. go into different generation modes - if is_greedy_gen_mode: - if num_return_sequences > 1: - raise ValueError( - "num_return_sequences has to be 1, but is" - f" {num_return_sequences} when doing greedy search." - ) - - # 10. run greedy search - return self.greedy_search( - input_ids, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - synced_gpus=synced_gpus, - **model_kwargs, - ) - - elif is_sample_gen_mode: - # 10. prepare logits warper - logits_warper = self._get_logits_warper( - top_k=top_k, - top_p=top_p, - typical_p=typical_p, - temperature=temperature, - num_beams=num_beams, - ) - - # 11. expand input_ids with `num_return_sequences` additional sequences per batch - input_ids, model_kwargs = self._expand_inputs_for_generation( - input_ids, - expand_size=num_return_sequences, - is_encoder_decoder=self.config.is_encoder_decoder, - **model_kwargs, - ) - - # 12. run sample - return self.sample( - input_ids, - logits_processor=logits_processor, - logits_warper=logits_warper, - stopping_criteria=stopping_criteria, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - synced_gpus=synced_gpus, - **model_kwargs, - ) - - elif is_beam_gen_mode: - if num_return_sequences > num_beams: - raise ValueError( - "`num_return_sequences` has to be smaller or equal to `num_beams`." - ) - - if stopping_criteria.max_length is None: - raise ValueError( - "`max_length` needs to be a stopping_criteria for now." - ) - - # 10. prepare beam search scorer - beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=num_beams, - device=self.device, - length_penalty=length_penalty, - do_early_stopping=early_stopping, - num_beam_hyps_to_keep=num_return_sequences, - ) - # 11. interleave input_ids with `num_beams` additional sequences per batch - input_ids, model_kwargs = self._expand_inputs_for_generation( - input_ids, - expand_size=num_beams, - is_encoder_decoder=self.config.is_encoder_decoder, - **model_kwargs, - ) - # 12. 
run beam search - return self.beam_search( - input_ids, - beam_scorer, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - synced_gpus=synced_gpus, - **model_kwargs, - ) - - elif is_beam_sample_gen_mode: - # 10. prepare logits warper - logits_warper = self._get_logits_warper( - top_k=top_k, - top_p=top_p, - typical_p=typical_p, - temperature=temperature, - num_beams=num_beams, - ) - - if stopping_criteria.max_length is None: - raise ValueError( - "`max_length` needs to be a stopping_criteria for now." - ) - # 11. prepare beam search scorer - beam_scorer = BeamSearchScorer( - batch_size=batch_size * num_return_sequences, - num_beams=num_beams, - device=self.device, - length_penalty=length_penalty, - do_early_stopping=early_stopping, - ) - - # 12. interleave input_ids with `num_beams` additional sequences per batch - input_ids, model_kwargs = self._expand_inputs_for_generation( - input_ids, - expand_size=num_beams * num_return_sequences, - is_encoder_decoder=self.config.is_encoder_decoder, - **model_kwargs, - ) - - # 13. run beam sample - return self.beam_sample( - input_ids, - beam_scorer, - logits_processor=logits_processor, - logits_warper=logits_warper, - stopping_criteria=stopping_criteria, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - synced_gpus=synced_gpus, - **model_kwargs, - ) - - elif is_group_beam_gen_mode: - if num_return_sequences > num_beams: - raise ValueError( - "`num_return_sequences` has to be smaller or equal to `num_beams`." - ) - - if num_beams % num_beam_groups != 0: - raise ValueError( - "`num_beams` should be divisible by `num_beam_groups` for group" - " beam search." - ) - - if stopping_criteria.max_length is None: - raise ValueError( - "`max_length` needs to be a stopping_criteria for now." - ) - - # 10. prepare beam search scorer - beam_scorer = BeamSearchScorer( - batch_size=batch_size, - num_beams=num_beams, - max_length=stopping_criteria.max_length, - device=self.device, - length_penalty=length_penalty, - do_early_stopping=early_stopping, - num_beam_hyps_to_keep=num_return_sequences, - num_beam_groups=num_beam_groups, - ) - # 11. interleave input_ids with `num_beams` additional sequences per batch - input_ids, model_kwargs = self._expand_inputs_for_generation( - input_ids, - expand_size=num_beams, - is_encoder_decoder=self.config.is_encoder_decoder, - **model_kwargs, - ) - # 12. run beam search - return self.group_beam_search( - input_ids, - beam_scorer, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - synced_gpus=synced_gpus, - **model_kwargs, - ) - - elif is_constraint_gen_mode: - if num_return_sequences > num_beams: - raise ValueError( - "`num_return_sequences` has to be smaller or equal to `num_beams`." - ) - - if stopping_criteria.max_length is None: - raise ValueError( - "`max_length` needs to be a stopping_criteria for now." - ) - - if num_beams <= 1: - raise ValueError( - "`num_beams` needs to be greater than 1 for constrained" - " genertation." - ) - - if do_sample: - raise ValueError( - "`do_sample` needs to be false for constrained generation." 
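Before beam search or beam sampling runs, `_expand_inputs_for_generation` is used to duplicate each batch row `num_beams` (or `num_beams * num_return_sequences`) times along the batch dimension. A sketch of the effect with made-up ids, not the helper itself:

```python
import torch

def expand_for_beams(input_ids: torch.Tensor, expand_size: int) -> torch.Tensor:
    # (batch, seq_len) -> (batch * expand_size, seq_len), each row repeated consecutively
    return input_ids.repeat_interleave(expand_size, dim=0)

ids = torch.tensor([[1, 2], [3, 4]])
print(expand_for_beams(ids, 3))
# tensor([[1, 2], [1, 2], [1, 2], [3, 4], [3, 4], [3, 4]])
```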
- ) - - if num_beam_groups is not None and num_beam_groups > 1: - raise ValueError( - "`num_beam_groups` not supported yet for constrained generation." - ) - - final_constraints = [] - if constraints is not None: - final_constraints = constraints - - if force_words_ids is not None: - - def typeerror(): - raise ValueError( - "`force_words_ids` has to either be a `List[List[List[int]]]`" - " or `List[List[int]]`of positive integers, but is" - f" {force_words_ids}." - ) - - if not isinstance(force_words_ids, list) or len(force_words_ids) == 0: - typeerror() - - for word_ids in force_words_ids: - if isinstance(word_ids[0], list): - if not isinstance(word_ids, list) or len(word_ids) == 0: - typeerror() - if any( - not isinstance(token_ids, list) for token_ids in word_ids - ): - typeerror() - if any( - any( - (not isinstance(token_id, int) or token_id < 0) - for token_id in token_ids - ) - for token_ids in word_ids - ): - typeerror() - - constraint = DisjunctiveConstraint(word_ids) - else: - if not isinstance(word_ids, list) or len(word_ids) == 0: - typeerror() - if any( - (not isinstance(token_id, int) or token_id < 0) - for token_id in word_ids - ): - typeerror() - - constraint = PhrasalConstraint(word_ids) - final_constraints.append(constraint) - - # 10. prepare beam search scorer - constrained_beam_scorer = ConstrainedBeamSearchScorer( - constraints=final_constraints, - batch_size=batch_size, - num_beams=num_beams, - device=self.device, - length_penalty=length_penalty, - do_early_stopping=early_stopping, - num_beam_hyps_to_keep=num_return_sequences, - ) - # 11. interleave input_ids with `num_beams` additional sequences per batch - input_ids, model_kwargs = self._expand_inputs_for_generation( - input_ids, - expand_size=num_beams, - is_encoder_decoder=self.config.is_encoder_decoder, - **model_kwargs, - ) - # 12. run beam search - return self.constrained_beam_search( - input_ids, - constrained_beam_scorer=constrained_beam_scorer, - logits_processor=logits_processor, - stopping_criteria=stopping_criteria, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - synced_gpus=synced_gpus, - **model_kwargs, - ) - - def greedy_search( - self, - input_ids: torch.LongTensor, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[int] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, - **model_kwargs, - ) -> Union[GreedySearchOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **greedy decoding** and can be - used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - Parameters: - - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. 
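The validation above accepts `force_words_ids` in two shapes and chooses the constraint type from the nesting depth. A small sketch with made-up token ids:

```python
force_words_simple = [[317, 828], [262]]          # List[List[int]]: each inner list is one required phrase
force_words_disjunctive = [[[317, 828], [317]]]   # List[List[List[int]]]: any one listed form satisfies the constraint

def is_disjunctive(word_ids) -> bool:
    # mirrors the shape check above: a doubly nested entry becomes a DisjunctiveConstraint,
    # a flat entry a PhrasalConstraint
    return isinstance(word_ids[0], list)

assert not is_disjunctive(force_words_simple[0])
assert is_disjunctive(force_words_disjunctive[0])
```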
List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`int`, *optional*): - The id of the *end-of-sequence* token. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - model_kwargs: - Additional model specific keyword arguments will be forwarded to the `forward` function of the model. - If model is an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`~generation_utils.GreedySearchDecoderOnlyOutput`], [`~generation_utils.GreedySearchEncoderDecoderOutput`] - or `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation_utils.GreedySearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation_utils.GreedySearchEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForCausalLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... StoppingCriteriaList, - ... MaxLengthCriteria, - ... ) - - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = AutoModelForCausalLM.from_pretrained("gpt2") - - >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token - >>> model.config.pad_token_id = model.config.eos_token_id - - >>> input_prompt = "It might be possible to" - >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(10, eos_token_id=model.config.eos_token_id), - ... ] - ... ) - >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) - - >>> outputs = model.greedy_search( - ... input_ids, logits_processor=logits_processor, stopping_criteria=stopping_criteria - ... 
) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ["It might be possible to get a better understanding of the nature of the problem, but it's not"] - ```""" - # init values - logits_processor = ( - logits_processor if logits_processor is not None else LogitsProcessorList() - ) - stopping_criteria = ( - stopping_criteria - if stopping_criteria is not None - else StoppingCriteriaList() - ) - if max_length is not None: - warnings.warn( - ( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])`" - " instead." - ), - UserWarning, - ) - stopping_criteria = validate_stopping_criteria( - stopping_criteria, max_length - ) - pad_token_id = ( - pad_token_id if pad_token_id is not None else self.config.pad_token_id - ) - eos_token_id = ( - eos_token_id if eos_token_id is not None else self.config.eos_token_id - ) - output_scores = ( - output_scores if output_scores is not None else self.config.output_scores - ) - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.config.return_dict_in_generate - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - cross_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - decoder_hidden_states = ( - () if (return_dict_in_generate and output_hidden_states) else None - ) - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = ( - model_kwargs["encoder_outputs"].get("attentions") - if output_attentions - else None - ) - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") - if output_hidden_states - else None - ) - - # keep track of which sequences are already finished - unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) - cur_len = input_ids.shape[-1] - - this_peer_finished = False # used by synced_gpus only - while True: - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor( - 0.0 if this_peer_finished else 1.0 - ).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? 
the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - # prepare model inputs - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - - next_token_logits = outputs.logits[:, -1, :] - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_logits,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) - if self.config.is_encoder_decoder - else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # pre-process distribution - next_tokens_scores = logits_processor( - input_ids, next_token_logits, model_inputs - ) - - # argmax - next_tokens = torch.argmax(next_tokens_scores, dim=-1) - - # finished sentences should have their next token be a padding token - if eos_token_id is not None: - if pad_token_id is None: - raise ValueError( - "If `eos_token_id` is defined, make sure that `pad_token_id` is" - " defined." - ) - next_tokens = next_tokens * unfinished_sequences + pad_token_id * ( - 1 - unfinished_sequences - ) - - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - cur_len = cur_len + 1 - - # if eos_token was found in one sentence, set sentence to finished - if eos_token_id is not None: - unfinished_sequences = unfinished_sequences.mul( - (next_tokens != eos_token_id).long() - ) - - # stop when each sentence is finished, or if we exceed the maximum length - if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): - if not synced_gpus: - break - else: - this_peer_finished = True - - if return_dict_in_generate: - if self.config.is_encoder_decoder: - return GreedySearchEncoderDecoderOutput( - sequences=input_ids, - scores=scores, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return GreedySearchDecoderOnlyOutput( - sequences=input_ids, - scores=scores, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - ) - else: - return input_ids - - def sample( - self, - input_ids: torch.LongTensor, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_warper: Optional[LogitsProcessorList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[int] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, - **model_kwargs, - ) -> Union[SampleOutput, torch.LongTensor]: - r""" - Generates 
sequences of token ids for models with a language modeling head using **multinomial sampling** and - can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - Parameters: - - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - logits_warper (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used - to warp the prediction score distribution of the language modeling head applied before multinomial - sampling at each generation step. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`int`, *optional*): - The id of the *end-of-sequence* token. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`~generation_utils.SampleDecoderOnlyOutput`], [`~generation_utils.SampleEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation_utils.SampleDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation_utils.SampleEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForCausalLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... TopKLogitsWarper, - ... TemperatureLogitsWarper, - ... StoppingCriteriaList, - ... MaxLengthCriteria, - ... 
) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") - >>> model = AutoModelForCausalLM.from_pretrained("gpt2") - - >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token - >>> model.config.pad_token_id = model.config.eos_token_id - - >>> input_prompt = "Today is a beautiful day, and" - >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id), - ... ] - ... ) - >>> # instantiate logits processors - >>> logits_warper = LogitsProcessorList( - ... [ - ... TopKLogitsWarper(50), - ... TemperatureLogitsWarper(0.7), - ... ] - ... ) - - >>> stopping_criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=20)]) - - >>> torch.manual_seed(0) # doctest: +IGNORE_RESULT - >>> outputs = model.sample( - ... input_ids, - ... logits_processor=logits_processor, - ... logits_warper=logits_warper, - ... stopping_criteria=stopping_criteria, - ... ) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Today is a beautiful day, and a wonderful day.\n\nI was lucky enough to meet the'] - ```""" - - # init values - logits_processor = ( - logits_processor if logits_processor is not None else LogitsProcessorList() - ) - stopping_criteria = ( - stopping_criteria - if stopping_criteria is not None - else StoppingCriteriaList() - ) - if max_length is not None: - warnings.warn( - ( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))`" - " instead." - ), - UserWarning, - ) - stopping_criteria = validate_stopping_criteria( - stopping_criteria, max_length - ) - logits_warper = ( - logits_warper if logits_warper is not None else LogitsProcessorList() - ) - pad_token_id = ( - pad_token_id if pad_token_id is not None else self.config.pad_token_id - ) - eos_token_id = ( - eos_token_id if eos_token_id is not None else self.config.eos_token_id - ) - output_scores = ( - output_scores if output_scores is not None else self.config.output_scores - ) - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.config.return_dict_in_generate - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - cross_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - decoder_hidden_states = ( - () if (return_dict_in_generate and output_hidden_states) else None - ) - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = ( - model_kwargs["encoder_outputs"].get("attentions") - if output_attentions - else None - ) - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") - if output_hidden_states - else None - ) - - # keep track of which sequences are already finished - unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) - cur_len = input_ids.shape[-1] - - 
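The sampling loop that follows boils down to a softmax over the processed and warped scores, then one draw per sequence with `torch.multinomial`. A tiny self-contained illustration with a fake three-token vocabulary:

```python
import torch

torch.manual_seed(0)
next_token_scores = torch.tensor([[2.0, 0.5, -1.0]])        # processed + warped logits, fake 3-token vocab
probs = torch.softmax(next_token_scores, dim=-1)             # scores to probabilities
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
print(next_tokens.shape)  # torch.Size([1]): one sampled token id per sequence in the batch
```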
this_peer_finished = False # used by synced_gpus only - # auto-regressive generation - while True: - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor( - 0.0 if this_peer_finished else 1.0 - ).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - # prepare model inputs - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - # forward pass to get next token - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - - next_token_logits_raw = outputs.logits[:, -1, :].clone() - next_token_logits = outputs.logits[:, -1, :] - - # pre-process distribution - next_token_scores = logits_processor( - input_ids, next_token_logits, model_inputs=model_inputs - ) - next_token_scores = logits_warper(input_ids, next_token_scores) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += ((next_token_logits_raw, next_token_scores),) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) - if self.config.is_encoder_decoder - else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # sample - probs = nn.functional.softmax(next_token_scores, dim=-1) - next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) - - # finished sentences should have their next token be a padding token - if eos_token_id is not None: - if pad_token_id is None: - raise ValueError( - "If `eos_token_id` is defined, make sure that `pad_token_id` is" - " defined." 
- ) - next_tokens = next_tokens * unfinished_sequences + pad_token_id * ( - 1 - unfinished_sequences - ) - - # update generated ids, model inputs, and length for next step - input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - cur_len = cur_len + 1 - - # if eos_token was found in one sentence, set sentence to finished - if eos_token_id is not None: - unfinished_sequences = unfinished_sequences.mul( - (next_tokens != eos_token_id).long() - ) - - # stop when each sentence is finished, or if we exceed the maximum length - if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): - if not synced_gpus: - break - else: - this_peer_finished = True - - if return_dict_in_generate: - if self.config.is_encoder_decoder: - return SampleEncoderDecoderOutput( - sequences=input_ids, - scores=scores, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return SampleDecoderOnlyOutput( - sequences=input_ids, - scores=scores, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - ) - else: - return input_ids - - def beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[int] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, - **model_kwargs, - ) -> Union[BeamSearchOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **beam search decoding** and - can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - Parameters: - - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`int`, *optional*): - The id of the *end-of-sequence* token. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. 
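The bookkeeping shown above keeps a per-sequence `unfinished_sequences` flag so that rows which already produced `eos_token_id` keep emitting `pad_token_id` while the rest of the batch continues. The same arithmetic on toy tensors:

```python
import torch

pad_token_id, eos_token_id = 0, 2
unfinished = torch.tensor([1, 0])      # second sequence already emitted EOS
sampled = torch.tensor([5, 7])         # freshly chosen token ids
next_tokens = sampled * unfinished + pad_token_id * (1 - unfinished)
print(next_tokens)                     # tensor([5, 0]): the finished row keeps producing pad
unfinished = unfinished * (next_tokens != eos_token_id).long()
```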
See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`generation_utilsBeamSearchDecoderOnlyOutput`], [`~generation_utils.BeamSearchEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation_utils.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation_utils.BeamSearchEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... BeamSearchScorer, - ... ) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" - >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - - >>> # lets run beam search using 3 beams - >>> num_beams = 3 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> # instantiate beam scorer - >>> beam_scorer = BeamSearchScorer( - ... batch_size=1, - ... num_beams=num_beams, - ... device=model.device, - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ... ] - ... ) - - >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt bist du?'] - ```""" - # init values - logits_processor = ( - logits_processor if logits_processor is not None else LogitsProcessorList() - ) - stopping_criteria = ( - stopping_criteria - if stopping_criteria is not None - else StoppingCriteriaList() - ) - if max_length is not None: - warnings.warn( - ( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))`" - " instead." 
- ), - UserWarning, - ) - stopping_criteria = validate_stopping_criteria( - stopping_criteria, max_length - ) - if len(stopping_criteria) == 0: - warnings.warn( - ( - "You don't have defined any stopping_criteria, this will likely" - " loop forever" - ), - UserWarning, - ) - pad_token_id = ( - pad_token_id if pad_token_id is not None else self.config.pad_token_id - ) - eos_token_id = ( - eos_token_id if eos_token_id is not None else self.config.eos_token_id - ) - output_scores = ( - output_scores if output_scores is not None else self.config.output_scores - ) - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.config.return_dict_in_generate - ) - - batch_size = len(beam_scorer._beam_hyps) - num_beams = beam_scorer.num_beams - - batch_beam_size, cur_len = input_ids.shape - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}," - f" but is {batch_beam_size}." - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - beam_indices = ( - tuple(() for _ in range(batch_beam_size)) - if (return_dict_in_generate and output_scores) - else None - ) - decoder_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - cross_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - decoder_hidden_states = ( - () if (return_dict_in_generate and output_hidden_states) else None - ) - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = ( - model_kwargs["encoder_outputs"].get("attentions") - if output_attentions - else None - ) - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") - if output_hidden_states - else None - ) - - beam_scores = torch.zeros( - (batch_size, num_beams), dtype=torch.float, device=input_ids.device - ) - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False # used by synced_gpus only - while True: - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor( - 0.0 if this_peer_finished else 1.0 - ).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - - next_token_logits = outputs.logits[:, -1, :] - next_token_logits_raw = next_token_logits.clone() - - # hack: adjust tokens for Marian. 
For Marian we have to make sure that the `pad_token_id` - # cannot be generated both before and after the `nn.functional.log_softmax` operation. - next_token_logits = self.adjust_logits_during_generation( - next_token_logits, cur_len=cur_len - ) - next_token_scores = nn.functional.log_softmax( - next_token_logits, dim=-1 - ) # (batch_size * num_beams, vocab_size) - - next_token_scores_processed = logits_processor( - input_ids, next_token_scores, model_inputs=model_inputs - ) - next_token_scores = next_token_scores_processed + beam_scores[ - :, None - ].expand_as(next_token_scores) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (next_token_logits_raw,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) - if self.config.is_encoder_decoder - else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # reshape for beam search - vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view( - batch_size, num_beams * vocab_size - ) - - next_token_scores, next_tokens = torch.topk( - next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True - ) - - next_indices = torch_int_div(next_tokens, vocab_size) - next_tokens = next_tokens % vocab_size - - # stateless - beam_outputs = beam_scorer.process( - input_ids, - next_token_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - ) - - beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - input_ids = torch.cat( - [input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1 - ) - - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - if model_kwargs["past"] is not None: - model_kwargs["past"] = self._reorder_cache( - model_kwargs["past"], beam_idx - ) - - if return_dict_in_generate and output_scores: - beam_indices = tuple( - ( - beam_indices[beam_idx[i]] + (beam_idx[i],) - for i in range(len(beam_indices)) - ) - ) - - # increase cur_len - cur_len = cur_len + 1 - - if beam_scorer.is_done or stopping_criteria(input_ids, scores): - if not synced_gpus: - break - else: - this_peer_finished = True - - sequence_outputs = beam_scorer.finalize( - input_ids, - beam_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - ) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - else: - num_return_sequences = beam_scorer.num_beam_hyps_to_keep - # return only as many indices as sequences - beam_indices = tuple( - ( - beam_indices[ - i * num_beams : i * num_beams + num_return_sequences - ] - for i in range(batch_size) - ) - ) - beam_indices = sum(beam_indices, ()) - - step_wise_raw_logits = self.compute_beam_search_raw_logits( - sequence_outputs["sequences"].clone(), - scores, - beam_indices, - eos_token_id, - ) - - if self.config.is_encoder_decoder: - return BeamSearchEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=step_wise_raw_logits, # raw logits - 
beam_indices=beam_indices, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return BeamSearchDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - beam_indices=beam_indices, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - ) - else: - return sequence_outputs["sequences"] - - def beam_sample( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_warper: Optional[LogitsProcessorList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[int] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, - **model_kwargs, - ) -> Union[BeamSampleOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **beam search multinomial - sampling** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - Parameters: - - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - logits_warper (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used - to warp the prediction score distribution of the language modeling head applied before multinomial - sampling at each generation step. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`int`, *optional*): - The id of the *end-of-sequence* token. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. 
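Both `beam_search` above and `beam_sample` below select candidates by flattening the per-beam scores into one row per batch element, taking the best `2 * num_beams` entries, and recovering the beam index and token id by integer division and modulo. A sketch with tiny made-up sizes, using `torch.div(..., rounding_mode="floor")` in place of the `torch_int_div` helper:

```python
import torch

batch_size, num_beams, vocab_size = 1, 2, 5
next_token_scores = torch.randn(batch_size * num_beams, vocab_size)
flat = next_token_scores.view(batch_size, num_beams * vocab_size)       # one candidate row per batch element
topk_scores, topk_ids = torch.topk(flat, 2 * num_beams, dim=1, largest=True, sorted=True)
next_indices = torch.div(topk_ids, vocab_size, rounding_mode="floor")   # which beam each candidate came from
next_tokens = topk_ids % vocab_size                                     # which vocabulary token it is
print(next_indices.shape, next_tokens.shape)  # torch.Size([1, 4]) torch.Size([1, 4])
```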
- return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`~generation_utils.BeamSampleDecoderOnlyOutput`], [`~generation_utils.BeamSampleEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation_utils.BeamSampleDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation_utils.BeamSampleEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... TopKLogitsWarper, - ... TemperatureLogitsWarper, - ... BeamSearchScorer, - ... ) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" - >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - >>> # lets run beam search using 3 beams - >>> num_beams = 3 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> # instantiate beam scorer - >>> beam_scorer = BeamSearchScorer( - ... batch_size=1, - ... max_length=model.config.max_length, - ... num_beams=num_beams, - ... device=model.device, - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)] - ... ) - >>> # instantiate logits processors - >>> logits_warper = LogitsProcessorList( - ... [ - ... TopKLogitsWarper(50), - ... TemperatureLogitsWarper(0.7), - ... ] - ... ) - - >>> outputs = model.beam_sample( - ... input_ids, beam_scorer, logits_processor=logits_processor, logits_warper=logits_warper, **model_kwargs - ... ) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt bist du?'] - ```""" - # init values - logits_processor = ( - logits_processor if logits_processor is not None else LogitsProcessorList() - ) - stopping_criteria = ( - stopping_criteria - if stopping_criteria is not None - else StoppingCriteriaList() - ) - if max_length is not None: - warnings.warn( - ( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))`" - " instead." 
- ), - UserWarning, - ) - stopping_criteria = validate_stopping_criteria( - stopping_criteria, max_length - ) - pad_token_id = ( - pad_token_id if pad_token_id is not None else self.config.pad_token_id - ) - eos_token_id = ( - eos_token_id if eos_token_id is not None else self.config.eos_token_id - ) - output_scores = ( - output_scores if output_scores is not None else self.config.output_scores - ) - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.config.return_dict_in_generate - ) - - batch_size = len(beam_scorer._beam_hyps) - num_beams = beam_scorer.num_beams - - batch_beam_size, cur_len = input_ids.shape - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - beam_indices = ( - tuple(() for _ in range(batch_beam_size)) - if (return_dict_in_generate and output_scores) - else None - ) - decoder_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - cross_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - decoder_hidden_states = ( - () if (return_dict_in_generate and output_hidden_states) else None - ) - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = ( - model_kwargs["encoder_outputs"].get("attentions") - if output_attentions - else None - ) - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") - if output_hidden_states - else None - ) - - beam_scores = torch.zeros( - (batch_size, num_beams), dtype=torch.float, device=input_ids.device - ) - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False # used by synced_gpus only - while True: - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor( - 0.0 if this_peer_finished else 1.0 - ).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - - next_token_logits_raw = outputs.logits[:, -1, :] - - # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` - # cannot be generated both before and after the `nn.functional.log_softmax` operation. 
- next_token_logits = self.adjust_logits_during_generation( - next_token_logits_raw, cur_len=cur_len - ) - next_token_scores = nn.functional.log_softmax( - next_token_logits, dim=-1 - ) # (batch_size * num_beams, vocab_size) - - next_token_scores_processed = logits_processor( - input_ids, next_token_logits, model_inputs=model_inputs - ) - next_token_scores = next_token_scores_processed + beam_scores[ - :, None - ].expand_as(next_token_scores) - next_token_scores = logits_warper(input_ids, next_token_scores) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - # return raw scores instead of post-processed - scores += ((next_token_logits_raw, next_token_scores),) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) - if self.config.is_encoder_decoder - else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # reshape for beam search - vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view( - batch_size, num_beams * vocab_size - ) - - probs = nn.functional.softmax(next_token_scores, dim=-1) - - next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) - next_token_scores = torch.gather(next_token_scores, -1, next_tokens) - - next_token_scores, _indices = torch.sort( - next_token_scores, descending=True, dim=1 - ) - next_tokens = torch.gather(next_tokens, -1, _indices) - - next_indices = torch_int_div(next_tokens, vocab_size) - next_tokens = next_tokens % vocab_size - - # stateless - beam_outputs = beam_scorer.process( - input_ids, - next_token_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - ) - beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - input_ids = torch.cat( - [input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1 - ) - - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - if model_kwargs["past"] is not None: - model_kwargs["past"] = self._reorder_cache( - model_kwargs["past"], beam_idx - ) - - if return_dict_in_generate and output_scores: - beam_indices = tuple( - ( - beam_indices[beam_idx[i]] + (beam_idx[i],) - for i in range(len(beam_indices)) - ) - ) - - # increase cur_len - cur_len = cur_len + 1 - - if beam_scorer.is_done or stopping_criteria(input_ids, scores): - if not synced_gpus: - break - else: - this_peer_finished = True - - sequence_outputs = beam_scorer.finalize( - input_ids, - beam_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - ) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - else: - num_return_sequences = beam_scorer.num_beam_hyps_to_keep - # return only as many indices as sequences - beam_indices = tuple( - ( - beam_indices[ - i * num_beams : i * num_beams + num_return_sequences - ] - for i in range(batch_size) - ) - ) - beam_indices = sum(beam_indices, ()) - - if self.config.is_encoder_decoder: - return BeamSampleEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - 
sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - beam_indices=beam_indices, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return BeamSampleDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - beam_indices=beam_indices, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - ) - else: - return sequence_outputs["sequences"] - - def group_beam_search( - self, - input_ids: torch.LongTensor, - beam_scorer: BeamScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[int] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = False, - **model_kwargs, - ): - r""" - Generates sequences of token ids for models with a language modeling head using **diverse beam search - decoding** and can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - Parameters: - - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - beam_scorer (`BeamScorer`): - An derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation. For more information, the documentation of [`BeamScorer`] should be read. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`int`, *optional*): - The id of the *end-of-sequence* token. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
- synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - - model_kwargs: - Additional model specific kwargs that will be forwarded to the `forward` function of the model. If - model is an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`~generation_utils.BeamSearchDecoderOnlyOutput`], [`~generation_utils.BeamSearchEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation_utils.BeamSearchDecoderOnlyOutput`] if [`~generation_utils.BeamSearchDecoderOnlyOutput`] if - `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a - [`~generation_utils.BeamSearchEncoderDecoderOutput`] if `model.config.is_encoder_decoder=True`. - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... HammingDiversityLogitsProcessor, - ... BeamSearchScorer, - ... ) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" - >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - - >>> # lets run diverse beam search using 6 beams - >>> num_beams = 6 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> # instantiate beam scorer - >>> beam_scorer = BeamSearchScorer( - ... batch_size=1, - ... max_length=model.config.max_length, - ... num_beams=num_beams, - ... device=model.device, - ... num_beam_groups=3, - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3), - ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ... ] - ... ) - - >>> outputs = model.group_beam_search( - ... input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs - ... ) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt bist du?'] - ```""" - # init values - logits_processor = ( - logits_processor if logits_processor is not None else LogitsProcessorList() - ) - stopping_criteria = ( - stopping_criteria - if stopping_criteria is not None - else StoppingCriteriaList() - ) - if max_length is not None: - warnings.warn( - ( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))`" - " instead." 
- ), - UserWarning, - ) - stopping_criteria = validate_stopping_criteria( - stopping_criteria, max_length - ) - pad_token_id = ( - pad_token_id if pad_token_id is not None else self.config.pad_token_id - ) - eos_token_id = ( - eos_token_id if eos_token_id is not None else self.config.eos_token_id - ) - output_scores = ( - output_scores if output_scores is not None else self.config.output_scores - ) - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.config.return_dict_in_generate - ) - - batch_size = len(beam_scorer._beam_hyps) - num_beams = beam_scorer.num_beams - num_beam_groups = beam_scorer.num_beam_groups - num_sub_beams = num_beams // num_beam_groups - device = input_ids.device - - batch_beam_size, cur_len = input_ids.shape - - if return_dict_in_generate and output_scores: - beam_indices = [ - tuple(() for _ in range(num_sub_beams * batch_size)) - for _ in range(num_beam_groups) - ] - else: - beam_indices = None - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}," - f" but is {batch_beam_size}." - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - cross_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - decoder_hidden_states = ( - () if (return_dict_in_generate and output_hidden_states) else None - ) - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = ( - model_kwargs["encoder_outputs"].get("attentions") - if output_attentions - else None - ) - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") - if output_hidden_states - else None - ) - - beam_scores = torch.full( - (batch_size, num_beams), -1e9, dtype=torch.float, device=device - ) - # initialise score of first beam of each group with 0 and the rest with 1e-9. This ensures that the beams in - # the same group don't produce same tokens everytime. - beam_scores[:, ::num_sub_beams] = 0 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False # used by synced_gpus only - while True: - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor( - 0.0 if this_peer_finished else 1.0 - ).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? 
the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - # predicted tokens in cur_len step - current_tokens = torch.zeros( - batch_size * num_beams, dtype=input_ids.dtype, device=device - ) - - # indices which will form the beams in the next time step - reordering_indices = torch.zeros( - batch_size * num_beams, dtype=torch.long, device=device - ) - - # do one decoder step on all beams of all sentences in batch - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - - if output_scores: - processed_score = torch.zeros_like(outputs.logits[:, -1, :]) - - for beam_group_idx in range(num_beam_groups): - group_start_idx = beam_group_idx * num_sub_beams - group_end_idx = min(group_start_idx + num_sub_beams, num_beams) - group_size = group_end_idx - group_start_idx - - # indices of beams of current group among all sentences in batch - batch_group_indices = [] - - for batch_idx in range(batch_size): - batch_group_indices.extend( - [ - batch_idx * num_beams + idx - for idx in range(group_start_idx, group_end_idx) - ] - ) - group_input_ids = input_ids[batch_group_indices] - - # select outputs of beams of current group only - next_token_logits_raw = outputs.logits[batch_group_indices, -1, :] - - # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` - # cannot be generated both before and after the `nn.functional.log_softmax` operation. - next_token_logits = self.adjust_logits_during_generation( - next_token_logits_raw, cur_len=cur_len - ) - next_token_scores = nn.functional.log_softmax( - next_token_logits, dim=-1 - ) # (batch_size * group_size, vocab_size) - vocab_size = next_token_scores.shape[-1] - - next_token_scores_processed = logits_processor( - group_input_ids, - next_token_scores, - current_tokens=current_tokens, - beam_group_idx=beam_group_idx, - model_inputs=model_inputs, - ) - next_token_scores = next_token_scores_processed + beam_scores[ - batch_group_indices - ].unsqueeze(-1) - next_token_scores = next_token_scores.expand_as( - next_token_scores_processed - ) - - if output_scores: - processed_score[batch_group_indices] = next_token_logits_raw - - # reshape for beam search - next_token_scores = next_token_scores.view( - batch_size, group_size * vocab_size - ) - - next_token_scores, next_tokens = torch.topk( - next_token_scores, 2 * group_size, dim=1, largest=True, sorted=True - ) - - next_indices = torch_int_div(next_tokens, vocab_size) - next_tokens = next_tokens % vocab_size - - # stateless - beam_outputs = beam_scorer.process( - group_input_ids, - next_token_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - ) - beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - if return_dict_in_generate and output_scores: - beam_indices[beam_group_idx] = tuple( - beam_indices[beam_group_idx][beam_idx[i]] + (beam_idx[i],) - for i in range(len(beam_indices[0])) - ) - - input_ids[batch_group_indices] = group_input_ids[beam_idx] - group_input_ids = torch.cat( - [group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], - dim=-1, - ) - 
current_tokens[batch_group_indices] = group_input_ids[:, -1] - - # (beam_idx // group_size) -> batch_idx - # (beam_idx % group_size) -> offset of idx inside the group - reordering_indices[batch_group_indices] = ( - num_beams * torch_int_div(beam_idx, group_size) - + group_start_idx - + (beam_idx % group_size) - ) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += (processed_score,) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) - if self.config.is_encoder_decoder - else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1) - - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - if model_kwargs["past"] is not None: - model_kwargs["past"] = self._reorder_cache( - model_kwargs["past"], reordering_indices - ) - - # increase cur_len - cur_len = cur_len + 1 - - if beam_scorer.is_done or stopping_criteria(input_ids, scores): - if not synced_gpus: - break - else: - this_peer_finished = True - - sequence_outputs = beam_scorer.finalize( - input_ids, - beam_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - ) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - else: - beam_indices = sum(beam_indices, ()) - num_return_sequences = beam_scorer.num_beam_hyps_to_keep - # return only as many indices as sequences - beam_indices = tuple( - ( - beam_indices[ - i * num_beams : i * num_beams + num_return_sequences - ] - for i in range(batch_size) - ) - ) - beam_indices = sum(beam_indices, ()) - - if self.config.is_encoder_decoder: - return BeamSearchEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - beam_indices=beam_indices, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return BeamSearchDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - ) - else: - return sequence_outputs["sequences"] - - def constrained_beam_search( - self, - input_ids: torch.LongTensor, - constrained_beam_scorer: ConstrainedBeamSearchScorer, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - max_length: Optional[int] = None, - pad_token_id: Optional[int] = None, - eos_token_id: Optional[int] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - output_scores: Optional[bool] = None, - return_dict_in_generate: Optional[bool] = None, - synced_gpus: Optional[bool] = None, - **model_kwargs, - ) -> Union[BeamSearchOutput, torch.LongTensor]: - r""" - Generates sequences of token ids for models with a language modeling head using **constrained beam search - decoding** and can be used for 
text-decoder, text-to-text, speech-to-text, and vision-to-text models. - - Parameters: - input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): - The sequence used as a prompt for the generation. - constrained_beam_scorer (`ConstrainedBeamSearchScorer`): - A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and - sorted during generation, while satisfying a list of positive constraints. For more information, the - documentation of [`ConstrainedBeamSearchScorer`] should be read. - logits_processor (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`] - used to modify the prediction scores of the language modeling head applied at each generation step. - stopping_criteria (`StoppingCriteriaList`, *optional*): - An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`] - used to tell if the generation loop should stop. - logits_warper (`LogitsProcessorList`, *optional*): - An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used - to warp the prediction score distribution of the language modeling head applied before multinomial - sampling at each generation step. - max_length (`int`, *optional*, defaults to 20): - **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of generated - tokens. The maximum length of the sequence to be generated. - pad_token_id (`int`, *optional*): - The id of the *padding* token. - eos_token_id (`int`, *optional*): - The id of the *end-of-sequence* token. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more details. - output_hidden_states (`bool`, *optional*, defaults to `False`): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more details. - output_scores (`bool`, *optional*, defaults to `False`): - Whether or not to return the prediction scores. See `scores` under returned tensors for more details. - return_dict_in_generate (`bool`, *optional*, defaults to `False`): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. - synced_gpus (`bool`, *optional*, defaults to `False`): - Whether to continue running the while loop until max_length (needed for ZeRO stage 3) - model_kwargs: - Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is - an encoder-decoder model the kwargs should include `encoder_outputs`. - - Return: - [`generation_utilsBeamSearchDecoderOnlyOutput`], [`~generation_utils.BeamSearchEncoderDecoderOutput`] or - `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a - [`~generation_utils.BeamSearchDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and - `return_dict_in_generate=True` or a [`~generation_utils.BeamSearchEncoderDecoderOutput`] if - `model.config.is_encoder_decoder=True`. - - - Examples: - - ```python - >>> from transformers import ( - ... AutoTokenizer, - ... AutoModelForSeq2SeqLM, - ... LogitsProcessorList, - ... MinLengthLogitsProcessor, - ... ConstrainedBeamSearchScorer, - ... PhrasalConstraint, - ... 
) - >>> import torch - - >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") - >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - - >>> encoder_input_str = "translate English to German: How old are you?" - >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids - - - >>> # lets run beam search using 3 beams - >>> num_beams = 3 - >>> # define decoder start token ids - >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) - >>> input_ids = input_ids * model.config.decoder_start_token_id - - >>> # add encoder_outputs to model keyword arguments - >>> model_kwargs = { - ... "encoder_outputs": model.get_encoder()( - ... encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True - ... ) - ... } - - >>> constraint_str = "Sie" - >>> constraint_token_ids = tokenizer.encode(constraint_str)[:-1] # slice to remove eos token - >>> constraints = [PhrasalConstraint(token_ids=constraint_token_ids)] - - - >>> # instantiate beam scorer - >>> beam_scorer = ConstrainedBeamSearchScorer( - ... batch_size=1, num_beams=num_beams, device=model.device, constraints=constraints - ... ) - - >>> # instantiate logits processors - >>> logits_processor = LogitsProcessorList( - ... [ - ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), - ... ] - ... ) - - >>> outputs = model.constrained_beam_search( - ... input_ids, beam_scorer, constraints=constraints, logits_processor=logits_processor, **model_kwargs - ... ) - - >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) - ['Wie alt sind Sie?'] - ```""" - # init values - logits_processor = ( - logits_processor if logits_processor is not None else LogitsProcessorList() - ) - stopping_criteria = ( - stopping_criteria - if stopping_criteria is not None - else StoppingCriteriaList() - ) - if max_length is not None: - warnings.warn( - ( - "`max_length` is deprecated in this function, use" - " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))`" - " instead." 
- ), - UserWarning, - ) - stopping_criteria = validate_stopping_criteria( - stopping_criteria, max_length - ) - if len(stopping_criteria) == 0: - warnings.warn( - ( - "You don't have defined any stopping_criteria, this will likely" - " loop forever" - ), - UserWarning, - ) - pad_token_id = ( - pad_token_id if pad_token_id is not None else self.config.pad_token_id - ) - eos_token_id = ( - eos_token_id if eos_token_id is not None else self.config.eos_token_id - ) - output_scores = ( - output_scores if output_scores is not None else self.config.output_scores - ) - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - return_dict_in_generate = ( - return_dict_in_generate - if return_dict_in_generate is not None - else self.config.return_dict_in_generate - ) - - # init attention / hidden states / scores tuples - scores = () if (return_dict_in_generate and output_scores) else None - decoder_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - cross_attentions = ( - () if (return_dict_in_generate and output_attentions) else None - ) - decoder_hidden_states = ( - () if (return_dict_in_generate and output_hidden_states) else None - ) - - # if model is an encoder-decoder, retrieve encoder attention weights and hidden states - if return_dict_in_generate and self.config.is_encoder_decoder: - encoder_attentions = ( - model_kwargs["encoder_outputs"].get("attentions") - if output_attentions - else None - ) - encoder_hidden_states = ( - model_kwargs["encoder_outputs"].get("hidden_states") - if output_hidden_states - else None - ) - - batch_size = len(constrained_beam_scorer._beam_hyps) - num_beams = constrained_beam_scorer.num_beams - - batch_beam_size, cur_len = input_ids.shape - - if num_beams * batch_size != batch_beam_size: - raise ValueError( - f"Batch dimension of `input_ids` should be {num_beams * batch_size}," - f" but is {batch_beam_size}." - ) - - beam_scores = torch.zeros( - (batch_size, num_beams), dtype=torch.float, device=input_ids.device - ) - beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view((batch_size * num_beams,)) - - this_peer_finished = False # used by synced_gpus only - while True: - if synced_gpus: - # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. - # The following logic allows an early break if all peers finished generating their sequence - this_peer_finished_flag = torch.tensor( - 0.0 if this_peer_finished else 1.0 - ).to(input_ids.device) - # send 0.0 if we finished, 1.0 otherwise - dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) - # did all peers finish? the reduced sum will be 0.0 then - if this_peer_finished_flag.item() == 0.0: - break - - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) - - outputs = self( - **model_inputs, - return_dict=True, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - ) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need - - next_token_logits_raw = outputs.logits[:, -1, :] - # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` - # cannot be generated both before and after the `nn.functional.log_softmax` operation. 
- next_token_logits = self.adjust_logits_during_generation( - next_token_logits_raw, cur_len=cur_len - ) - next_token_scores = nn.functional.log_softmax( - next_token_logits, dim=-1 - ) # (batch_size * num_beams, vocab_size) - - next_token_scores_processed = logits_processor( - input_ids, next_token_scores, model_inputs=model_inputs - ) - - scores_for_all_vocab = next_token_scores_processed.clone() - - next_token_scores = next_token_scores_processed + beam_scores[ - :, None - ].expand_as(next_token_scores) - - # Store scores, attentions and hidden_states when required - if return_dict_in_generate: - if output_scores: - scores += ((next_token_logits_raw, next_token_scores),) - if output_attentions: - decoder_attentions += ( - (outputs.decoder_attentions,) - if self.config.is_encoder_decoder - else (outputs.attentions,) - ) - if self.config.is_encoder_decoder: - cross_attentions += (outputs.cross_attentions,) - - if output_hidden_states: - decoder_hidden_states += ( - (outputs.decoder_hidden_states,) - if self.config.is_encoder_decoder - else (outputs.hidden_states,) - ) - - # reshape for beam search - vocab_size = next_token_scores.shape[-1] - next_token_scores = next_token_scores.view( - batch_size, num_beams * vocab_size - ) - - next_token_scores, next_tokens = torch.topk( - next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True - ) - - next_indices = (next_tokens / vocab_size).long() - next_tokens = next_tokens % vocab_size - - # stateless - beam_outputs = constrained_beam_scorer.process( - input_ids, - next_token_scores, - next_tokens, - next_indices, - scores_for_all_vocab, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - ) - beam_scores = beam_outputs["next_beam_scores"] - beam_next_tokens = beam_outputs["next_beam_tokens"] - beam_idx = beam_outputs["next_beam_indices"] - - input_ids = torch.cat( - [input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1 - ) - model_kwargs = self._update_model_kwargs_for_generation( - outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder - ) - if model_kwargs["past"] is not None: - model_kwargs["past"] = self._reorder_cache( - model_kwargs["past"], beam_idx - ) - - # increase cur_len - cur_len = cur_len + 1 - - if constrained_beam_scorer.is_done or stopping_criteria(input_ids, scores): - if not synced_gpus: - break - else: - this_peer_finished = True - - sequence_outputs = constrained_beam_scorer.finalize( - input_ids, - beam_scores, - next_tokens, - next_indices, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - max_length=stopping_criteria.max_length, - ) - - if return_dict_in_generate: - if not output_scores: - sequence_outputs["sequence_scores"] = None - if self.config.is_encoder_decoder: - return BeamSearchEncoderDecoderOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - encoder_attentions=encoder_attentions, - encoder_hidden_states=encoder_hidden_states, - decoder_attentions=decoder_attentions, - cross_attentions=cross_attentions, - decoder_hidden_states=decoder_hidden_states, - ) - else: - return BeamSearchDecoderOnlyOutput( - sequences=sequence_outputs["sequences"], - sequences_scores=sequence_outputs["sequence_scores"], - scores=scores, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, - ) - else: - return sequence_outputs["sequences"] - - -def top_k_top_p_filtering( - logits: torch.FloatTensor, - top_k: int = 0, - top_p: float = 1.0, - filter_value: float = -float("Inf"), - 
min_tokens_to_keep: int = 1, -) -> torch.FloatTensor: - """ - Filter a distribution of logits using top-k and/or nucleus (top-p) filtering - - Args: - logits: logits distribution shape (batch size, vocabulary size) - top_k (`int`, *optional*, defaults to 0): - If > 0, only keep the top k tokens with highest probability (top-k filtering) - top_p (`float`, *optional*, defaults to 1.0): - If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus - filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) - min_tokens_to_keep (`int`, *optional*, defaults to 1): - Minimumber of tokens we keep per batch example in the output. - - From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - """ - if top_k > 0: - logits = TopKLogitsWarper( - top_k=top_k, - filter_value=filter_value, - min_tokens_to_keep=min_tokens_to_keep, - )(None, logits) - - if 0 <= top_p <= 1.0: - logits = TopPLogitsWarper(top_p=top_p, min_tokens_to_keep=min_tokens_to_keep)( - None, logits - ) - - return logits - - -def override_generation_routines(cls): - bases = list(cls.__bases__) - for base_ix in range(len(bases)): - if bases[base_ix] == GenerationMixin: - bases[base_ix] = GenerationMixinWithRawScores - - # recursively look up - if bases[base_ix] != object: - bases[base_ix] = override_generation_routines(bases[base_ix]) - - cls.__bases__ = tuple(bases) - return cls - - -def unwrap_generation_routines(cls): - bases = list(cls.__bases__) - for base_ix in range(len(bases)): - if bases[base_ix] == GenerationMixinWithRawScores: - bases[base_ix] = GenerationMixin - - # recursively look up - if bases[base_ix] != object: - bases[base_ix] = unwrap_generation_routines(bases[base_ix]) - - cls.__bases__ = tuple(bases) - return cls diff --git a/openrl/modules/networks/value_network.py b/openrl/modules/networks/value_network.py index 187eb465..bce574c5 100644 --- a/openrl/modules/networks/value_network.py +++ b/openrl/modules/networks/value_network.py @@ -49,6 +49,7 @@ def __init__( self._use_recurrent_policy = cfg.use_recurrent_policy self._use_influence_policy = cfg.use_influence_policy self._use_popart = cfg.use_popart + self._use_fp16 = cfg.use_fp16 and cfg.use_deepspeed self._influence_layer_N = cfg.influence_layer_N self._recurrent_N = cfg.recurrent_N self.tpdv = dict(dtype=torch.float32, device=device) @@ -118,6 +119,9 @@ def forward(self, critic_obs, rnn_states, masks): rnn_states = check(rnn_states).to(**self.tpdv) masks = check(masks).to(**self.tpdv) + if self._use_fp16: + critic_obs = critic_obs.half() + critic_features = self.base(critic_obs) if self._use_naive_recurrent_policy or self._use_recurrent_policy: diff --git a/openrl/modules/rl_module.py b/openrl/modules/rl_module.py index 430e60d6..a9ef7202 100644 --- a/openrl/modules/rl_module.py +++ b/openrl/modules/rl_module.py @@ -26,6 +26,23 @@ from openrl.modules.model_config import ModelTrainConfig +def get_train_ds_config(offload, use_fp16=False, stage=2): + + return { + "train_batch_size": 28, + "train_micro_batch_size_per_gpu": 7, + "steps_per_print": 10, + "zero_optimization": { + "stage": 2, + "reduce_bucket_size": 5e7, + "allgather_bucket_size": 5e7, + }, + "fp16": { + "enabled": use_fp16, + "loss_scale_window": 100 + }, + } + class RLModule(BaseModule): def __init__( self, @@ -55,6 +72,8 @@ def __init__( self.rank = rank self.world_size = world_size + self.use_deepspeed = cfg.use_deepspeed + use_half_actor = self.program_type == "actor" and cfg.use_half_actor if model_configs is None: @@ 
-70,18 +89,53 @@ def __init__( use_half=use_half_actor, extra_args=model_cg["extra_args"] if "extra_args" in model_cg else None, ) - self.models.update({model_key: model}) if self.program_type == "actor": continue - optimizer = torch.optim.Adam( - model.parameters(), - lr=model_cg["lr"], - eps=cfg.opti_eps, - weight_decay=cfg.weight_decay, - ) - self.optimizers.update({model_key: optimizer}) + if not self.use_deepspeed: + optimizer = torch.optim.Adam( + model.parameters(), + lr=model_cg["lr"], + eps=cfg.opti_eps, + weight_decay=cfg.weight_decay, + ) + self.models.update({model_key: model}) + self.optimizers.update({model_key: optimizer}) + else: + import deepspeed + from deepspeed.ops.adam import FusedAdam + from deepspeed.ops.adam import DeepSpeedCPUAdam + from transformers import get_constant_schedule + + self.offload = False + ds_config = get_train_ds_config( + offload=self.offload, + use_fp16=cfg.use_fp16, + ) + + AdamOptimizer = DeepSpeedCPUAdam if self.offload else FusedAdam + optim_params = filter(lambda p: p.requires_grad, model.parameters()) + optim = AdamOptimizer( + optim_params, + lr=model_cg["lr"], + betas=(0.9, 0.95) + ) + + # LR Scheduler + lr_scheduler = get_constant_schedule( + optimizer=optim, + ) + + engine, *_ = \ + deepspeed.initialize( + model=model, + optimizer=optim, + lr_scheduler=lr_scheduler, + config=ds_config + ) + self.models.update({model_key: engine}) + self.optimizers.update({model_key: engine}) if cfg.use_amp: self.scaler = torch.cuda.amp.GradScaler() From 10baeace47e01c2287bbb306ad68e1d7e0fd66b0 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 19 Oct 2023 17:04:13 +0800 Subject: [PATCH 10/78] - update test - format --- openrl/envs/nlp/daily_dialog_env.py | 17 ++++- openrl/envs/nlp/rewards/intent.py | 13 ++-- openrl/envs/nlp/rewards/kl_penalty.py | 21 ++---- openrl/envs/toy_envs/identity_env.py | 6 ++ openrl/envs/vec_env/async_venv.py | 3 +- openrl/envs/vec_env/base_venv.py | 2 +- openrl/envs/vec_env/sync_venv.py | 6 +- openrl/envs/wrappers/util.py | 4 +- .../networks/policy_value_network_gpt.py | 2 +- .../modules/networks/utils/distributions.py | 2 +- .../modules/networks/utils/nlp/base_policy.py | 4 +- .../networks/utils/nlp/causal_policy.py | 1 + openrl/modules/rl_module.py | 33 ++++------ setup.py | 1 + .../test_env/test_nlp/test_DailyDialogEnv.py | 44 +++++++++++++ tests/test_env/test_vec_env/test_async_env.py | 66 +++++++++++++++++++ tests/test_env/test_vec_env/test_sync_env.py | 57 ++++++++++++++++ tests/test_examples/test_nlp.py | 17 +++-- 18 files changed, 239 insertions(+), 60 deletions(-) create mode 100644 tests/test_env/test_nlp/test_DailyDialogEnv.py create mode 100644 tests/test_env/test_vec_env/test_async_env.py create mode 100644 tests/test_env/test_vec_env/test_sync_env.py diff --git a/openrl/envs/nlp/daily_dialog_env.py b/openrl/envs/nlp/daily_dialog_env.py index fa838a06..4cb49df1 100644 --- a/openrl/envs/nlp/daily_dialog_env.py +++ b/openrl/envs/nlp/daily_dialog_env.py @@ -36,11 +36,24 @@ def __init__( prompt_truncation_side (str): truncation side for prompt text (Defaults to "left") """ - self.debug = cfg.env.args["data_path"] is None + self.debug = ( + cfg.env.args["data_path"] is None or cfg.env.args["data_path"] == "None" + ) self.env_name = "daily_dialog" tokenizer_name = cfg.env.args["tokenizer_path"] - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + if tokenizer_name == "builtin_BPE": + from tokenizers import AddedToken, Tokenizer, models + + self.tokenizer = Tokenizer(models.BPE()) + + 
self.tokenizer.pad_token = "" + self.tokenizer.eos_token = "" + self.tokenizer.vocab_size = 2 + self.tokenizer.name_or_path = "builtin_BPE" + + else: + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token self.tokenizer.padding_side = "left" diff --git a/openrl/envs/nlp/rewards/intent.py b/openrl/envs/nlp/rewards/intent.py index 10fa65d1..f2f9bf11 100644 --- a/openrl/envs/nlp/rewards/intent.py +++ b/openrl/envs/nlp/rewards/intent.py @@ -13,18 +13,14 @@ def get_eval_ds_config(offload, stage=0): device = "cpu" if offload else "none" zero_opt_dict = { "stage": stage, - "offload_param": { - "device": device - }, + "offload_param": {"device": device}, } return { "train_batch_size": 28, "train_micro_batch_size_per_gpu": 7, "steps_per_print": 10, "zero_optimization": zero_opt_dict, - "fp16": { - "enabled": True - }, + "fp16": {"enabled": True}, } @@ -33,7 +29,7 @@ def __init__(self, intent_model: str, intent_coeff: float = 1.0) -> None: super().__init__() self._intent_coeff = intent_coeff - self.use_deepspeed = True # TODO + self.use_deepspeed = True # TODO model_path = data_abs_path(intent_model) self._tokenizer = AutoTokenizer.from_pretrained(intent_model) @@ -41,7 +37,8 @@ def __init__(self, intent_model: str, intent_coeff: float = 1.0) -> None: if self.use_deepspeed: import deepspeed - self._model = self._model.to('cuda') + + self._model = self._model.to("cuda") ds_config = get_eval_ds_config(offload=True, stage=0) self._model, *_ = deepspeed.initialize(model=self._model, config=ds_config) self._device = "cuda" diff --git a/openrl/envs/nlp/rewards/kl_penalty.py b/openrl/envs/nlp/rewards/kl_penalty.py index bf5a074f..ea109c45 100644 --- a/openrl/envs/nlp/rewards/kl_penalty.py +++ b/openrl/envs/nlp/rewards/kl_penalty.py @@ -14,18 +14,14 @@ def get_eval_ds_config(offload, stage=0): device = "cpu" if offload else "none" zero_opt_dict = { "stage": stage, - "offload_param": { - "device": device - }, + "offload_param": {"device": device}, } return { - "train_batch_size": 28, # + "train_batch_size": 28, # "train_micro_batch_size_per_gpu": 7, "steps_per_print": 10, "zero_optimization": zero_opt_dict, - "fp16": { - "enabled": True - }, + "fp16": {"enabled": True}, } @@ -46,6 +42,7 @@ def __init__( self._ref_net = self._ref_net.eval() if self.use_deepspeed: import deepspeed + ds_config = get_eval_ds_config(offload=True, stage=0) self._ref_engine, *_ = deepspeed.initialize(model=self, config=ds_config) elif torch.cuda.is_available(): @@ -53,7 +50,6 @@ def __init__( self._ref_net.parallelize() else: # else defaults to data parallel self._ref_net = torch.nn.DataParallel(self._ref_net) - # alpha adjustment self._alpha = 0.2 @@ -139,16 +135,11 @@ def _prepare_inputs_for_model( ) for key, value in model_inputs.items() } - + if self.use_deepspeed: model_inputs = { - key: ( - value.to('cuda') - if isinstance(value, torch.Tensor) - else value - ) + key: value.to("cuda") if isinstance(value, torch.Tensor) else value for key, value in model_inputs.items() } - return model_inputs diff --git a/openrl/envs/toy_envs/identity_env.py b/openrl/envs/toy_envs/identity_env.py index 21a9bb6a..c3d4caa2 100644 --- a/openrl/envs/toy_envs/identity_env.py +++ b/openrl/envs/toy_envs/identity_env.py @@ -28,6 +28,7 @@ def __init__( ``dim`` and ``space``. 
:param ep_length: the length of each episode in time_steps """ + if space is None: if dim is None: dim = 2 @@ -36,12 +37,14 @@ def __init__( assert ( dim is None ), "arguments for both 'dim' and 'space' provided: at most one allowed" + self.dim = dim self.observation_space = spaces.Discrete(1) self.action_space = space self.ep_length = ep_length self.current_step = 0 self.num_resets = -1 # Becomes 0 after __init__ exits. + self.metadata.update({"name": IdentityEnv}) def reset( self, @@ -51,6 +54,8 @@ def reset( ) -> T: if seed is not None: self.seed(seed) + if self._np_random is None: + self.seed(0) self.current_step = 0 self.num_resets += 1 self._choose_next_state() @@ -65,6 +70,7 @@ def step(self, action: T) -> Tuple[T, float, bool, Dict[str, Any]]: def _choose_next_state(self) -> None: # self.state = [self.action_space.sample()] + assert self.dim is not None self.state = [self._np_random.integers(0, self.dim)] def _get_reward(self, action: T) -> float: diff --git a/openrl/envs/vec_env/async_venv.py b/openrl/envs/vec_env/async_venv.py index 02d6fec2..f746588c 100644 --- a/openrl/envs/vec_env/async_venv.py +++ b/openrl/envs/vec_env/async_venv.py @@ -674,6 +674,7 @@ def exec_func_fetch(self, timeout: Union[int, float, None] = None) -> list: ) results, successes = zip(*[pipe.recv() for pipe in self.parent_pipes]) + self._raise_if_errors(successes) self._state = AsyncState.DEFAULT @@ -837,7 +838,7 @@ def prepare_obs(observation): ) elif command == "_func_exec": function, indices, args, kwargs = data - if index in indices: + if indices is None or index in indices: if callable(function): pipe.send((function(env, *args, **kwargs), True)) else: diff --git a/openrl/envs/vec_env/base_venv.py b/openrl/envs/vec_env/base_venv.py index f2e54744..c10d0d0d 100644 --- a/openrl/envs/vec_env/base_venv.py +++ b/openrl/envs/vec_env/base_venv.py @@ -272,7 +272,7 @@ def exec_func_fetch(self, timeout: Union[int, float, None] = None) -> list: """ def exec_func( - self, func: Callable, indices: List[int], *args, **kwargs + self, func: Callable, indices: Optional[List[int]] = None, *args, **kwargs ) -> List[Any]: """Call a method, or get a property, from each parallel environment. diff --git a/openrl/envs/vec_env/sync_venv.py b/openrl/envs/vec_env/sync_venv.py index a670ec33..6a61d489 100644 --- a/openrl/envs/vec_env/sync_venv.py +++ b/openrl/envs/vec_env/sync_venv.py @@ -281,7 +281,9 @@ def env_name(self): else: return self.envs[0].unwrapped.spec.id - def exec_func(self, func: Callable, indices: List[int], *args, **kwargs) -> tuple: + def exec_func( + self, func: Callable, indices: Optional[List[int]] = None, *args, **kwargs + ) -> tuple: """Calls the method with name and applies args and kwargs. 
Args: @@ -294,7 +296,7 @@ def exec_func(self, func: Callable, indices: List[int], *args, **kwargs) -> tupl """ results = [] for i, env in enumerate(self.envs): - if i in indices: + if indices is None or i in indices: if callable(func): results.append(func(env, *args, **kwargs)) else: diff --git a/openrl/envs/wrappers/util.py b/openrl/envs/wrappers/util.py index a0a97576..614a5879 100644 --- a/openrl/envs/wrappers/util.py +++ b/openrl/envs/wrappers/util.py @@ -41,7 +41,9 @@ def nest_expand_dim(input: Any) -> Any: elif input is None: return [input] else: - raise NotImplementedError("Not support type: {}".format(type(input))) + raise NotImplementedError( + "Not support type: {}, value={}".format(type(input), input) + ) def unwrap_wrapper( diff --git a/openrl/modules/networks/policy_value_network_gpt.py b/openrl/modules/networks/policy_value_network_gpt.py index 1549b5b9..fdbc15b1 100644 --- a/openrl/modules/networks/policy_value_network_gpt.py +++ b/openrl/modules/networks/policy_value_network_gpt.py @@ -107,4 +107,4 @@ def get_values(self, obs, rnn_states, masks): value_output = super().forward_value(obs) values = value_output.values - return values, rnn_states \ No newline at end of file + return values, rnn_states diff --git a/openrl/modules/networks/utils/distributions.py b/openrl/modules/networks/utils/distributions.py index ebd7421f..fd3ef8ca 100644 --- a/openrl/modules/networks/utils/distributions.py +++ b/openrl/modules/networks/utils/distributions.py @@ -68,7 +68,7 @@ def init_(m): def forward(self, x, action_masks=None): x = self.linear(x) if action_masks is not None: - x[action_masks == 0] = -6e4 # fp16 + x[action_masks == 0] = -6e4 # fp16 return FixedCategorical(logits=x) diff --git a/openrl/modules/networks/utils/nlp/base_policy.py b/openrl/modules/networks/utils/nlp/base_policy.py index 02d99086..bd5fd6b2 100644 --- a/openrl/modules/networks/utils/nlp/base_policy.py +++ b/openrl/modules/networks/utils/nlp/base_policy.py @@ -130,7 +130,7 @@ def __init__( optimizer_kwargs: Dict[str, Any] = {}, weight_decay: float = 1e-6, use_sde: bool = None, - apply_model_parallel: bool = False, # TODO + apply_model_parallel: bool = False, # TODO optimizer_class: torch.optim.Optimizer = torch.optim.AdamW, generation_kwargs: Dict[str, Any] = {}, prompt_truncation_side: str = "left", @@ -152,7 +152,7 @@ def __init__( prompt_truncation_side (str, optional): truncation side for prompt text. Defaults to "left". 
""" super().__init__() - self._use_deepspeed = True # TODO + self._use_deepspeed = True # TODO self._action_space = action_space self._apply_model_parallel = apply_model_parallel self._build_model_heads(model_name, config, device) diff --git a/openrl/modules/networks/utils/nlp/causal_policy.py b/openrl/modules/networks/utils/nlp/causal_policy.py index 956d3471..e13b0ecc 100644 --- a/openrl/modules/networks/utils/nlp/causal_policy.py +++ b/openrl/modules/networks/utils/nlp/causal_policy.py @@ -93,6 +93,7 @@ def _build_model_heads(self, model_name: str, config: str, device: str): if self._use_deepspeed: if self.value_normalizer is not None: import deepspeed + para = self.value_normalizer.running_mean deepspeed.zero.register_external_parameter(self, para) para = self.value_normalizer.running_mean_sq diff --git a/openrl/modules/rl_module.py b/openrl/modules/rl_module.py index a9ef7202..47779426 100644 --- a/openrl/modules/rl_module.py +++ b/openrl/modules/rl_module.py @@ -27,7 +27,6 @@ def get_train_ds_config(offload, use_fp16=False, stage=2): - return { "train_batch_size": 28, "train_micro_batch_size_per_gpu": 7, @@ -37,12 +36,10 @@ def get_train_ds_config(offload, use_fp16=False, stage=2): "reduce_bucket_size": 5e7, "allgather_bucket_size": 5e7, }, - "fp16": { - "enabled": use_fp16, - "loss_scale_window": 100 - }, + "fp16": {"enabled": use_fp16, "loss_scale_window": 100}, } + class RLModule(BaseModule): def __init__( self, @@ -104,10 +101,9 @@ def __init__( self.optimizers.update({model_key: optimizer}) else: import deepspeed - from deepspeed.ops.adam import FusedAdam - from deepspeed.ops.adam import DeepSpeedCPUAdam + from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam from transformers import get_constant_schedule - + self.offload = False ds_config = get_train_ds_config( offload=self.offload, @@ -117,23 +113,20 @@ def __init__( AdamOptimizer = DeepSpeedCPUAdam if self.offload else FusedAdam optim_params = filter(lambda p: p.requires_grad, model.parameters()) optim = AdamOptimizer( - optim_params, - lr=model_cg["lr"], - betas=(0.9, 0.95) + optim_params, lr=model_cg["lr"], betas=(0.9, 0.95) ) - + # LR Scheduler lr_scheduler = get_constant_schedule( optimizer=optim, ) - - engine, *_ = \ - deepspeed.initialize( - model=model, - optimizer=optim, - lr_scheduler=lr_scheduler, - config=ds_config - ) + + engine, *_ = deepspeed.initialize( + model=model, + optimizer=optim, + lr_scheduler=lr_scheduler, + config=ds_config, + ) self.models.update({model_key: engine}) self.optimizers.update({model_key: engine}) diff --git a/setup.py b/setup.py index fa5d6d31..6916c573 100644 --- a/setup.py +++ b/setup.py @@ -72,6 +72,7 @@ def get_extra_requires() -> dict: } req["test"].extend(req["selfplay"]) req["test"].extend(req["atari"]) + req["test"].extend(req["nlp"]) return req diff --git a/tests/test_env/test_nlp/test_DailyDialogEnv.py b/tests/test_env/test_nlp/test_DailyDialogEnv.py new file mode 100644 index 00000000..6f0ac1df --- /dev/null +++ b/tests/test_env/test_nlp/test_DailyDialogEnv.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" + +import os +import sys + +import pytest + +from openrl.configs.config import create_config_parser +from openrl.envs.common import make + + +@pytest.fixture( + scope="module", + params=["--env.args {'data_path':None,'tokenizer_path':'builtin_BPE'}"], +) +def config(request): + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +@pytest.mark.unittest +def test_DailyDialogEnv(config): + env = make("daily_dialog", env_num=1, asynchronous=False, cfg=config) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_env/test_vec_env/test_async_env.py b/tests/test_env/test_vec_env/test_async_env.py new file mode 100644 index 00000000..a69bc88b --- /dev/null +++ b/tests/test_env/test_vec_env/test_async_env.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" + +import os +import sys + +import pytest +from gymnasium.wrappers import EnvCompatibility + +from openrl.envs.toy_envs import make_toy_envs +from openrl.envs.vec_env.async_venv import AsyncVectorEnv + + +class CustomEnvCompatibility(EnvCompatibility): + def reset(self, **kwargs): + return super().reset(**kwargs)[0] + + +def init_envs(): + env_wrappers = [CustomEnvCompatibility] + env_fns = make_toy_envs( + id="IdentityEnv", + env_num=2, + env_wrappers=env_wrappers, + ) + return env_fns + + +def assert_env_name(env, env_name): + if isinstance(env.metadata["name"], str): + assert env.metadata["name"] == env_name + else: + assert env.metadata["name"].__name__ == env_name + + +@pytest.mark.unittest +def test_async_env(): + env_name = "IdentityEnv" + env = AsyncVectorEnv(init_envs(), shared_memory=True) + assert ( + env._env_name == env_name + ), "AsyncVectorEnv should have the same metadata as the wrapped env" + env.exec_func(assert_env_name, indices=None, env_name=env_name) + env.call("render") + env_name_new = "IdentityEnvNew" + env.set_attr("metadata", {"name": env_name_new}) + env.exec_func(assert_env_name, indices=None, env_name=env_name_new) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_env/test_vec_env/test_sync_env.py b/tests/test_env/test_vec_env/test_sync_env.py new file mode 100644 index 00000000..fb3d5d0b --- /dev/null +++ b/tests/test_env/test_vec_env/test_sync_env.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" + +import os +import sys + +import pytest +from gymnasium.wrappers import EnvCompatibility + +from openrl.envs.toy_envs import make_toy_envs +from openrl.envs.vec_env.sync_venv import SyncVectorEnv + + +class CustomEnvCompatibility(EnvCompatibility): + def reset(self, **kwargs): + return super().reset(**kwargs)[0] + + +def init_envs(): + env_wrappers = [CustomEnvCompatibility] + env_fns = make_toy_envs( + id="IdentityEnv", + env_num=2, + env_wrappers=env_wrappers, + ) + return env_fns + + +def assert_env_name(env, env_name): + assert env.metadata["name"].__name__ == env_name + + +@pytest.mark.unittest +def test_sync_env(): + env_name = "IdentityEnv" + env = SyncVectorEnv(init_envs()) + env.exec_func(assert_env_name, indices=None, env_name=env_name) + env.call("render") + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_examples/test_nlp.py b/tests/test_examples/test_nlp.py index 99111fdc..1524ae65 100644 --- a/tests/test_examples/test_nlp.py +++ b/tests/test_examples/test_nlp.py @@ -17,20 +17,26 @@ # """""" # +import os +import sys +import pytest + +from openrl.configs.config import create_config_parser from openrl.envs.common import make from openrl.modules.common import PPONet as Net from openrl.runners.common import PPOAgent as Agent -def config(): - from openrl.configs.config import create_config_parser - +# @pytest.fixture(scope="module", params=["--env.args {'data_path':None,'tokenizer_path':'builtin_BPE'}"]) +@pytest.fixture(scope="module", params=[""]) +def config(request): cfg_parser = create_config_parser() - cfg = cfg_parser.parse_args() + cfg = cfg_parser.parse_args(request.param.split()) return cfg +@pytest.mark.unittest def test_train_nlp(config): env = make("fake_dialog_data", env_num=3, cfg=config) agent = Agent(Net(env)) @@ -38,5 +44,4 @@ def test_train_nlp(config): if __name__ == "__main__": - cfg = config() - test_train_nlp(cfg) + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 6665505d6deba9e1b14b83d2db7b544da6c29940 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 19 Oct 2023 19:17:17 +0800 Subject: [PATCH 11/78] fix atari training bugs --- examples/atari/train_ppo.py | 1 - openrl/envs/common/build_envs.py | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/atari/train_ppo.py b/examples/atari/train_ppo.py index 4f122c40..5920e819 100644 --- a/examples/atari/train_ppo.py +++ b/examples/atari/train_ppo.py @@ -59,7 +59,6 @@ def train(): agent = Agent(net, use_wandb=True) # start training, set total number of training steps to 20000 - # agent.train(total_time_steps=1000) agent.train(total_time_steps=5000000) env.close() agent.save("./ppo_agent/") diff --git a/openrl/envs/common/build_envs.py b/openrl/envs/common/build_envs.py index 94c34019..0893400a 100644 --- a/openrl/envs/common/build_envs.py +++ b/openrl/envs/common/build_envs.py @@ -33,6 +33,8 @@ def _make_env() -> Env: if need_env_id: 
new_kwargs["env_id"] = env_id new_kwargs["env_num"] = env_num + if id.startswith("ALE/"): + new_kwargs.pop("cfg", None) env = make( id, From 601c79595a9aa321e4db6a2eef3a1e63db969b6a Mon Sep 17 00:00:00 2001 From: huangshiyu13 Date: Thu, 19 Oct 2023 19:36:37 +0800 Subject: [PATCH 12/78] fix windows bugs --- openrl/configs/utils.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/openrl/configs/utils.py b/openrl/configs/utils.py index 53e1f4d2..8e07a2b9 100644 --- a/openrl/configs/utils.py +++ b/openrl/configs/utils.py @@ -16,7 +16,7 @@ """""" - +import os import re import tempfile @@ -84,8 +84,18 @@ def __call__(self, parser, cfg, values, option_string=None): data = yaml.safe_load(rendered_content) # Write the result to a temporary file - with tempfile.NamedTemporaryFile("w", delete=True, suffix=".yaml") as temp_file: + # with tempfile.NamedTemporaryFile("w", delete=True, suffix=".yaml") as temp_file: + # yaml.dump(data, temp_file) + # temp_file.seek(0) # Move to the beginning of the file + # # Use the default behavior of ActionConfigFile to handle the temporary file + # super().__call__(parser, cfg, temp_file.name, option_string) + + # Write the result to a temporary file + temp_fd, temp_filename = tempfile.mkstemp(suffix=".yaml") + with os.fdopen(temp_fd, 'w') as temp_file: yaml.dump(data, temp_file) - temp_file.seek(0) # Move to the beginning of the file + try: # Use the default behavior of ActionConfigFile to handle the temporary file - super().__call__(parser, cfg, temp_file.name, option_string) + super().__call__(parser, cfg, temp_filename, option_string) + finally: + os.remove(temp_filename) From 254cf262449b1bc0de5ca18e1e557ed04da3ffac Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 19 Oct 2023 19:38:27 +0800 Subject: [PATCH 13/78] update --- openrl/configs/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openrl/configs/utils.py b/openrl/configs/utils.py index 8e07a2b9..b3198748 100644 --- a/openrl/configs/utils.py +++ b/openrl/configs/utils.py @@ -83,14 +83,14 @@ def __call__(self, parser, cfg, values, option_string=None): # Load the rendered content as a dictionary data = yaml.safe_load(rendered_content) - # Write the result to a temporary file + # Write the result to a temporary file. Not work on Windows. # with tempfile.NamedTemporaryFile("w", delete=True, suffix=".yaml") as temp_file: # yaml.dump(data, temp_file) # temp_file.seek(0) # Move to the beginning of the file # # Use the default behavior of ActionConfigFile to handle the temporary file # super().__call__(parser, cfg, temp_file.name, option_string) - # Write the result to a temporary file + # Write the result to a temporary file. This works on all platforms. temp_fd, temp_filename = tempfile.mkstemp(suffix=".yaml") with os.fdopen(temp_fd, 'w') as temp_file: yaml.dump(data, temp_file) From 4f110d6e82e208bbb6a89bd2377bc100b77ff2e7 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 19 Oct 2023 19:53:31 +0800 Subject: [PATCH 14/78] update --- openrl/configs/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openrl/configs/utils.py b/openrl/configs/utils.py index b3198748..a8420767 100644 --- a/openrl/configs/utils.py +++ b/openrl/configs/utils.py @@ -92,7 +92,7 @@ def __call__(self, parser, cfg, values, option_string=None): # Write the result to a temporary file. This works on all platforms. 
temp_fd, temp_filename = tempfile.mkstemp(suffix=".yaml") - with os.fdopen(temp_fd, 'w') as temp_file: + with os.fdopen(temp_fd, "w") as temp_file: yaml.dump(data, temp_file) try: # Use the default behavior of ActionConfigFile to handle the temporary file From d32f067dfc73f8d0b115ee08519a2203aad0fa42 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 19 Oct 2023 20:06:43 +0800 Subject: [PATCH 15/78] fix python3.11 test --- setup.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6916c573..ffb91a94 100644 --- a/setup.py +++ b/setup.py @@ -65,6 +65,11 @@ def get_extra_requires() -> dict: "evaluate", "icetk", ], + "nlp_test": [ + "transformers", + "datasets", + "evaluate", + ], "selfplay": ["ray[default]", "ray[serve]", "pettingzoo[classic]", "trueskill"], "retro": ["gym-retro"], "super_mario": ["gym-super-mario-bros"], @@ -72,7 +77,7 @@ def get_extra_requires() -> dict: } req["test"].extend(req["selfplay"]) req["test"].extend(req["atari"]) - req["test"].extend(req["nlp"]) + req["test"].extend(req["nlp_test"]) return req From 462dbd7af29784ce3147bd0f118f8e64dd0f9330 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 19 Oct 2023 20:19:27 +0800 Subject: [PATCH 16/78] format --- examples/behavior_cloning/test_env.py | 1 + examples/behavior_cloning/train_bc.py | 1 + examples/cartpole/train_a2c.py | 1 + examples/cartpole/train_dqn_beta.py | 1 + examples/cartpole/train_ppo.py | 1 + examples/ddpg/train_ddpg_beta.py | 1 + examples/gail/train_gail.py | 1 + examples/gridworld/train_dqn.py | 1 + examples/gridworld/train_ppo.py | 1 + examples/nlp/train_ppo.py | 1 + examples/retro/train_retro.py | 1 + examples/sac/train_ddpg.py | 1 + examples/sac/train_sac_beta.py | 1 + examples/toy_env/train_ppo.py | 1 + openrl/envs/mpe/rendering.py | 1 + openrl/envs/vec_env/async_venv.py | 1 + tests/test_examples/test_train_mpe.py | 1 + 17 files changed, 17 insertions(+) diff --git a/examples/behavior_cloning/test_env.py b/examples/behavior_cloning/test_env.py index 60b272c6..fe0fa1b7 100644 --- a/examples/behavior_cloning/test_env.py +++ b/examples/behavior_cloning/test_env.py @@ -1,4 +1,5 @@ """""" + import numpy as np from openrl.configs.config import create_config_parser diff --git a/examples/behavior_cloning/train_bc.py b/examples/behavior_cloning/train_bc.py index 0d562dee..16d2ef2f 100644 --- a/examples/behavior_cloning/train_bc.py +++ b/examples/behavior_cloning/train_bc.py @@ -1,4 +1,5 @@ """""" + import numpy as np from openrl.configs.config import create_config_parser diff --git a/examples/cartpole/train_a2c.py b/examples/cartpole/train_a2c.py index 415f0bba..35ca95a9 100644 --- a/examples/cartpole/train_a2c.py +++ b/examples/cartpole/train_a2c.py @@ -1,4 +1,5 @@ """""" + import numpy as np import torch diff --git a/examples/cartpole/train_dqn_beta.py b/examples/cartpole/train_dqn_beta.py index 2dffaa81..3e32ec28 100644 --- a/examples/cartpole/train_dqn_beta.py +++ b/examples/cartpole/train_dqn_beta.py @@ -1,4 +1,5 @@ """""" + import numpy as np from openrl.configs.config import create_config_parser diff --git a/examples/cartpole/train_ppo.py b/examples/cartpole/train_ppo.py index ee11f871..77e41008 100644 --- a/examples/cartpole/train_ppo.py +++ b/examples/cartpole/train_ppo.py @@ -1,4 +1,5 @@ """""" + import numpy as np from openrl.configs.config import create_config_parser diff --git a/examples/ddpg/train_ddpg_beta.py b/examples/ddpg/train_ddpg_beta.py index 2a19f557..7ba61ee0 100644 --- a/examples/ddpg/train_ddpg_beta.py +++ 
b/examples/ddpg/train_ddpg_beta.py @@ -1,4 +1,5 @@ """""" + import numpy as np from openrl.configs.config import create_config_parser diff --git a/examples/gail/train_gail.py b/examples/gail/train_gail.py index abe73039..4e227be9 100644 --- a/examples/gail/train_gail.py +++ b/examples/gail/train_gail.py @@ -1,4 +1,5 @@ """""" + import numpy as np from openrl.configs.config import create_config_parser diff --git a/examples/gridworld/train_dqn.py b/examples/gridworld/train_dqn.py index 2b859784..900a1287 100644 --- a/examples/gridworld/train_dqn.py +++ b/examples/gridworld/train_dqn.py @@ -1,4 +1,5 @@ """""" + import numpy as np from openrl.configs.config import create_config_parser diff --git a/examples/gridworld/train_ppo.py b/examples/gridworld/train_ppo.py index 683e9579..71f59bcb 100644 --- a/examples/gridworld/train_ppo.py +++ b/examples/gridworld/train_ppo.py @@ -1,4 +1,5 @@ """""" + import numpy as np from openrl.configs.config import create_config_parser diff --git a/examples/nlp/train_ppo.py b/examples/nlp/train_ppo.py index e6d115c1..e2fcc3d6 100644 --- a/examples/nlp/train_ppo.py +++ b/examples/nlp/train_ppo.py @@ -1,4 +1,5 @@ """""" + from openrl.configs.config import create_config_parser from openrl.envs.common import make from openrl.modules.common import PPONet as Net diff --git a/examples/retro/train_retro.py b/examples/retro/train_retro.py index ad13749a..0668b620 100644 --- a/examples/retro/train_retro.py +++ b/examples/retro/train_retro.py @@ -1,4 +1,5 @@ """""" + import numpy as np from custom_registration import make diff --git a/examples/sac/train_ddpg.py b/examples/sac/train_ddpg.py index 484a1f6d..5bc2bab8 100644 --- a/examples/sac/train_ddpg.py +++ b/examples/sac/train_ddpg.py @@ -1,4 +1,5 @@ """""" + import numpy as np from openrl.configs.config import create_config_parser diff --git a/examples/sac/train_sac_beta.py b/examples/sac/train_sac_beta.py index 9fa905a8..bc40c1dc 100644 --- a/examples/sac/train_sac_beta.py +++ b/examples/sac/train_sac_beta.py @@ -1,4 +1,5 @@ """""" + import numpy as np from openrl.configs.config import create_config_parser diff --git a/examples/toy_env/train_ppo.py b/examples/toy_env/train_ppo.py index 49cb0c9f..6410b52a 100644 --- a/examples/toy_env/train_ppo.py +++ b/examples/toy_env/train_ppo.py @@ -1,4 +1,5 @@ """""" + from train_and_eval import evaluation, train from openrl.modules.common import PPONet as Net diff --git a/openrl/envs/mpe/rendering.py b/openrl/envs/mpe/rendering.py index c66f2a0c..a7197dca 100644 --- a/openrl/envs/mpe/rendering.py +++ b/openrl/envs/mpe/rendering.py @@ -1,6 +1,7 @@ """ 2D rendering framework """ + from __future__ import division import os diff --git a/openrl/envs/vec_env/async_venv.py b/openrl/envs/vec_env/async_venv.py index f746588c..a244c1c9 100644 --- a/openrl/envs/vec_env/async_venv.py +++ b/openrl/envs/vec_env/async_venv.py @@ -1,4 +1,5 @@ """An async vector environment.""" + import multiprocessing as mp import sys import time diff --git a/tests/test_examples/test_train_mpe.py b/tests/test_examples/test_train_mpe.py index 419b3dab..36e3e689 100644 --- a/tests/test_examples/test_train_mpe.py +++ b/tests/test_examples/test_train_mpe.py @@ -1,4 +1,5 @@ """""" + import os import sys From ca32597b58369782c6d65756705889420b131fd5 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 20 Oct 2023 11:45:18 +0800 Subject: [PATCH 17/78] add test atari --- tests/test_examples/test_train_atari.py | 74 +++++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 
tests/test_examples/test_train_atari.py diff --git a/tests/test_examples/test_train_atari.py b/tests/test_examples/test_train_atari.py new file mode 100644 index 00000000..4d8d2166 --- /dev/null +++ b/tests/test_examples/test_train_atari.py @@ -0,0 +1,74 @@ +"""""" + +import os +import sys + +import numpy as np +import pytest + +from openrl.configs.config import create_config_parser +from openrl.envs.common import make +from openrl.envs.wrappers.atari_wrappers import ( + ClipRewardEnv, + FireResetEnv, + NoopResetEnv, + WarpFrame, +) +from openrl.envs.wrappers.image_wrappers import TransposeImage +from openrl.envs.wrappers.monitor import Monitor +from openrl.modules.common import PPONet as Net +from openrl.runners.common import PPOAgent as Agent + +env_wrappers = [ + Monitor, + NoopResetEnv, + FireResetEnv, + WarpFrame, + ClipRewardEnv, + TransposeImage, +] + + +@pytest.fixture( + scope="module", + params=[ + "--episode_length 5 --use_recurrent_policy false --vec_info_class.id" + " EPS_RewardInfo --use_valuenorm true --use_adv_normalize true" + " --use_share_model True --entropy_coef 0.01" + ], +) +def config(request): + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +@pytest.mark.unittest +def test_train_atari(config): + env_num = 2 + env = make( + "ALE/Pong-v5", + env_num=env_num, + cfg=config, + asynchronous=True, + env_wrappers=env_wrappers, + ) + net = Net(env, cfg=config) + agent = Agent(net) + agent.train(total_time_steps=30) + agent.save("./ppo_agent/") + agent.load("./ppo_agent/") + agent.set_env(env) + obs, info = env.reset(seed=0) + step = 0 + while step < 5: + action, _ = agent.act(obs, deterministic=True) + obs, r, done, info = env.step(action) + if np.any(done): + break + step += 1 + env.close() + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From eeb026ffa9626812870a3d83c87b34ea16b8a316 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 20 Oct 2023 14:22:47 +0800 Subject: [PATCH 18/78] add test gpt work --- .../networks/utils/nlp/causal_policy.py | 27 ++++++-- .../test_policy_value_network_gpt.py | 65 +++++++++++++++++++ 2 files changed, 85 insertions(+), 7 deletions(-) create mode 100644 tests/test_modules/test_networks/test_policy_value_network_gpt.py diff --git a/openrl/modules/networks/utils/nlp/causal_policy.py b/openrl/modules/networks/utils/nlp/causal_policy.py index e13b0ecc..8dfc65dc 100644 --- a/openrl/modules/networks/utils/nlp/causal_policy.py +++ b/openrl/modules/networks/utils/nlp/causal_policy.py @@ -65,20 +65,33 @@ def policy(self): def _build_model_heads(self, model_name: str, config: str, device: str): if self.disable_drop_out: - config = AutoConfig.from_pretrained(model_name) + if model_name == "test_gpt2": + from transformers import GPT2Config + + config = GPT2Config() + + else: + config = AutoConfig.from_pretrained(model_name) config_dict = config.to_dict() for key in config_dict: if "drop" in key: config_dict[key] = 0.0 config = config.from_dict(config_dict) - self._policy_model = AutoModelForCausalLM.from_pretrained( - model_name, config=config - ) + if model_name == "test_gpt2": + from transformers import GPT2LMHeadModel - self._value_model = AutoModelForCausalLM.from_pretrained( - model_name, config=config - ) + self._policy_model = GPT2LMHeadModel(config) + self._value_model = GPT2LMHeadModel(config) + + else: + self._policy_model = AutoModelForCausalLM.from_pretrained( + model_name, config=config + ) + + self._value_model = 
AutoModelForCausalLM.from_pretrained( + model_name, config=config + ) self._value_head = nn.Linear( self._value_model.config.hidden_size, 1, bias=False diff --git a/tests/test_modules/test_networks/test_policy_value_network_gpt.py b/tests/test_modules/test_networks/test_policy_value_network_gpt.py new file mode 100644 index 00000000..66a6caaa --- /dev/null +++ b/tests/test_modules/test_networks/test_policy_value_network_gpt.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" + +import os +import sys + +import numpy as np +import pytest +from gymnasium import spaces + +from openrl.configs.config import create_config_parser +from openrl.modules.networks.policy_value_network_gpt import ( + PolicyValueNetworkGPT as PolicyValueNetwork, +) + + +@pytest.fixture(scope="module", params=["--model_path test_gpt2"]) +def config(request): + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +@pytest.mark.unittest +def test_gpt_network(config): + net = PolicyValueNetwork( + cfg=config, + input_space=spaces.Discrete(2), + action_space=spaces.Discrete(2), + ) + + net.get_actor_para() + net.get_critic_para() + + obs = { + "input_encoded_pt": np.zeros([1, 2]), + "input_attention_mask_pt": np.zeros([1, 2]), + } + rnn_states = np.zeros(2) + masks = np.zeros(2) + action = np.zeros(1) + net.get_actions(obs=obs, rnn_states=rnn_states, masks=masks) + net.eval_actions( + obs=obs, rnn_states=rnn_states, action=action, masks=masks, action_masks=None + ) + net.get_values(obs=obs, rnn_states=rnn_states, masks=masks) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 090b617c54308fbea2eb3b0fa3efe73ae447e918 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 20 Oct 2023 14:37:42 +0800 Subject: [PATCH 19/78] delete common.py --- openrl/envs/snake/common.py | 227 ------------------------------------ 1 file changed, 227 deletions(-) delete mode 100644 openrl/envs/snake/common.py diff --git a/openrl/envs/snake/common.py b/openrl/envs/snake/common.py deleted file mode 100644 index 6a67a0a3..00000000 --- a/openrl/envs/snake/common.py +++ /dev/null @@ -1,227 +0,0 @@ -import os -import sys - -import numpy as np - - -class HiddenPrints: - def __enter__(self): - self._original_stdout = sys.stdout - sys.stdout = open(os.devnull, "w") - - def __exit__(self, exc_type, exc_val, exc_tb): - sys.stdout.close() - sys.stdout = self._original_stdout - - -class Board: - def __init__(self, board_height, board_width, snakes, beans_positions, teams): - # print('create board, beans_position: ', beans_positions) - self.height = board_height - self.width = board_width - self.snakes = snakes - self.snakes_count = len(snakes) - self.beans_positions = beans_positions - self.blank_sign = -self.snakes_count - self.bean_sign = -self.snakes_count + 1 - self.board = np.zeros((board_height, board_width), dtype=int) + self.blank_sign - self.open = 
dict() - for key, snake in self.snakes.items(): - self.open[key] = [snake.head] # state 0 open list, heads, ready to spread - # see [A* Pathfinding (E01: algorithm explanation)](https://www.youtube.com/watch?v=-L-WgKMFuhE) - for x, y in snake.pos: - self.board[x][y] = key # obstacles, e.g. 0, 1, 2, 3, 4, 5 - # for x, y in beans_positions: - # self.board[x][y] = self.bean_sign # beans - - self.state = 0 - self.controversy = dict() - self.teams = teams - - # print('initial board') - # print(self.board) - - def step(self): # delay: prevent rear-end collision - new_open = {key: [] for key in self.snakes.keys()} - self.state += 1 # update state - # if self.state > delay: - # for key, snake in self.snakes.items(): # drop tail - # if snake.len >= self.state: - # self.board[snake.pos[-(self.state - delay)][0]][snake.pos[-(self.state - delay)][1]] \ - # = self.blank_sign - for key, snake in self.snakes.items(): - if snake.len >= self.state: - self.board[snake.pos[-self.state][0]][ - snake.pos[-self.state][1] - ] = self.blank_sign # drop tail - for key, value in self.open.items(): # value: e.g. [[8, 3], [6, 3], [7, 4]] - others_tail_pos = [ - ( - self.snakes[_].pos[-self.state] - if self.snakes[_].len >= self.state - else [] - ) - for _ in set(range(self.snakes_count)) - {key} - ] - for x, y in value: - # print('start to spread snake {} on grid ({}, {})'.format(key, x, y)) - for x_, y_ in [ - ((x + 1) % self.height, y), # down - ((x - 1) % self.height, y), # up - (x, (y + 1) % self.width), # right - (x, (y - 1) % self.width), - ]: # left - sign = self.board[x_][y_] - idx = ( - sign % self.snakes_count - ) # which snake, e.g. 0, 1, 2, 3, 4, 5 / number of claims - state = ( - sign // self.snakes_count - ) # manhattan distance to snake who claim the point or its negative - if sign == self.blank_sign: # grid in initial state - if [x_, y_] in others_tail_pos: - # print('do not spread other snakes tail, in case of rear-end collision') - continue # do not spread other snakes' tail, in case of rear-end collision - self.board[x_][y_] = self.state * self.snakes_count + key - self.snakes[key].claimed_count += 1 - new_open[key].append([x_, y_]) - - elif key != idx and self.state == state: - # second claim, init controversy, change grid value from + to - - # print( - # '\tgird ({}, {}) in the same state claimed by different snakes ' - # 'with sign {}, idx {} and state {}'.format( - # x_, y_, sign, idx, state)) - if ( - self.snakes[idx].len > self.snakes[key].len - ): # shorter snake claim the controversial grid - # print('\t\tsnake {} is shorter than snake {}'.format(key, idx)) - self.snakes[idx].claimed_count -= 1 - new_open[idx].remove([x_, y_]) - self.board[x_][y_] = self.state * self.snakes_count + key - self.snakes[key].claimed_count += 1 - new_open[key].append([x_, y_]) - elif ( - self.snakes[idx].len == self.snakes[key].len - ): # controversial claim - # print( - # '\t\tcontroversy! first claimed by snake {}, then claimed by snake {}'.format(idx, key)) - self.controversy[(x_, y_)] = { - "state": self.state, - "length": self.snakes[idx].len, - "indexes": [idx, key], - } - # first claim by snake idx, then claim by snake key - self.board[x_][y_] = -self.state * self.snakes_count + 1 - # if + 2, not enough for all snakes claim one grid!! - self.snakes[ - idx - ].claimed_count -= ( - 1 # controversy, no snake claim this grid!! 
- ) - new_open[key].append([x_, y_]) - else: # (self.snakes[idx].len < self.snakes[key].len) - pass # longer snake do not claim the controversial grid - - elif ( - (x_, y_) in self.controversy - and key not in self.controversy[(x_, y_)]["indexes"] - and self.state + state == 0 - ): # third claim or more - # print('snake {} meets third or more claim in grid ({}, {})'.format(key, x_, y_)) - controversy = self.controversy[(x_, y_)] - # pprint.pprint(controversy) - if ( - controversy["length"] > self.snakes[key].len - ): # shortest snake claim grid, do 4 things - # print('\t\tsnake {} is shortest'.format(key)) - indexes_count = len(controversy["indexes"]) - for i in controversy["indexes"]: - self.snakes[i].claimed_count -= ( - 1 / indexes_count - ) # update claimed_count ! - new_open[i].remove([x_, y_]) - del self.controversy[(x_, y_)] - self.board[x_][y_] = self.state * self.snakes_count + key - self.snakes[key].claimed_count += 1 - new_open[key].append([x_, y_]) - elif ( - controversy["length"] == self.snakes[key].len - ): # controversial claim - # print('\t\tcontroversy! multi claimed by snake {}'.format(key)) - self.controversy[(x_, y_)]["indexes"].append(key) - self.board[x_][y_] += 1 - new_open[key].append([x_, y_]) - else: # (controversy['length'] < self.snakes[key].len) - pass # longer snake do not claim the controversial grid - else: - pass # do nothing with lower state grids - - self.open = new_open # update open - # update controversial snakes' claimed_count (in fraction) in the end - for _, d in self.controversy.items(): - controversial_snake_count = len( - d["indexes"] - ) # number of controversial snakes - for idx in d["indexes"]: - self.snakes[idx].claimed_count += 1 / controversial_snake_count - - -class SnakePos: - def __init__(self, snake_positions, board_height, board_width, beans_positions): - self.pos = snake_positions # [[2, 9], [2, 8], [2, 7]] - self.len = len(snake_positions) # >= 3 - self.head = snake_positions[0] - self.beans_positions = beans_positions - self.claimed_count = 0 - - displace = [ - (self.head[0] - snake_positions[1][0]) % board_height, - (self.head[1] - snake_positions[1][1]) % board_width, - ] - # print('creat snake, pos: ', self.pos, 'displace:', displace) - if displace == [ - board_height - 1, - 0, - ]: # all action are ordered by left, up, right, relative to the body - self.dir = 0 # up - self.legal_action = [2, 0, 3] - elif displace == [1, 0]: - self.dir = 1 # down - self.legal_action = [3, 1, 2] - elif displace == [0, board_width - 1]: - self.dir = 2 # left - self.legal_action = [1, 2, 0] - elif displace == [0, 1]: - self.dir = 3 # right - self.legal_action = [0, 3, 1] - else: - assert False, "snake positions error" - positions = [ - [(self.head[0] - 1) % board_height, self.head[1]], - [(self.head[0] + 1) % board_height, self.head[1]], - [self.head[0], (self.head[1] - 1) % board_width], - [self.head[0], (self.head[1] + 1) % board_width], - ] - self.legal_position = [positions[_] for _ in self.legal_action] - - def get_action(self, position): - if position not in self.legal_position: - assert False, "the start and end points do not match" - idx = self.legal_position.index(position) - return self.legal_action[idx] # 0, 1, 2, 3: up, down, left, right - - def step(self, legal_input): - if legal_input in self.legal_position: - position = legal_input - elif legal_input in self.legal_action: - idx = self.legal_action.index(legal_input) - position = self.legal_position[idx] - else: - assert False, "illegal snake move" - self.head = position - 
self.pos.insert(0, position) - if position in self.beans_positions: # eat a bean - self.len += 1 - else: # do not eat a bean - self.pos.pop() From 05e7d8bb725a8f007e889514574201e74d82e53a Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 20 Oct 2023 14:51:41 +0800 Subject: [PATCH 20/78] init v0.1.9 --- README.md | 2 +- README_zh.md | 2 +- openrl/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3f855903..07eacdf6 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ [![Embark](https://img.shields.io/badge/discord-OpenRL-%237289da.svg?logo=discord)](https://discord.gg/guvAS2up) [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://join.slack.com/t/openrlhq/shared_invite/zt-1tqwpvthd-Eeh0IxQ~DIaGqYXoW2IUQg) -OpenRL-v0.1.8 is updated on Oct 13, 2023 +OpenRL-v0.1.9 is updated on Oct 20, 2023 The main branch is the latest version of OpenRL, which is under active development. If you just want to have a try with OpenRL, you can switch to the stable branch. diff --git a/README_zh.md b/README_zh.md index b6fd07b1..531c8c34 100644 --- a/README_zh.md +++ b/README_zh.md @@ -29,7 +29,7 @@ [![Embark](https://img.shields.io/badge/discord-OpenRL-%237289da.svg?logo=discord)](https://discord.gg/guvAS2up) [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://join.slack.com/t/openrlhq/shared_invite/zt-1tqwpvthd-Eeh0IxQ~DIaGqYXoW2IUQg) -OpenRL-v0.1.8 is updated on Oct 13, 2023 +OpenRL-v0.1.9 is updated on Oct 20, 2023 The main branch is the latest version of OpenRL, which is under active development. If you just want to have a try with OpenRL, you can switch to the stable branch. diff --git a/openrl/__init__.py b/openrl/__init__.py index 254628c0..89c0f7f9 100644 --- a/openrl/__init__.py +++ b/openrl/__init__.py @@ -1,5 +1,5 @@ __TITLE__ = "openrl" -__VERSION__ = "v0.1.8" +__VERSION__ = "v0.1.9" __DESCRIPTION__ = "Distributed Deep RL Framework" __AUTHOR__ = "OpenRL Contributors" __EMAIL__ = "huangshiyu@4paradigm.com" From c7f8f3a61b25dc7bdbe54dc3ad7afb75108856ee Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Mon, 23 Oct 2023 15:24:55 +0800 Subject: [PATCH 21/78] fix typo: change loss rate to lose rate --- openrl/selfplay/selfplay_api/opponent_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openrl/selfplay/selfplay_api/opponent_model.py b/openrl/selfplay/selfplay_api/opponent_model.py index af9ec6b9..b836519b 100644 --- a/openrl/selfplay/selfplay_api/opponent_model.py +++ b/openrl/selfplay/selfplay_api/opponent_model.py @@ -49,7 +49,7 @@ def get_battle_info(self) -> Dict[str, Any]: result = {} result["win_rate"] = float(self.num_wins) / max(self.num_games, 1) result["draw_rate"] = float(self.num_draws) / max(self.num_games, 1) - result["loss_rate"] = float(self.num_losses) / max(self.num_games, 1) + result["lose_rate"] = float(self.num_losses) / max(self.num_games, 1) result["total_games"] = self.num_games return result From 5aa7d9e5d2f2638403886739ac995fadf082e311 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Mon, 23 Oct 2023 16:45:26 +0800 Subject: [PATCH 22/78] update _worker test --- openrl/envs/vec_env/async_venv.py | 9 ++- tests/test_env/test_vec_env/test_async_env.py | 75 ++++++++++++++++--- 2 files changed, 69 insertions(+), 15 deletions(-) diff --git a/openrl/envs/vec_env/async_venv.py b/openrl/envs/vec_env/async_venv.py index a244c1c9..961a8efd 100644 --- a/openrl/envs/vec_env/async_venv.py +++ b/openrl/envs/vec_env/async_venv.py @@ -734,7 
+734,7 @@ def _worker( index: int, env_fn: callable, pipe: Connection, - parent_pipe: Connection, + parent_pipe: Optional[Connection], shared_memory: bool, error_queue: Queue, auto_reset: bool = True, @@ -757,11 +757,14 @@ def prepare_obs(observation): ) observation = None return observation - - parent_pipe.close() + if parent_pipe is not None: + parent_pipe.close() try: while True: command, data = pipe.recv() + print(command) + + if command == "reset": result = env.reset(**data) diff --git a/tests/test_env/test_vec_env/test_async_env.py b/tests/test_env/test_vec_env/test_async_env.py index a69bc88b..5cdd534e 100644 --- a/tests/test_env/test_vec_env/test_async_env.py +++ b/tests/test_env/test_vec_env/test_async_env.py @@ -24,7 +24,8 @@ from openrl.envs.toy_envs import make_toy_envs from openrl.envs.vec_env.async_venv import AsyncVectorEnv - +from openrl.envs.vec_env.async_venv import _worker +import multiprocessing as mp class CustomEnvCompatibility(EnvCompatibility): def reset(self, **kwargs): @@ -48,18 +49,68 @@ def assert_env_name(env, env_name): assert env.metadata["name"].__name__ == env_name +# @pytest.mark.unittest +# def test_async_env(): +# env_name = "IdentityEnv" +# env = AsyncVectorEnv(init_envs(), shared_memory=True) +# assert ( +# env._env_name == env_name +# ), "AsyncVectorEnv should have the same metadata as the wrapped env" +# env.exec_func(assert_env_name, indices=None, env_name=env_name) +# env.call("render") +# env_name_new = "IdentityEnvNew" +# env.set_attr("metadata", {"name": env_name_new}) +# env.exec_func(assert_env_name, indices=None, env_name=env_name_new) + +def main_control(parent_pipe,child_pipe): + child_pipe.close() + + parent_pipe.send(("reset", {"seed":0})) + result, success = parent_pipe.recv() + assert success, result + + parent_pipe.send(("step", [0])) + result, success = parent_pipe.recv() + assert success, result + + parent_pipe.send(("_call", ("render",[],{}))) + result, success = parent_pipe.recv() + assert success, result + + parent_pipe.send(("_setattr", ("metadata", {"name": "IdentityEnvNew"}))) + result, success = parent_pipe.recv() + assert success, result + + parent_pipe.send(("_func_exec",(assert_env_name,None,[],{"env_name":"IdentityEnvNew"}))) + result, success = parent_pipe.recv() + assert success, result + + parent_pipe.send(("close",None)) + result, success = parent_pipe.recv() + assert success, result + + @pytest.mark.unittest -def test_async_env(): - env_name = "IdentityEnv" - env = AsyncVectorEnv(init_envs(), shared_memory=True) - assert ( - env._env_name == env_name - ), "AsyncVectorEnv should have the same metadata as the wrapped env" - env.exec_func(assert_env_name, indices=None, env_name=env_name) - env.call("render") - env_name_new = "IdentityEnvNew" - env.set_attr("metadata", {"name": env_name_new}) - env.exec_func(assert_env_name, indices=None, env_name=env_name_new) +def test_worker(): + for auto_reset in [True,False]: + ctx = mp.get_context(None) + parent_pipe, child_pipe = ctx.Pipe() + + error_queue = ctx.Queue() + + process = ctx.Process( + target=main_control, + name="test", + args=( + parent_pipe, + child_pipe + ), + ) + process.daemon = True + process.start() + _worker(0, init_envs()[0], child_pipe, None, False, error_queue, auto_reset) + + if __name__ == "__main__": From d2ff6b310a44d599f4d6dadef6317966a14071b7 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Mon, 23 Oct 2023 16:45:43 +0800 Subject: [PATCH 23/78] update _worker test --- openrl/envs/vec_env/async_venv.py | 3 +- 
tests/test_env/test_vec_env/test_async_env.py | 28 +++++++++---------- 2 files changed, 14 insertions(+), 17 deletions(-) diff --git a/openrl/envs/vec_env/async_venv.py b/openrl/envs/vec_env/async_venv.py index 961a8efd..54ab2c80 100644 --- a/openrl/envs/vec_env/async_venv.py +++ b/openrl/envs/vec_env/async_venv.py @@ -757,6 +757,7 @@ def prepare_obs(observation): ) observation = None return observation + if parent_pipe is not None: parent_pipe.close() try: @@ -764,8 +765,6 @@ def prepare_obs(observation): command, data = pipe.recv() print(command) - - if command == "reset": result = env.reset(**data) diff --git a/tests/test_env/test_vec_env/test_async_env.py b/tests/test_env/test_vec_env/test_async_env.py index 5cdd534e..ed514352 100644 --- a/tests/test_env/test_vec_env/test_async_env.py +++ b/tests/test_env/test_vec_env/test_async_env.py @@ -16,6 +16,7 @@ """""" +import multiprocessing as mp import os import sys @@ -23,9 +24,8 @@ from gymnasium.wrappers import EnvCompatibility from openrl.envs.toy_envs import make_toy_envs -from openrl.envs.vec_env.async_venv import AsyncVectorEnv -from openrl.envs.vec_env.async_venv import _worker -import multiprocessing as mp +from openrl.envs.vec_env.async_venv import AsyncVectorEnv, _worker + class CustomEnvCompatibility(EnvCompatibility): def reset(self, **kwargs): @@ -62,10 +62,11 @@ def assert_env_name(env, env_name): # env.set_attr("metadata", {"name": env_name_new}) # env.exec_func(assert_env_name, indices=None, env_name=env_name_new) -def main_control(parent_pipe,child_pipe): + +def main_control(parent_pipe, child_pipe): child_pipe.close() - parent_pipe.send(("reset", {"seed":0})) + parent_pipe.send(("reset", {"seed": 0})) result, success = parent_pipe.recv() assert success, result @@ -73,7 +74,7 @@ def main_control(parent_pipe,child_pipe): result, success = parent_pipe.recv() assert success, result - parent_pipe.send(("_call", ("render",[],{}))) + parent_pipe.send(("_call", ("render", [], {}))) result, success = parent_pipe.recv() assert success, result @@ -81,18 +82,20 @@ def main_control(parent_pipe,child_pipe): result, success = parent_pipe.recv() assert success, result - parent_pipe.send(("_func_exec",(assert_env_name,None,[],{"env_name":"IdentityEnvNew"}))) + parent_pipe.send( + ("_func_exec", (assert_env_name, None, [], {"env_name": "IdentityEnvNew"})) + ) result, success = parent_pipe.recv() assert success, result - parent_pipe.send(("close",None)) + parent_pipe.send(("close", None)) result, success = parent_pipe.recv() assert success, result @pytest.mark.unittest def test_worker(): - for auto_reset in [True,False]: + for auto_reset in [True, False]: ctx = mp.get_context(None) parent_pipe, child_pipe = ctx.Pipe() @@ -101,17 +104,12 @@ def test_worker(): process = ctx.Process( target=main_control, name="test", - args=( - parent_pipe, - child_pipe - ), + args=(parent_pipe, child_pipe), ) process.daemon = True process.start() _worker(0, init_envs()[0], child_pipe, None, False, error_queue, auto_reset) - - if __name__ == "__main__": sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 888930219ebbdd09be91e48dc6b92281eea8f2c9 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Mon, 23 Oct 2023 16:47:03 +0800 Subject: [PATCH 24/78] update _worker test --- tests/test_env/test_vec_env/test_async_env.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/test_env/test_vec_env/test_async_env.py b/tests/test_env/test_vec_env/test_async_env.py index ed514352..2c2301d3 100644 --- 
a/tests/test_env/test_vec_env/test_async_env.py +++ b/tests/test_env/test_vec_env/test_async_env.py @@ -49,18 +49,18 @@ def assert_env_name(env, env_name): assert env.metadata["name"].__name__ == env_name -# @pytest.mark.unittest -# def test_async_env(): -# env_name = "IdentityEnv" -# env = AsyncVectorEnv(init_envs(), shared_memory=True) -# assert ( -# env._env_name == env_name -# ), "AsyncVectorEnv should have the same metadata as the wrapped env" -# env.exec_func(assert_env_name, indices=None, env_name=env_name) -# env.call("render") -# env_name_new = "IdentityEnvNew" -# env.set_attr("metadata", {"name": env_name_new}) -# env.exec_func(assert_env_name, indices=None, env_name=env_name_new) +@pytest.mark.unittest +def test_async_env(): + env_name = "IdentityEnv" + env = AsyncVectorEnv(init_envs(), shared_memory=True) + assert ( + env._env_name == env_name + ), "AsyncVectorEnv should have the same metadata as the wrapped env" + env.exec_func(assert_env_name, indices=None, env_name=env_name) + env.call("render") + env_name_new = "IdentityEnvNew" + env.set_attr("metadata", {"name": env_name_new}) + env.exec_func(assert_env_name, indices=None, env_name=env_name_new) def main_control(parent_pipe, child_pipe): From 0228e51e8fab3a02c4e841cbbb09b7dc2c1b4e02 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Mon, 23 Oct 2023 17:15:01 +0800 Subject: [PATCH 25/78] fix arena petting zoo import error format --- openrl/algorithms/dqn.py | 4 ++- openrl/algorithms/vdn.py | 4 ++- openrl/arena/__init__.py | 4 ++- openrl/envs/mpe/rendering.py | 10 +++--- openrl/envs/snake/snake.py | 4 ++- openrl/envs/vec_env/async_venv.py | 34 ++++++------------- openrl/utils/callbacks/checkpoint_callback.py | 4 +-- openrl/utils/evaluation.py | 10 +++--- 8 files changed, 32 insertions(+), 42 deletions(-) diff --git a/openrl/algorithms/dqn.py b/openrl/algorithms/dqn.py index bbca547b..ebd8d727 100644 --- a/openrl/algorithms/dqn.py +++ b/openrl/algorithms/dqn.py @@ -167,7 +167,9 @@ def prepare_loss( ) q_targets = rewards_batch + self.gamma * max_next_q_values * next_masks_batch - q_loss = torch.mean(F.mse_loss(q_values, q_targets.detach())) # 均方误差损失函数 + q_loss = torch.mean( + F.mse_loss(q_values, q_targets.detach()) + ) # 均方误差损失函数 loss_list.append(q_loss) diff --git a/openrl/algorithms/vdn.py b/openrl/algorithms/vdn.py index f1215c03..83bdb5ed 100644 --- a/openrl/algorithms/vdn.py +++ b/openrl/algorithms/vdn.py @@ -211,7 +211,9 @@ def prepare_loss( rewards_batch = rewards_batch.reshape(-1, self.n_agent, 1) rewards_batch = torch.sum(rewards_batch, dim=1, keepdim=True).view(-1, 1) q_targets = rewards_batch + self.gamma * max_next_q_values * next_masks_batch - q_loss = torch.mean(F.mse_loss(q_values, q_targets.detach())) # 均方误差损失函数 + q_loss = torch.mean( + F.mse_loss(q_values, q_targets.detach()) + ) # 均方误差损失函数 loss_list.append(q_loss) return loss_list diff --git a/openrl/arena/__init__.py b/openrl/arena/__init__.py index 4bea924d..cb154a9f 100644 --- a/openrl/arena/__init__.py +++ b/openrl/arena/__init__.py @@ -30,9 +30,11 @@ def make_arena( **kwargs, ): if custom_build_env is None: + from openrl.envs import PettingZoo + if ( env_id in pettingzoo_all_envs - or env_id in openrl.envs.PettingZoo.registration.pettingzoo_env_dict.keys() + or env_id in PettingZoo.registration.pettingzoo_env_dict.keys() ): from openrl.envs.PettingZoo import make_PettingZoo_env diff --git a/openrl/envs/mpe/rendering.py b/openrl/envs/mpe/rendering.py index a7197dca..6dae5d66 100644 --- a/openrl/envs/mpe/rendering.py +++ b/openrl/envs/mpe/rendering.py @@ 
-31,12 +31,10 @@ except ImportError: print( "Error occured while running `from pyglet.gl import *`", - ( - "HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get" - " install python-opengl'. If you're running on a server, you may need a" - " virtual frame buffer; something like this should work: 'xvfb-run -s" - ' "-screen 0 1400x900x24" python \'' - ), + "HINT: make sure you have OpenGL install. On Ubuntu, you can run 'apt-get" + " install python-opengl'. If you're running on a server, you may need a" + " virtual frame buffer; something like this should work: 'xvfb-run -s" + ' "-screen 0 1400x900x24" python \'', ) import math diff --git a/openrl/envs/snake/snake.py b/openrl/envs/snake/snake.py index 73e81229..4a5be6a5 100644 --- a/openrl/envs/snake/snake.py +++ b/openrl/envs/snake/snake.py @@ -674,7 +674,9 @@ class Snake: def __init__(self, player_id, board_width, board_height, init_len): self.actions = [-2, 2, -1, 1] self.actions_name = {-2: "up", 2: "down", -1: "left", 1: "right"} - self.direction = random.choice(self.actions) # 方向[-2,2,-1,1]分别表示[上,下,左,右] + self.direction = random.choice( + self.actions + ) # 方向[-2,2,-1,1]分别表示[上,下,左,右] self.board_width = board_width self.board_height = board_height x = random.randrange(0, board_height) diff --git a/openrl/envs/vec_env/async_venv.py b/openrl/envs/vec_env/async_venv.py index 54ab2c80..dd654599 100644 --- a/openrl/envs/vec_env/async_venv.py +++ b/openrl/envs/vec_env/async_venv.py @@ -234,10 +234,8 @@ def reset_send( if self._state != AsyncState.DEFAULT: raise AlreadyPendingCallError( - ( - "Calling `reset_send` while waiting for a pending call to" - f" `{self._state.value}` to complete" - ), + "Calling `reset_send` while waiting for a pending call to" + f" `{self._state.value}` to complete", self._state.value, ) @@ -329,10 +327,8 @@ def step_send(self, actions: np.ndarray): self._assert_is_running() if self._state != AsyncState.DEFAULT: raise AlreadyPendingCallError( - ( - "Calling `step_send` while waiting for a pending call to" - f" `{self._state.value}` to complete." - ), + "Calling `step_send` while waiting for a pending call to" + f" `{self._state.value}` to complete.", self._state.value, ) @@ -342,9 +338,7 @@ def step_send(self, actions: np.ndarray): pipe.send(("step", action)) self._state = AsyncState.WAITING_STEP - def step_fetch( - self, timeout: Optional[Union[int, float]] = None - ) -> Union[ + def step_fetch(self, timeout: Optional[Union[int, float]] = None) -> Union[ Tuple[Any, NDArray[Any], NDArray[Any], List[Dict[str, Any]]], Tuple[Any, NDArray[Any], NDArray[Any], NDArray[Any], List[Dict[str, Any]]], ]: @@ -576,10 +570,8 @@ def call_send(self, name: str, *args, **kwargs): self._assert_is_running() if self._state != AsyncState.DEFAULT: raise AlreadyPendingCallError( - ( - "Calling `call_send` while waiting " - f"for a pending call to `{self._state.value}` to complete." - ), + "Calling `call_send` while waiting " + f"for a pending call to `{self._state.value}` to complete.", str(self._state.value), ) @@ -636,10 +628,8 @@ def exec_func_send(self, func: Callable, indices, *args, **kwargs): self._assert_is_running() if self._state != AsyncState.DEFAULT: raise AlreadyPendingCallError( - ( - "Calling `exec_func_send` while waiting " - f"for a pending call to `{self._state.value}` to complete." 
- ), + "Calling `exec_func_send` while waiting " + f"for a pending call to `{self._state.value}` to complete.", str(self._state.value), ) @@ -717,10 +707,8 @@ def set_attr(self, name: str, values: Union[List[Any], Tuple[Any], object]): if self._state != AsyncState.DEFAULT: raise AlreadyPendingCallError( - ( - "Calling `set_attr` while waiting " - f"for a pending call to `{self._state.value}` to complete." - ), + "Calling `set_attr` while waiting " + f"for a pending call to `{self._state.value}` to complete.", str(self._state.value), ) diff --git a/openrl/utils/callbacks/checkpoint_callback.py b/openrl/utils/callbacks/checkpoint_callback.py index a4b3f5b6..56bf31b8 100644 --- a/openrl/utils/callbacks/checkpoint_callback.py +++ b/openrl/utils/callbacks/checkpoint_callback.py @@ -72,9 +72,7 @@ def _checkpoint_path(self, checkpoint_type: str = "", extension: str = "") -> st """ return os.path.join( self.save_path, - ( - f"{self.name_prefix}_{checkpoint_type}{self.num_time_steps}_steps{'.' if extension else ''}{extension}" - ), + f"{self.name_prefix}_{checkpoint_type}{self.num_time_steps}_steps{'.' if extension else ''}{extension}", ) def _on_step(self) -> bool: diff --git a/openrl/utils/evaluation.py b/openrl/utils/evaluation.py index d603daa5..391ba10f 100644 --- a/openrl/utils/evaluation.py +++ b/openrl/utils/evaluation.py @@ -68,12 +68,10 @@ def evaluate_policy( if not is_monitor_wrapped and warn: warnings.warn( - ( - "Evaluation environment is not wrapped with a ``Monitor`` wrapper. This" - " may result in reporting modified episode lengths and rewards, if" - " other wrappers happen to modify these. Consider wrapping environment" - " first with ``Monitor`` wrapper." - ), + "Evaluation environment is not wrapped with a ``Monitor`` wrapper. This" + " may result in reporting modified episode lengths and rewards, if" + " other wrappers happen to modify these. Consider wrapping environment" + " first with ``Monitor`` wrapper.", UserWarning, ) From 537f822d79f8d89a11066b444b4339128c2baabf Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 24 Oct 2023 13:04:18 +0800 Subject: [PATCH 26/78] arena add test more envs --- examples/arena/README.md | 9 ++ examples/arena/evaluate_more_envs.py | 122 +++++++++++++++++++ examples/custom_env/rock_paper_scissors.py | 1 + openrl/arena/games/two_player_game.py | 14 ++- openrl/envs/wrappers/pettingzoo_wrappers.py | 3 +- openrl/selfplay/opponents/random_opponent.py | 17 ++- setup.py | 2 + tests/test_arena/test_new_envs.py | 107 ++++++++++++++++ 8 files changed, 264 insertions(+), 11 deletions(-) create mode 100644 examples/arena/evaluate_more_envs.py create mode 100644 tests/test_arena/test_new_envs.py diff --git a/examples/arena/README.md b/examples/arena/README.md index e9d59b91..940bea33 100644 --- a/examples/arena/README.md +++ b/examples/arena/README.md @@ -3,6 +3,7 @@ ```bash pip install "openrl[selfplay]" +pip install "pettingzoo[mpe]","pettingzoo[butterfly]" ``` ### Usage @@ -15,3 +16,11 @@ python run_arena.py ### Evaluate Google Research Football submissions for JiDi locally If you want to evaluate your Google Research Football submissions for JiDi locally, please try to use tizero as illustrated [here](foothttps://github.com/OpenRL-Lab/TiZero#evaluate-jidi-submissions-locally). + +### Evaluate more environments + +We also provide a script to evaluate more environments, including MPE, Go, Texas Holdem, Butterfly. 
You can run the script as follows: + +```shell +python evaluate_more_envs.py +``` \ No newline at end of file diff --git a/examples/arena/evaluate_more_envs.py b/examples/arena/evaluate_more_envs.py new file mode 100644 index 00000000..1433bf06 --- /dev/null +++ b/examples/arena/evaluate_more_envs.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" + +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" + +from pettingzoo.butterfly import cooperative_pong_v5 +from pettingzoo.classic import connect_four_v3, go_v5, texas_holdem_no_limit_v6 +from pettingzoo.mpe import simple_push_v3 + +from examples.custom_env.rock_paper_scissors import RockPaperScissors +from openrl.arena import make_arena +from openrl.arena.agents.local_agent import LocalAgent +from openrl.envs.PettingZoo.registration import register +from openrl.envs.wrappers.pettingzoo_wrappers import RecordWinner + + +def ConnectFourEnv(render_mode, **kwargs): + return connect_four_v3.env(render_mode) + + +def RockPaperScissorsEnv(render_mode, **kwargs): + return RockPaperScissors(render_mode) + + +def GoEnv(render_mode, **kwargs): + return go_v5.env(render_mode=render_mode, board_size=5, komi=7.5) + + +def TexasHoldemEnv(render_mode, **kwargs): + return texas_holdem_no_limit_v6.env(render_mode=render_mode) + + +# MPE +def SimplePushEnv(render_mode, **kwargs): + return simple_push_v3.env(render_mode=render_mode) + + +def CooperativePongEnv(render_mode, **kwargs): + return cooperative_pong_v5.env(render_mode=render_mode) + + +def register_new_envs(): + new_env_dict = { + "connect_four_v3": ConnectFourEnv, + "RockPaperScissors": RockPaperScissorsEnv, + "go_v5": GoEnv, + "texas_holdem_no_limit_v6": TexasHoldemEnv, + "simple_push_v3": SimplePushEnv, + "cooperative_pong_v5": CooperativePongEnv, + } + + for env_id, env in new_env_dict.items(): + register(env_id, env) + return new_env_dict.keys() + + +def run_arena( + env_id: str, + parallel: bool = True, + seed=0, + total_games: int = 10, + max_game_onetime: int = 5, +): + env_wrappers = [RecordWinner] + + arena = make_arena(env_id, env_wrappers=env_wrappers, use_tqdm=False) + + agent1 = LocalAgent("../selfplay/opponent_templates/random_opponent") + agent2 = LocalAgent("../selfplay/opponent_templates/random_opponent") + + arena.reset( + agents={"agent1": agent1, "agent2": agent2}, + 
total_games=total_games, + max_game_onetime=max_game_onetime, + seed=seed, + ) + result = arena.run(parallel=parallel) + arena.close() + print(result) + return result + + +def test_new_envs(): + env_ids = register_new_envs() + seed = 0 + for env_id in env_ids: + run_arena(env_id=env_id, seed=seed, parallel=False, total_games=1) + + +if __name__ == "__main__": + test_new_envs() diff --git a/examples/custom_env/rock_paper_scissors.py b/examples/custom_env/rock_paper_scissors.py index 2811a1ff..7d5649d1 100644 --- a/examples/custom_env/rock_paper_scissors.py +++ b/examples/custom_env/rock_paper_scissors.py @@ -182,6 +182,7 @@ def step(self, action): # handles stepping an agent which is already dead # accepts a None action for the one agent, and moves the agent_selection to # the next dead agent, or if there are no more dead agents, to the next live agent + action = None self._was_dead_step(action) return diff --git a/openrl/arena/games/two_player_game.py b/openrl/arena/games/two_player_game.py index 7a1b4e0e..40585393 100644 --- a/openrl/arena/games/two_player_game.py +++ b/openrl/arena/games/two_player_game.py @@ -31,9 +31,10 @@ def default_dispatch_func( players: List[str], agent_names: List[str], ) -> Dict[str, str]: - assert len(players) == len( - agent_names - ), "The number of players must be equal to the number of agents." + assert len(players) == len(agent_names), ( + f"The number of players {len(players)} must be equal to the number of" + f" agents: {len(agent_names)}." + ) assert len(players) == 2, "The number of players must be equal to 2." np_random.shuffle(agent_names) return dict(zip(players, agent_names)) @@ -49,20 +50,21 @@ def _run(self, env_fn: Callable, agents: List[BaseAgent]): for player, agent in player2agent.items(): agent.reset(env, player) result = {} + truncation_dict = {} while True: termination = False info = {} for player_name in env.agent_iter(): observation, reward, termination, truncation, info = env.last() - - if termination: + truncation_dict[player_name] = truncation + if termination or all(truncation_dict.values()): break action = player2agent[player_name].act( player_name, observation, reward, termination, truncation, info ) env.step(action) - if termination: + if termination or all(truncation_dict.values()): assert "winners" in info, "The game is terminated but no winners." assert "losers" in info, "The game is terminated but no losers." 
diff --git a/openrl/envs/wrappers/pettingzoo_wrappers.py b/openrl/envs/wrappers/pettingzoo_wrappers.py index 226fdb9f..c571ff79 100644 --- a/openrl/envs/wrappers/pettingzoo_wrappers.py +++ b/openrl/envs/wrappers/pettingzoo_wrappers.py @@ -96,8 +96,9 @@ def last(self, observe: bool = True): winners = None losers = None + for agent in self.terminations: - if self.terminations[agent]: + if self.terminations[agent] or all(self.truncations): if winners is None: winners = self.get_winners() losers = [player for player in self.agents if player not in winners] diff --git a/openrl/selfplay/opponents/random_opponent.py b/openrl/selfplay/opponents/random_opponent.py index 1f396c34..501d571a 100644 --- a/openrl/selfplay/opponents/random_opponent.py +++ b/openrl/selfplay/opponents/random_opponent.py @@ -47,11 +47,20 @@ def _sample_random_action( action = [] for obs, space in zip(observation, action_space): - mask = obs.get("action_mask", None) - action.append(space.sample(mask)) + if termination or truncation: + action.append(None) + else: + if isinstance(obs, dict): + mask = obs.get("action_mask", None) + else: + mask = None + action.append(space.sample(mask)) else: - mask = observation.get("action_mask", None) - action = action_space.sample(mask) + if termination or truncation: + action = None + else: + mask = observation.get("action_mask", None) + action = action_space.sample(mask) return action def _load(self, opponent_path: Union[str, Path]): diff --git a/setup.py b/setup.py index ffb91a94..172343bf 100644 --- a/setup.py +++ b/setup.py @@ -71,11 +71,13 @@ def get_extra_requires() -> dict: "evaluate", ], "selfplay": ["ray[default]", "ray[serve]", "pettingzoo[classic]", "trueskill"], + "selfplay_test": ["pettingzoo[mpe]", "pettingzoo[butterfly]"], "retro": ["gym-retro"], "super_mario": ["gym-super-mario-bros"], "atari": ["gymnasium[atari]", "gymnasium[accept-rom-license]"], } req["test"].extend(req["selfplay"]) + req["test"].extend(req["selfplay_test"]) req["test"].extend(req["atari"]) req["test"].extend(req["nlp_test"]) return req diff --git a/tests/test_arena/test_new_envs.py b/tests/test_arena/test_new_envs.py new file mode 100644 index 00000000..5dc6231e --- /dev/null +++ b/tests/test_arena/test_new_envs.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" +import os +import sys + +import pytest +from pettingzoo.butterfly import cooperative_pong_v5 +from pettingzoo.classic import connect_four_v3, go_v5, texas_holdem_no_limit_v6 +from pettingzoo.mpe import simple_push_v3 + +from examples.custom_env.rock_paper_scissors import RockPaperScissors +from openrl.arena import make_arena +from openrl.arena.agents.local_agent import LocalAgent +from openrl.envs.PettingZoo.registration import register +from openrl.envs.wrappers.pettingzoo_wrappers import RecordWinner + + +def ConnectFourEnv(render_mode, **kwargs): + return connect_four_v3.env(render_mode) + + +def RockPaperScissorsEnv(render_mode, **kwargs): + return RockPaperScissors(render_mode) + + +def GoEnv(render_mode, **kwargs): + return go_v5.env(render_mode=render_mode, board_size=5, komi=7.5) + + +def TexasHoldemEnv(render_mode, **kwargs): + return texas_holdem_no_limit_v6.env(render_mode=render_mode) + + +# MPE +def SimplePushEnv(render_mode, **kwargs): + return simple_push_v3.env(render_mode=render_mode) + + +def CooperativePongEnv(render_mode, **kwargs): + return cooperative_pong_v5.env(render_mode=render_mode) + + +def register_new_envs(): + new_env_dict = { + "connect_four_v3": ConnectFourEnv, + "RockPaperScissors": RockPaperScissorsEnv, + "go_v5": GoEnv, + "texas_holdem_no_limit_v6": TexasHoldemEnv, + "simple_push_v3": SimplePushEnv, + "cooperative_pong_v5": CooperativePongEnv, + } + + for env_id, env in new_env_dict.items(): + register(env_id, env) + return new_env_dict.keys() + + +def run_arena( + env_id: str, + parallel: bool = True, + seed=0, + total_games: int = 10, + max_game_onetime: int = 5, +): + env_wrappers = [RecordWinner] + + arena = make_arena(env_id, env_wrappers=env_wrappers, use_tqdm=False) + + agent1 = LocalAgent("./examples/selfplay/opponent_templates/random_opponent") + agent2 = LocalAgent("./examples/selfplay/opponent_templates/random_opponent") + + arena.reset( + agents={"agent1": agent1, "agent2": agent2}, + total_games=total_games, + max_game_onetime=max_game_onetime, + seed=seed, + ) + result = arena.run(parallel=parallel) + arena.close() + return result + + +@pytest.mark.unittest +def test_new_envs(): + env_ids = register_new_envs() + seed = 0 + for env_id in env_ids: + run_arena(env_id=env_id, seed=seed, parallel=False, total_games=1) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 5ec7e29fab11df78d4f7b9cbeb5460129de01a7d Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 24 Oct 2023 13:05:59 +0800 Subject: [PATCH 27/78] update --- examples/arena/evaluate_more_envs.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/examples/arena/evaluate_more_envs.py b/examples/arena/evaluate_more_envs.py index 1433bf06..1f981346 100644 --- a/examples/arena/evaluate_more_envs.py +++ b/examples/arena/evaluate_more_envs.py @@ -16,24 +16,6 @@ """""" -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright 2023 The OpenRL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""""" - from pettingzoo.butterfly import cooperative_pong_v5 from pettingzoo.classic import connect_four_v3, go_v5, texas_holdem_no_limit_v6 from pettingzoo.mpe import simple_push_v3 From b41fbc8c59c4279e7db722f2635135a0d4cdcc2b Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 24 Oct 2023 13:10:55 +0800 Subject: [PATCH 28/78] update --- examples/arena/evaluate_more_envs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/arena/evaluate_more_envs.py b/examples/arena/evaluate_more_envs.py index 1f981346..f55dc576 100644 --- a/examples/arena/evaluate_more_envs.py +++ b/examples/arena/evaluate_more_envs.py @@ -17,10 +17,10 @@ """""" from pettingzoo.butterfly import cooperative_pong_v5 -from pettingzoo.classic import connect_four_v3, go_v5, texas_holdem_no_limit_v6 +from pettingzoo.classic import connect_four_v3, go_v5, texas_holdem_no_limit_v6,rps_v2 from pettingzoo.mpe import simple_push_v3 -from examples.custom_env.rock_paper_scissors import RockPaperScissors + from openrl.arena import make_arena from openrl.arena.agents.local_agent import LocalAgent from openrl.envs.PettingZoo.registration import register @@ -32,7 +32,7 @@ def ConnectFourEnv(render_mode, **kwargs): def RockPaperScissorsEnv(render_mode, **kwargs): - return RockPaperScissors(render_mode) + return rps_v2.env(num_actions=3, max_cycles=15) def GoEnv(render_mode, **kwargs): From a2df15d4741770b7ff7e935c6cc70cadf3a30a2a Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Wed, 25 Oct 2023 11:29:14 +0800 Subject: [PATCH 29/78] add RandomAgent for Arena --- examples/arena/evaluate_more_envs.py | 6 ++--- examples/arena/run_arena.py | 14 +++++++++--- openrl/arena/agents/random_agent.py | 29 ++++++++++++++++++++++++ tests/test_arena/test_new_envs.py | 3 ++- tests/test_arena/test_reproducibility.py | 3 ++- 5 files changed, 47 insertions(+), 8 deletions(-) create mode 100644 openrl/arena/agents/random_agent.py diff --git a/examples/arena/evaluate_more_envs.py b/examples/arena/evaluate_more_envs.py index f55dc576..3b7bfe07 100644 --- a/examples/arena/evaluate_more_envs.py +++ b/examples/arena/evaluate_more_envs.py @@ -17,12 +17,12 @@ """""" from pettingzoo.butterfly import cooperative_pong_v5 -from pettingzoo.classic import connect_four_v3, go_v5, texas_holdem_no_limit_v6,rps_v2 +from pettingzoo.classic import connect_four_v3, go_v5, rps_v2, texas_holdem_no_limit_v6 from pettingzoo.mpe import simple_push_v3 - from openrl.arena import make_arena from openrl.arena.agents.local_agent import LocalAgent +from openrl.arena.agents.random_agent import RandomAgent from openrl.envs.PettingZoo.registration import register from openrl.envs.wrappers.pettingzoo_wrappers import RecordWinner @@ -79,7 +79,7 @@ def run_arena( arena = make_arena(env_id, env_wrappers=env_wrappers, use_tqdm=False) agent1 = LocalAgent("../selfplay/opponent_templates/random_opponent") - agent2 = LocalAgent("../selfplay/opponent_templates/random_opponent") + agent2 = RandomAgent() arena.reset( agents={"agent1": agent1, "agent2": agent2}, diff --git a/examples/arena/run_arena.py b/examples/arena/run_arena.py index e880884c..fdc0776a 100644 --- a/examples/arena/run_arena.py +++ b/examples/arena/run_arena.py @@ -17,6 +17,7 @@ """""" from openrl.arena import make_arena from openrl.arena.agents.local_agent import LocalAgent +from openrl.arena.agents.random_agent import RandomAgent from openrl.envs.wrappers.pettingzoo_wrappers import RecordWinner @@ -37,7 +38,7 @@ def run_arena( arena = make_arena("tictactoe_v3", env_wrappers=env_wrappers, 
use_tqdm=use_tqdm) agent1 = LocalAgent("../selfplay/opponent_templates/random_opponent") - agent2 = LocalAgent("../selfplay/opponent_templates/random_opponent") + agent2 = RandomAgent() arena.reset( agents={"agent1": agent1, "agent2": agent2}, @@ -52,5 +53,12 @@ def run_arena( if __name__ == "__main__": - run_arena(render=False, parallel=True, seed=0, total_games=100, max_game_onetime=10) - # run_arena(render=False, parallel=False, seed=1, total_games=1, max_game_onetime=1,use_tqdm=False) + # run_arena(render=False, parallel=True, seed=0, total_games=100, max_game_onetime=10) + run_arena( + render=False, + parallel=False, + seed=1, + total_games=300, + max_game_onetime=1, + use_tqdm=False, + ) diff --git a/openrl/arena/agents/random_agent.py b/openrl/arena/agents/random_agent.py new file mode 100644 index 00000000..d09e5e15 --- /dev/null +++ b/openrl/arena/agents/random_agent.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" +from openrl.arena.agents.base_agent import BaseAgent +from openrl.selfplay.opponents.base_opponent import BaseOpponent +from openrl.selfplay.opponents.random_opponent import RandomOpponent +from openrl.selfplay.opponents.utils import load_opponent_from_path + + +class RandomAgent(BaseAgent): + def __init__(self): + super().__init__() + + def _new_agent(self) -> BaseOpponent: + return RandomOpponent() diff --git a/tests/test_arena/test_new_envs.py b/tests/test_arena/test_new_envs.py index 5dc6231e..7a5dc01d 100644 --- a/tests/test_arena/test_new_envs.py +++ b/tests/test_arena/test_new_envs.py @@ -26,6 +26,7 @@ from examples.custom_env.rock_paper_scissors import RockPaperScissors from openrl.arena import make_arena from openrl.arena.agents.local_agent import LocalAgent +from openrl.arena.agents.random_agent import RandomAgent from openrl.envs.PettingZoo.registration import register from openrl.envs.wrappers.pettingzoo_wrappers import RecordWinner @@ -82,7 +83,7 @@ def run_arena( arena = make_arena(env_id, env_wrappers=env_wrappers, use_tqdm=False) agent1 = LocalAgent("./examples/selfplay/opponent_templates/random_opponent") - agent2 = LocalAgent("./examples/selfplay/opponent_templates/random_opponent") + agent2 = RandomAgent() arena.reset( agents={"agent1": agent1, "agent2": agent2}, diff --git a/tests/test_arena/test_reproducibility.py b/tests/test_arena/test_reproducibility.py index 0d186ab0..9ced525c 100644 --- a/tests/test_arena/test_reproducibility.py +++ b/tests/test_arena/test_reproducibility.py @@ -22,6 +22,7 @@ from openrl.arena import make_arena from openrl.arena.agents.local_agent import LocalAgent +from openrl.arena.agents.random_agent import RandomAgent from openrl.envs.wrappers.pettingzoo_wrappers import RecordWinner @@ -41,7 +42,7 @@ def run_arena( arena = make_arena("tictactoe_v3", env_wrappers=env_wrappers, use_tqdm=False) agent1 = LocalAgent("./examples/selfplay/opponent_templates/random_opponent") - agent2 = 
LocalAgent("./examples/selfplay/opponent_templates/random_opponent") + agent2 = RandomAgent() arena.reset( agents={"agent1": agent1, "agent2": agent2}, From f6c608049423db9c127080177e9c1aff139d7e8f Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Wed, 25 Oct 2023 14:58:12 +0800 Subject: [PATCH 30/78] add test attention.py --- openrl/configs/config.py | 2 +- openrl/modules/networks/utils/attention.py | 17 ++++--- .../test_networks/test_attention.py | 46 +++++++++++++++++++ 3 files changed, 58 insertions(+), 7 deletions(-) create mode 100644 tests/test_modules/test_networks/test_attention.py diff --git a/openrl/configs/config.py b/openrl/configs/config.py index 457e69c1..c0fd176d 100644 --- a/openrl/configs/config.py +++ b/openrl/configs/config.py @@ -618,7 +618,7 @@ def create_config_parser(): ) parser.add_argument( "--use_average_pool", - action="store_false", + type=bool, default=True, help="by default True, use average pooling for attn model.", ) diff --git a/openrl/modules/networks/utils/attention.py b/openrl/modules/networks/utils/attention.py index c00a3f24..a05a9a84 100644 --- a/openrl/modules/networks/utils/attention.py +++ b/openrl/modules/networks/utils/attention.py @@ -234,10 +234,13 @@ def forward(self, x, self_idx=-1): K = self.split_shape[i][0] L = self.split_shape[i][1] for j in range(K): - torch.cat((x[i][:, (L * j) : (L * j + L)], self_x), dim=-1) - exec("x1.append(self.fc_{}(temp))".format(i)) - x[self_idx] - exec("x1.append(self.fc_{}(temp))".format(N - 1)) + # torch.cat((x[i][:, (L * j) : (L * j + L)], self_x), dim=-1) + # exec("x1.append(self.fc_{}(temp))".format(i)) + temp = torch.cat((x[i][:, (L * j) : (L * j + L)], self_x), dim=-1) + x1.append(getattr(self, "fc_" + str(i))(temp)) + x1.append(getattr(self, "fc_" + str(N - 1))(self_x)) + # x[self_idx] + # exec("x1.append(self.fc_{}(temp))".format(N - 1)) out = torch.stack(x1, 1) @@ -278,8 +281,10 @@ def forward(self, x, self_idx=None): K = self.split_shape[i][0] L = self.split_shape[i][1] for j in range(K): - x[i][:, (L * j) : (L * j + L)] - exec("x1.append(self.fc_{}(temp))".format(i)) + # x[i][:, (L * j) : (L * j + L)] + # exec("x1.append(self.fc_{}(temp))".format(i)) + temp = x[i][:, (L * j) : (L * j + L)] + x1.append(getattr(self, "fc_" + str(i))(temp)) out = torch.stack(x1, 1) diff --git a/tests/test_modules/test_networks/test_attention.py b/tests/test_modules/test_networks/test_attention.py new file mode 100644 index 00000000..599d0000 --- /dev/null +++ b/tests/test_modules/test_networks/test_attention.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" + +import os +import sys + +import pytest +import torch + +from openrl.configs.config import create_config_parser +from openrl.modules.networks.utils.attention import Encoder + + +@pytest.fixture( + scope="module", params=["--use_average_pool True", "--use_average_pool False"] +) +def config(request): + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +@pytest.mark.unittest +def test_attention(config): + for cat_self in [False, True]: + net = Encoder(cfg=config, split_shape=[[1, 1], [1, 1]], cat_self=cat_self) + net(torch.zeros((1, 1))) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From f3667ab71161c7a3621705935309aecc05aad25b Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 26 Oct 2023 17:17:20 +0800 Subject: [PATCH 31/78] update test --- .gitignore | 1 - examples/nlp/nlp_ppo.yaml | 1 + openrl/envs/__init__.py | 3 - openrl/envs/nlp/daily_dialog_env.py | 2 +- openrl/envs/nlp/rewards/intent.py | 38 ++++- openrl/envs/nlp/rewards/kl_penalty.py | 11 +- openrl/envs/nlp/rewards/meteor.py | 16 +- openrl/envs/toy_envs/__init__.py | 20 +-- openrl/envs/toy_envs/identity_env.py | 111 ------------- openrl/envs/toy_envs/multi_input_envs.py | 185 ---------------------- openrl/envs/vec_env/async_venv.py | 1 - openrl/rewards/nlp_reward.py | 7 +- openrl/utils/logger.py | 6 +- setup.py | 9 +- tests/test_dataset/test_expert_dataset.py | 89 +++++++++++ tests/test_rewards/test_nlp_reward.py | 73 +++++++++ 16 files changed, 235 insertions(+), 338 deletions(-) delete mode 100644 openrl/envs/toy_envs/multi_input_envs.py create mode 100644 tests/test_dataset/test_expert_dataset.py create mode 100644 tests/test_rewards/test_nlp_reward.py diff --git a/.gitignore b/.gitignore index c92a6657..80ced1f6 100644 --- a/.gitignore +++ b/.gitignore @@ -153,7 +153,6 @@ run_results/ api_docs .vscode *.pkl -api_docs *.json opponent_pool !/examples/selfplay/opponent_templates/tictactoe_opponent/info.json diff --git a/examples/nlp/nlp_ppo.yaml b/examples/nlp/nlp_ppo.yaml index 0b4e0f56..1ea1cc5b 100644 --- a/examples/nlp/nlp_ppo.yaml +++ b/examples/nlp/nlp_ppo.yaml @@ -28,5 +28,6 @@ reward_class: args: { "intent_model": "rajkumarrrk/roberta-daily-dialog-intent-classifier", "ref_model": "rajkumarrrk/gpt2-fine-tuned-on-daily-dialog", + "use_deepspeed": true, } \ No newline at end of file diff --git a/openrl/envs/__init__.py b/openrl/envs/__init__.py index a2eb835f..d12c493a 100644 --- a/openrl/envs/__init__.py +++ b/openrl/envs/__init__.py @@ -16,12 +16,9 @@ toy_all_envs = [ "BitFlippingEnv", - "FakeImageEnv", "IdentityEnv", "IdentityEnvcontinuous", "IdentityEnvBox", - "IdentityEnvMultiBinary", - "IdentityEnvMultiDiscrete", "SimpleMultiObsEnv", "SimpleMultiObsEnv", ] diff --git a/openrl/envs/nlp/daily_dialog_env.py b/openrl/envs/nlp/daily_dialog_env.py index 4cb49df1..61e68946 100644 --- a/openrl/envs/nlp/daily_dialog_env.py +++ b/openrl/envs/nlp/daily_dialog_env.py @@ -43,7 +43,7 @@ def __init__( self.env_name = "daily_dialog" tokenizer_name = cfg.env.args["tokenizer_path"] if tokenizer_name == "builtin_BPE": - from tokenizers import AddedToken, Tokenizer, models + from tokenizers import Tokenizer, models self.tokenizer = Tokenizer(models.BPE()) diff --git a/openrl/envs/nlp/rewards/intent.py b/openrl/envs/nlp/rewards/intent.py index f2f9bf11..812cc5f4 100644 --- a/openrl/envs/nlp/rewards/intent.py +++ b/openrl/envs/nlp/rewards/intent.py @@ -25,15 +25,42 @@ def get_eval_ds_config(offload, stage=0): class Intent: - 
def __init__(self, intent_model: str, intent_coeff: float = 1.0) -> None: + def __init__( + self, intent_model: str, intent_coeff: float = 1.0, use_deepspeed: bool = True + ) -> None: super().__init__() self._intent_coeff = intent_coeff - self.use_deepspeed = True # TODO + self.use_deepspeed = use_deepspeed + if intent_model == "builtin_intent": + from transformers import GPT2Config, GPT2LMHeadModel + + class TestTokenizer: + def __call__( + self, + input_texts, + return_tensors="pt", + truncation=True, + padding=True, + max_length=None, + ): + class EncodedOutput: + def __init__(self, input_ids, attention_mask): + self.input_ids = input_ids + self.attention_mask = attention_mask + + input_ids = torch.zeros((32), dtype=torch.long) + attention_masks = torch.zeros((32), dtype=torch.long) + return EncodedOutput(input_ids, attention_masks) + + self._tokenizer = TestTokenizer() + config = GPT2Config() + self._model = GPT2LMHeadModel(config) - model_path = data_abs_path(intent_model) - self._tokenizer = AutoTokenizer.from_pretrained(intent_model) - self._model = AutoModelForSequenceClassification.from_pretrained(model_path) + else: + model_path = data_abs_path(intent_model) + self._tokenizer = AutoTokenizer.from_pretrained(intent_model) + self._model = AutoModelForSequenceClassification.from_pretrained(model_path) if self.use_deepspeed: import deepspeed @@ -87,6 +114,7 @@ def get_input_for_classifier(prompt, generated_text): input_ids=encoded.input_ids.to(self._device), attention_mask=encoded.attention_mask.to(self._device), ) + pred_labels = torch.argmax(outputs.logits, dim=1).tolist() score = (np.array(pred_labels) == np.array(target_intents)) * 1.0 diff --git a/openrl/envs/nlp/rewards/kl_penalty.py b/openrl/envs/nlp/rewards/kl_penalty.py index ea109c45..643d263d 100644 --- a/openrl/envs/nlp/rewards/kl_penalty.py +++ b/openrl/envs/nlp/rewards/kl_penalty.py @@ -31,14 +31,21 @@ def __init__( action_space: gym.Space, ref_model: str, apply_model_parallel: bool = True, + use_deepspeed: bool = True, ): super().__init__() - self.use_deepspeed = True + self.use_deepspeed = use_deepspeed self.use_fp16 = True # reference model self._apply_model_parallel = apply_model_parallel - self._ref_net = AutoModelForCausalLM.from_pretrained(ref_model) + if ref_model == "builtin_ref": + from transformers import GPT2Config, GPT2LMHeadModel + + config = GPT2Config() + self._ref_net = GPT2LMHeadModel(config) + else: + self._ref_net = AutoModelForCausalLM.from_pretrained(ref_model) self._ref_net = self._ref_net.eval() if self.use_deepspeed: import deepspeed diff --git a/openrl/envs/nlp/rewards/meteor.py b/openrl/envs/nlp/rewards/meteor.py index c9acd16f..5bd169ad 100644 --- a/openrl/envs/nlp/rewards/meteor.py +++ b/openrl/envs/nlp/rewards/meteor.py @@ -6,13 +6,21 @@ import openrl.envs.nlp as nlp +class VirtualMetric: + def compute(self, predictions: Any, references: Any) -> Dict[str, float]: + return {"meteor": 0.0} + + class Meteor: - def __init__(self, meteor_coeff: int) -> None: + def __init__(self, meteor_coeff: int, test: bool = False) -> None: super().__init__() self._meteor_coeff = meteor_coeff - self._metric = evaluate.load( - str(Path(nlp.__file__).parent / "utils/metrics/meteor.py") - ) + if test: + self._metric = VirtualMetric() + else: + self._metric = evaluate.load( + str(Path(nlp.__file__).parent / "utils/metrics/meteor.py") + ) def __call__( self, diff --git a/openrl/envs/toy_envs/__init__.py b/openrl/envs/toy_envs/__init__.py index 4e6588ef..cf785cc5 100644 --- a/openrl/envs/toy_envs/__init__.py +++ 
b/openrl/envs/toy_envs/__init__.py @@ -18,25 +18,12 @@ from typing import Any from openrl.envs.toy_envs.bit_flipping_env import BitFlippingEnv -from openrl.envs.toy_envs.identity_env import ( - FakeImageEnv, - IdentityEnv, - IdentityEnvBox, - IdentityEnvcontinuous, - IdentityEnvMultiBinary, - IdentityEnvMultiDiscrete, -) -from openrl.envs.toy_envs.multi_input_envs import SimpleMultiObsEnv +from openrl.envs.toy_envs.identity_env import IdentityEnv, IdentityEnvcontinuous __all__ = [ "BitFlippingEnv", - "FakeImageEnv", "IdentityEnv", "IdentityEnvcontinuous", - "IdentityEnvBox", - "IdentityEnvMultiBinary", - "IdentityEnvMultiDiscrete", - "SimpleMultiObsEnv", ] @@ -49,13 +36,8 @@ env_dict = { "BitFlippingEnv": BitFlippingEnv, - "FakeImageEnv": FakeImageEnv, "IdentityEnv": IdentityEnv, "IdentityEnvcontinuous": IdentityEnvcontinuous, - "IdentityEnvBox": IdentityEnvBox, - "IdentityEnvMultiBinary": IdentityEnvMultiBinary, - "IdentityEnvMultiDiscrete": IdentityEnvMultiDiscrete, - "SimpleMultiObsEnv": SimpleMultiObsEnv, } diff --git a/openrl/envs/toy_envs/identity_env.py b/openrl/envs/toy_envs/identity_env.py index c3d4caa2..c3867756 100644 --- a/openrl/envs/toy_envs/identity_env.py +++ b/openrl/envs/toy_envs/identity_env.py @@ -157,114 +157,3 @@ def _get_reward(self, action: T) -> float: def render(self, mode: str = "human") -> None: pass - - -# Not Work Yet -class IdentityEnvBox(IdentityEnv[np.ndarray]): - def __init__( - self, - low: float = -1.0, - high: float = 1.0, - eps: float = 0.05, - ep_length: int = 100, - ): - """ - Identity environment for testing purposes - - :param low: the lower bound of the box dim - :param high: the upper bound of the box dim - :param eps: the epsilon bound for correct value - :param ep_length: the length of each episode in timesteps - """ - space = spaces.Box(low=low, high=high, shape=(1,), dtype=np.float32) - super().__init__(ep_length=ep_length, space=space) - self.eps = eps - - def step( - self, action: np.ndarray - ) -> Tuple[np.ndarray, float, bool, Dict[str, Any]]: - reward = self._get_reward(action) - self._choose_next_state() - self.current_step += 1 - done = self.current_step >= self.ep_length - return self.state, reward, done, {} - - def _get_reward(self, action: np.ndarray) -> float: - return ( - 1.0 if (self.state - self.eps) <= action <= (self.state + self.eps) else 0.0 - ) - - -# Not Work Yet -class IdentityEnvMultiDiscrete(IdentityEnv[np.ndarray]): - def __init__(self, dim: int = 1, ep_length: int = 100) -> None: - """ - Identity environment for testing purposes - - :param dim: the size of the dimensions you want to learn - :param ep_length: the length of each episode in timesteps - """ - space = spaces.MultiDiscrete([dim, dim]) - super().__init__(ep_length=ep_length, space=space) - - -# Not Work Yet -class IdentityEnvMultiBinary(IdentityEnv[np.ndarray]): - def __init__(self, dim: int = 1, ep_length: int = 100) -> None: - """ - Identity environment for testing purposes - - :param dim: the size of the dimensions you want to learn - :param ep_length: the length of each episode in timesteps - """ - space = spaces.MultiBinary(dim) - super().__init__(ep_length=ep_length, space=space) - - -# Not Work Yet -class FakeImageEnv(gym.Env): - """ - Fake image environment for testing purposes, it mimics Atari games. 
- - :param action_dim: Number of discrete actions - :param screen_height: Height of the image - :param screen_width: Width of the image - :param n_channels: Number of color channels - :param discrete: Create discrete action space instead of continuous - :param channel_first: Put channels on first axis instead of last - """ - - def __init__( - self, - action_dim: int = 6, - screen_height: int = 84, - screen_width: int = 84, - n_channels: int = 1, - discrete: bool = True, - channel_first: bool = False, - ) -> None: - self.observation_shape = (screen_height, screen_width, n_channels) - if channel_first: - self.observation_shape = (n_channels, screen_height, screen_width) - self.observation_space = spaces.Box( - low=0, high=255, shape=self.observation_shape, dtype=np.uint8 - ) - if discrete: - self.action_space = spaces.Discrete(action_dim) - else: - self.action_space = spaces.Box(low=-1, high=1, shape=(5,), dtype=np.float32) - self.ep_length = 10 - self.current_step = 0 - - def reset(self) -> np.ndarray: - self.current_step = 0 - return self.observation_space.sample() - - def step(self, action: Union[np.ndarray, int]): - reward = 0.0 - self.current_step += 1 - done = self.current_step >= self.ep_length - return self.observation_space.sample(), reward, done, {} - - def render(self, mode: str = "human") -> None: - pass diff --git a/openrl/envs/toy_envs/multi_input_envs.py b/openrl/envs/toy_envs/multi_input_envs.py deleted file mode 100644 index eccb1f6f..00000000 --- a/openrl/envs/toy_envs/multi_input_envs.py +++ /dev/null @@ -1,185 +0,0 @@ -from typing import Dict, Union - -import gymnasium as gym -import numpy as np -from gymnasium import spaces - - -# Not Work Yet -class SimpleMultiObsEnv(gym.Env): - """ - Base class for GridWorld-based MultiObs Environments 4x4 grid world. - - .. code-block:: text - - ____________ - | 0 1 2 3| - | 4|¯5¯¯6¯| 7| - | 8|_9_10_|11| - |12 13 14 15| - ¯¯¯¯¯¯¯¯¯¯¯¯¯¯ - - start is 0 - states 5, 6, 9, and 10 are blocked - goal is 15 - actions are = [left, down, right, up] - - simple linear state env of 15 states but encoded with a vector and an image observation: - each column is represented by a random vector and each row is - represented by a random image, both sampled once at creation time. 
- - :param num_col: Number of columns in the grid - :param num_row: Number of rows in the grid - :param random_start: If true, agent starts in random position - :param channel_last: If true, the image will be channel last, else it will be channel first - """ - - def __init__( - self, - num_col: int = 4, - num_row: int = 4, - random_start: bool = True, - discrete_actions: bool = True, - channel_last: bool = True, - ): - super().__init__() - - self.vector_size = 5 - if channel_last: - self.img_size = [64, 64, 1] - else: - self.img_size = [1, 64, 64] - - self.random_start = random_start - self.discrete_actions = discrete_actions - if discrete_actions: - self.action_space = spaces.Discrete(4) - else: - self.action_space = spaces.Box(0, 1, (4,)) - - self.observation_space = spaces.Dict( - spaces={ - "vec": spaces.Box(0, 1, (self.vector_size,), dtype=np.float64), - "img": spaces.Box(0, 255, self.img_size, dtype=np.uint8), - } - ) - self.count = 0 - # Timeout - self.max_count = 100 - self.log = "" - self.state = 0 - self.action2str = ["left", "down", "right", "up"] - self.init_possible_transitions() - - self.num_col = num_col - self.state_mapping = [] - self.init_state_mapping(num_col, num_row) - - self.max_state = len(self.state_mapping) - 1 - - def init_state_mapping(self, num_col: int, num_row: int) -> None: - """ - Initializes the state_mapping array which holds the observation values for each state - - :param num_col: Number of columns. - :param num_row: Number of rows. - """ - # Each column is represented by a random vector - col_vecs = np.random.random((num_col, self.vector_size)) - # Each row is represented by a random image - row_imgs = np.random.randint(0, 255, (num_row, 64, 64), dtype=np.uint8) - - for i in range(num_col): - for j in range(num_row): - self.state_mapping.append( - {"vec": col_vecs[i], "img": row_imgs[j].reshape(self.img_size)} - ) - - def get_state_mapping(self) -> Dict[str, np.ndarray]: - """ - Uses the state to get the observation mapping. - - :return: observation dict {'vec': ..., 'img': ...} - """ - return self.state_mapping[self.state] - - def init_possible_transitions(self) -> None: - """ - Initializes the transitions of the environment - The environment exploits the cardinal directions of the grid by noting that - they correspond to simple addition and subtraction from the cell id within the grid - - - up => means moving up a row => means subtracting the length of a column - - down => means moving down a row => means adding the length of a column - - left => means moving left by one => means subtracting 1 - - right => means moving right by one => means adding 1 - - Thus one only needs to specify in which states each action is possible - in order to define the transitions of the environment - """ - self.left_possible = [1, 2, 3, 13, 14, 15] - self.down_possible = [0, 4, 8, 3, 7, 11] - self.right_possible = [0, 1, 2, 12, 13, 14] - self.up_possible = [4, 8, 12, 7, 11, 15] - - def step(self, action: Union[float, np.ndarray]): - """ - Run one timestep of the environment's dynamics. When end of - episode is reached, you are responsible for calling `reset()` - to reset this environment's state. - Accepts an action and returns a tuple (observation, reward, done, info). - - :param action: - :return: tuple (observation, reward, done, info). 
- """ - if not self.discrete_actions: - action = np.argmax(action) - else: - action = int(action) - - self.count += 1 - - prev_state = self.state - - reward = -0.1 - # define state transition - if self.state in self.left_possible and action == 0: # left - self.state -= 1 - elif self.state in self.down_possible and action == 1: # down - self.state += self.num_col - elif self.state in self.right_possible and action == 2: # right - self.state += 1 - elif self.state in self.up_possible and action == 3: # up - self.state -= self.num_col - - got_to_end = self.state == self.max_state - reward = 1 if got_to_end else reward - done = self.count > self.max_count or got_to_end - - self.log = ( - f"Went {self.action2str[action]} in state {prev_state}, got to state" - f" {self.state}" - ) - - return self.get_state_mapping(), reward, done, {"got_to_end": got_to_end} - - def render(self, mode: str = "human") -> None: - """ - Prints the log of the environment. - - :param mode: - """ - print(self.log) - - def reset(self) -> Dict[str, np.ndarray]: - """ - Resets the environment state and step count and returns reset observation. - - :return: observation dict {'vec': ..., 'img': ...} - """ - self.count = 0 - if not self.random_start: - self.state = 0 - else: - self.state = np.random.randint(0, self.max_state) - return self.state_mapping[self.state] diff --git a/openrl/envs/vec_env/async_venv.py b/openrl/envs/vec_env/async_venv.py index dd654599..141532ba 100644 --- a/openrl/envs/vec_env/async_venv.py +++ b/openrl/envs/vec_env/async_venv.py @@ -751,7 +751,6 @@ def prepare_obs(observation): try: while True: command, data = pipe.recv() - print(command) if command == "reset": result = env.reset(**data) diff --git a/openrl/rewards/nlp_reward.py b/openrl/rewards/nlp_reward.py index c653c7c8..467f2a16 100644 --- a/openrl/rewards/nlp_reward.py +++ b/openrl/rewards/nlp_reward.py @@ -10,12 +10,15 @@ class NLPReward(BaseReward): - def __init__(self, env: Env, ref_model: str, intent_model: str): + def __init__( + self, env: Env, ref_model: str, intent_model: str, use_deepspeed: bool = True + ): self.rew_infos = [] self.env_infos = [] meteor_config = { "meteor_coeff": 0.5, + "test": ref_model == "builtin_ref", } self.inner_rew_funcs = { "meteor": Meteor(**meteor_config), @@ -24,6 +27,7 @@ def __init__(self, env: Env, ref_model: str, intent_model: str): kl_config = { "action_space": env.action_space, "ref_model": ref_model, + "use_deepspeed": use_deepspeed, } self.step_rew_funcs = { "kl_pen": KLPenalty(**kl_config), @@ -32,6 +36,7 @@ def __init__(self, env: Env, ref_model: str, intent_model: str): intent_config = { "intent_model": intent_model, "intent_coeff": 0.5, + "use_deepspeed": use_deepspeed, } self.batch_rew_funcs = { "intent_acc": Intent(**intent_config), diff --git a/openrl/utils/logger.py b/openrl/utils/logger.py index 0f2f0e2e..3fe61b53 100644 --- a/openrl/utils/logger.py +++ b/openrl/utils/logger.py @@ -32,9 +32,9 @@ class Logger: def __init__( self, cfg, - project_name: str, - scenario_name: str, - wandb_entity: str, + project_name: str = "openrl", + scenario_name: str = "openrl", + wandb_entity: str = "openrl", exp_name: Optional[str] = None, log_path: Optional[str] = None, use_wandb: bool = False, diff --git a/setup.py b/setup.py index 172343bf..89c839a5 100644 --- a/setup.py +++ b/setup.py @@ -60,16 +60,20 @@ def get_extra_requires() -> dict: "mpe": ["pyglet==1.5.27"], "nlp": [ "transformers==4.18.0", - "datasets", + "datasets==2.13", "nltk", "evaluate", "icetk", ], "nlp_test": [ "transformers", - 
"datasets", + "datasets==2.13", "evaluate", ], + "deep_speed_test": [ + "deepspeed", + "mpi4py", + ], "selfplay": ["ray[default]", "ray[serve]", "pettingzoo[classic]", "trueskill"], "selfplay_test": ["pettingzoo[mpe]", "pettingzoo[butterfly]"], "retro": ["gym-retro"], @@ -80,6 +84,7 @@ def get_extra_requires() -> dict: req["test"].extend(req["selfplay_test"]) req["test"].extend(req["atari"]) req["test"].extend(req["nlp_test"]) + req["test"].extend(req["deep_speed_test"]) return req diff --git a/tests/test_dataset/test_expert_dataset.py b/tests/test_dataset/test_expert_dataset.py new file mode 100644 index 00000000..1eed9125 --- /dev/null +++ b/tests/test_dataset/test_expert_dataset.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" +import os +import sys + +import pytest +import torch + +from openrl.datasets.expert_dataset import ExpertDataset +from openrl.envs.common import make +from openrl.envs.vec_env.wrappers.gen_data import GenDataWrapper +from openrl.envs.wrappers.monitor import Monitor + +env_wrappers = [ + Monitor, +] + + +def gen_data(total_episode, data_save_path): + # begin to test + # Create an environment for testing and set the number of environments to interact with to 9. Set rendering mode to group_human. + + env = make( + "IdentityEnv", + env_num=1, + asynchronous=True, + env_wrappers=env_wrappers, + ) + + env = GenDataWrapper( + env, data_save_path=data_save_path, total_episode=total_episode + ) + env.reset() + done = False + ep_length = 0 + while not done: + obs, r, done, info = env.step(env.random_action()) + ep_length += 1 + env.close() + return ep_length + + +@pytest.mark.unittest +def test_expert_dataset(tmp_path): + total_episode = 1 + data_save_path = tmp_path / "data.pkl" + ep_length = gen_data(total_episode, data_save_path) + + dataset = ExpertDataset( + data_save_path, + num_trajectories=None, + subsample_frequency=1, + seed=None, + env_id=0, + env_num=1, + ) + assert len(dataset) == ep_length, "len(dataset)={},data_length={}".format( + len(dataset), ep_length + ) + assert len(dataset[0]) == 2, "len(dataset[0])={}".format(len(dataset[0])) + + data_loader = torch.utils.data.DataLoader( + dataset=dataset, batch_size=1, shuffle=False, drop_last=True + ) + + step = 0 + for batch_data in data_loader: + assert len(batch_data) == 2, "len(batch_data)={}".format(len(batch_data)) + step += 1 + assert step == ep_length, "step={},ep_length={}".format(step, ep_length) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_rewards/test_nlp_reward.py b/tests/test_rewards/test_nlp_reward.py new file mode 100644 index 00000000..739943ef --- /dev/null +++ b/tests/test_rewards/test_nlp_reward.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" +import os +import sys + +import numpy as np +import pytest + +from openrl.buffers.normal_buffer import NormalReplayBuffer +from openrl.configs.config import create_config_parser +from openrl.envs.common import make +from openrl.rewards import RewardFactory + + +@pytest.fixture( + scope="module", + params=[ + "--reward_class.id NLPReward --reward_class.args" + " {'intent_model':'builtin_intent','ref_model':'builtin_ref','use_deepspeed':False}" + ], +) +def config(request): + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +@pytest.mark.unittest +def test_nlp_reward(config): + env = make("fake_dialog_data", env_num=1) + reward = RewardFactory.get_reward_class(config.reward_class, env) + data = {} + data["rewards"] = np.zeros(32) + env_info = {} + env_info["final_info"] = { + "prompt_texts": "hello", + "generated_texts": "hello", + "meta_infos": {"intent": [1]}, + } + data["infos"] = [env_info] * 32 + data["step"] = 0 + data["actions"] = [0] + data["action_log_probs"] = np.zeros(32) + buffer = NormalReplayBuffer( + config, + num_agents=env.agent_num, + obs_space=env.observation_space, + act_space=env.action_space, + data_client=None, + episode_length=1, + ) + data["buffer"] = buffer + reward.step_reward(data=data) + reward.batch_rewards(buffer=buffer) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 0707ba6a5aeb4650e7f445e2a8e9b063b54b436d Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 26 Oct 2023 17:25:31 +0800 Subject: [PATCH 32/78] update test --- setup.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/setup.py b/setup.py index 89c839a5..84da342e 100644 --- a/setup.py +++ b/setup.py @@ -70,10 +70,6 @@ def get_extra_requires() -> dict: "datasets==2.13", "evaluate", ], - "deep_speed_test": [ - "deepspeed", - "mpi4py", - ], "selfplay": ["ray[default]", "ray[serve]", "pettingzoo[classic]", "trueskill"], "selfplay_test": ["pettingzoo[mpe]", "pettingzoo[butterfly]"], "retro": ["gym-retro"], @@ -84,7 +80,6 @@ def get_extra_requires() -> dict: req["test"].extend(req["selfplay_test"]) req["test"].extend(req["atari"]) req["test"].extend(req["nlp_test"]) - req["test"].extend(req["deep_speed_test"]) return req From 4da38940e4eb128fa0e5682185c3a6720fa0e7f4 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 27 Oct 2023 11:53:10 +0800 Subject: [PATCH 33/78] init v0.1.10 --- README.md | 2 +- README_zh.md | 2 +- openrl/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 07eacdf6..7c5cfbbe 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ [![Embark](https://img.shields.io/badge/discord-OpenRL-%237289da.svg?logo=discord)](https://discord.gg/guvAS2up) [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://join.slack.com/t/openrlhq/shared_invite/zt-1tqwpvthd-Eeh0IxQ~DIaGqYXoW2IUQg) -OpenRL-v0.1.9 is updated on Oct 20, 2023 +OpenRL-v0.1.10 
is updated on Oct 27, 2023 The main branch is the latest version of OpenRL, which is under active development. If you just want to have a try with OpenRL, you can switch to the stable branch. diff --git a/README_zh.md b/README_zh.md index 531c8c34..aa193ed7 100644 --- a/README_zh.md +++ b/README_zh.md @@ -29,7 +29,7 @@ [![Embark](https://img.shields.io/badge/discord-OpenRL-%237289da.svg?logo=discord)](https://discord.gg/guvAS2up) [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://join.slack.com/t/openrlhq/shared_invite/zt-1tqwpvthd-Eeh0IxQ~DIaGqYXoW2IUQg) -OpenRL-v0.1.9 is updated on Oct 20, 2023 +OpenRL-v0.1.10 is updated on Oct 27, 2023 The main branch is the latest version of OpenRL, which is under active development. If you just want to have a try with OpenRL, you can switch to the stable branch. diff --git a/openrl/__init__.py b/openrl/__init__.py index 89c0f7f9..53ded95e 100644 --- a/openrl/__init__.py +++ b/openrl/__init__.py @@ -1,5 +1,5 @@ __TITLE__ = "openrl" -__VERSION__ = "v0.1.9" +__VERSION__ = "v0.1.10" __DESCRIPTION__ = "Distributed Deep RL Framework" __AUTHOR__ = "OpenRL Contributors" __EMAIL__ = "huangshiyu@4paradigm.com" From 11af42aba4dce9e32ea5d859d8b390eded06e857 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Wed, 1 Nov 2023 19:36:08 +0800 Subject: [PATCH 34/78] fix eval_callback: need to reset rnn state when environment is done --- openrl/modules/common/ppo_net.py | 26 +++++++++++++++++++++++++- openrl/runners/common/ppo_agent.py | 2 ++ openrl/utils/evaluation.py | 7 ++++++- 3 files changed, 33 insertions(+), 2 deletions(-) diff --git a/openrl/modules/common/ppo_net.py b/openrl/modules/common/ppo_net.py index 93dbaa64..0be29d20 100644 --- a/openrl/modules/common/ppo_net.py +++ b/openrl/modules/common/ppo_net.py @@ -15,7 +15,7 @@ # limitations under the License. 
"""""" - +import copy from typing import Any, Dict, Optional, Tuple, Union import gymnasium as gym @@ -29,6 +29,19 @@ from openrl.modules.ppo_module import PPOModule from openrl.utils.util import set_seed +def reset_rnn_states(rnn_states, episode_starts, env_num, agent_num, rnn_layers, hidden_size): + # First we reshape the episode_starts to match the rnn_states shape + # Since episode_starts affects all agents in the environment, we repeat it agent_num times + episode_starts = np.repeat(copy.copy(episode_starts), agent_num) + # We then need to expand the dimensions of episode_starts to match rnn_states + # The new shape of episode_starts should be (env_num * agent_num, 1, 1) to broadcast correctly + episode_starts = episode_starts[:, None, None] + # Now, episode_starts should broadcast over the last two dimensions of rnn_states when multiplied + # We want to set rnn_states to zero where episode_starts is 1, so we invert the episode_starts as a mask + mask = (1 - episode_starts) + # Apply the mask to rnn_states, setting the appropriate states to zero + rnn_states *= mask + return rnn_states class PPONet(BaseNet): def __init__( @@ -89,7 +102,18 @@ def act( observation: Union[np.ndarray, Dict[str, np.ndarray]], action_masks: Optional[np.ndarray] = None, deterministic: bool = False, + episode_starts: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, Optional[Tuple[np.ndarray, ...]]]: + if episode_starts is not None: + self.rnn_states_actor = reset_rnn_states( + self.rnn_states_actor, + episode_starts, + self.env.parallel_env_num, + self.env.agent_num, + self.rnn_states_actor.shape[1], + self.rnn_states_actor.shape[2], + ) + actions, self.rnn_states_actor = self.module.act( obs=observation, rnn_states_actor=self.rnn_states_actor, diff --git a/openrl/runners/common/ppo_agent.py b/openrl/runners/common/ppo_agent.py index ad7d0a84..a1f17450 100644 --- a/openrl/runners/common/ppo_agent.py +++ b/openrl/runners/common/ppo_agent.py @@ -136,6 +136,7 @@ def act( observation: Union[np.ndarray, Dict[str, np.ndarray]], info: Optional[List[Dict[str, Any]]] = None, deterministic: bool = True, + episode_starts: Optional[np.ndarray] = None, ) -> Tuple[np.ndarray, Optional[Tuple[np.ndarray, ...]]]: assert self.net is not None, "net is None" observation = ObsData.prepare_input(observation) @@ -149,6 +150,7 @@ def act( observation, action_masks=action_masks, deterministic=deterministic, + episode_starts=episode_starts ) action = np.array(np.split(_t2n(action), self.env_num)) diff --git a/openrl/utils/evaluation.py b/openrl/utils/evaluation.py index 391ba10f..9d759f46 100644 --- a/openrl/utils/evaluation.py +++ b/openrl/utils/evaluation.py @@ -95,9 +95,14 @@ def evaluate_policy( episode_starts = np.ones((env.parallel_env_num,), dtype=bool) while (episode_counts < episode_count_targets).any(): + if not np.all(episode_starts == 0): + episode_starts_tmp = episode_starts + else: + episode_starts_tmp = None + actions, states = agent.act( observations, - deterministic=deterministic, + deterministic=deterministic,episode_starts=episode_starts_tmp ) observations, rewards, dones, infos = env.step(actions) rewards = np.squeeze(rewards, axis=-1) From d034ce111e995d2ea22e5b7989ab606824518aa2 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Wed, 1 Nov 2023 19:38:50 +0800 Subject: [PATCH 35/78] fix eval_callback: need to reset rnn state when environment is done --- openrl/modules/common/ppo_net.py | 8 ++++++-- openrl/runners/common/ppo_agent.py | 2 +- openrl/utils/evaluation.py | 3 +-- 3 files changed, 8 insertions(+), 5 
deletions(-) diff --git a/openrl/modules/common/ppo_net.py b/openrl/modules/common/ppo_net.py index 0be29d20..7c537c91 100644 --- a/openrl/modules/common/ppo_net.py +++ b/openrl/modules/common/ppo_net.py @@ -29,7 +29,10 @@ from openrl.modules.ppo_module import PPOModule from openrl.utils.util import set_seed -def reset_rnn_states(rnn_states, episode_starts, env_num, agent_num, rnn_layers, hidden_size): + +def reset_rnn_states( + rnn_states, episode_starts, env_num, agent_num, rnn_layers, hidden_size +): # First we reshape the episode_starts to match the rnn_states shape # Since episode_starts affects all agents in the environment, we repeat it agent_num times episode_starts = np.repeat(copy.copy(episode_starts), agent_num) @@ -38,11 +41,12 @@ def reset_rnn_states(rnn_states, episode_starts, env_num, agent_num, rnn_layers, episode_starts = episode_starts[:, None, None] # Now, episode_starts should broadcast over the last two dimensions of rnn_states when multiplied # We want to set rnn_states to zero where episode_starts is 1, so we invert the episode_starts as a mask - mask = (1 - episode_starts) + mask = 1 - episode_starts # Apply the mask to rnn_states, setting the appropriate states to zero rnn_states *= mask return rnn_states + class PPONet(BaseNet): def __init__( self, diff --git a/openrl/runners/common/ppo_agent.py b/openrl/runners/common/ppo_agent.py index a1f17450..414ff409 100644 --- a/openrl/runners/common/ppo_agent.py +++ b/openrl/runners/common/ppo_agent.py @@ -150,7 +150,7 @@ def act( observation, action_masks=action_masks, deterministic=deterministic, - episode_starts=episode_starts + episode_starts=episode_starts, ) action = np.array(np.split(_t2n(action), self.env_num)) diff --git a/openrl/utils/evaluation.py b/openrl/utils/evaluation.py index 9d759f46..c008c437 100644 --- a/openrl/utils/evaluation.py +++ b/openrl/utils/evaluation.py @@ -101,8 +101,7 @@ def evaluate_policy( episode_starts_tmp = None actions, states = agent.act( - observations, - deterministic=deterministic,episode_starts=episode_starts_tmp + observations, deterministic=deterministic, episode_starts=episode_starts_tmp ) observations, rewards, dones, infos = env.step(actions) rewards = np.squeeze(rewards, axis=-1) From 6e1f5e8cfa5c06f4663691f02f426ad24bb266a2 Mon Sep 17 00:00:00 2001 From: Chen001117 Date: Sat, 4 Nov 2023 00:14:37 -0400 Subject: [PATCH 36/78] update_ds_config --- .gitignore | 2 + examples/nlp/README.md | 8 ++++ examples/nlp/ds_config.json | 11 ++++++ examples/nlp/eval_ds_config.json | 10 +++++ examples/nlp/nlp_ppo.yaml | 16 ++++---- examples/nlp/nlp_ppo_ds.yaml | 38 ++++++++++++++++++ examples/nlp/train_ppo.py | 9 ++++- openrl/configs/config.py | 8 +++- openrl/envs/nlp/rewards/intent.py | 26 +++++++++---- openrl/envs/nlp/rewards/kl_penalty.py | 34 ++++++++++------ openrl/modules/networks/policy_network.py | 4 +- .../networks/policy_value_network_gpt.py | 4 +- openrl/modules/rl_module.py | 39 ++++++++----------- openrl/rewards/nlp_reward.py | 10 ++++- 14 files changed, 162 insertions(+), 57 deletions(-) create mode 100644 examples/nlp/ds_config.json create mode 100644 examples/nlp/eval_ds_config.json create mode 100644 examples/nlp/nlp_ppo_ds.yaml diff --git a/.gitignore b/.gitignore index 80ced1f6..469ab373 100644 --- a/.gitignore +++ b/.gitignore @@ -156,6 +156,8 @@ api_docs *.json opponent_pool !/examples/selfplay/opponent_templates/tictactoe_opponent/info.json +!/examples/nlp/ds_config.json +!/examples/nlp/eval_ds_config.json wandb_run examples/dmc/new.gif 
/examples/snake/submissions/rl/actor_2000.pth diff --git a/examples/nlp/README.md b/examples/nlp/README.md index 6bcbb7c0..c3632f9c 100644 --- a/examples/nlp/README.md +++ b/examples/nlp/README.md @@ -6,6 +6,14 @@ Users can train the dialog task via: python train_ppo.py --config nlp_ppo.yaml ``` +Users can train the dialog task with deepspeed: + +```shell +deepspeed train_ppo.py --config nlp_ppo.yaml + + +``` + After the training, users can chat with the agent via: ```shell diff --git a/examples/nlp/ds_config.json b/examples/nlp/ds_config.json new file mode 100644 index 00000000..8ad017c2 --- /dev/null +++ b/examples/nlp/ds_config.json @@ -0,0 +1,11 @@ +{ + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 16, + "steps_per_print": 10, + "zero_optimization": { + "stage": 2, + "reduce_bucket_size": 5e7, + "allgather_bucket_size": 5e7 + }, + "fp16": {"enabled": true, "loss_scale_window": 100} +} \ No newline at end of file diff --git a/examples/nlp/eval_ds_config.json b/examples/nlp/eval_ds_config.json new file mode 100644 index 00000000..65152edf --- /dev/null +++ b/examples/nlp/eval_ds_config.json @@ -0,0 +1,10 @@ +{ + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 16, + "steps_per_print": 10, + "zero_optimization": { + "stage": 0, + "offload_param": {"device": "cpu"} +}, + "fp16": {"enabled": true} +} \ No newline at end of file diff --git a/examples/nlp/nlp_ppo.yaml b/examples/nlp/nlp_ppo.yaml index 1ea1cc5b..7d2447e5 100644 --- a/examples/nlp/nlp_ppo.yaml +++ b/examples/nlp/nlp_ppo.yaml @@ -1,19 +1,18 @@ seed: 0 -lr: 2e-7 -critic_lr: 2e-7 +lr: 1e-7 +critic_lr: 1e-7 run_dir: ./run_results/ log_interval: 1 use_valuenorm: true use_adv_normalize: true wandb_entity: "openrl-lab" ppo_epoch: 5 -episode_length: 112 -num_mini_batch: 20 +episode_length: 8 +num_mini_batch: 1 use_share_model: true hidden_size: 1 -use_deepspeed: true -use_fp16: true + model_path: rajkumarrrk/gpt2-fine-tuned-on-daily-dialog env: @@ -25,9 +24,8 @@ vec_info_class: id: "NLPVecInfo" reward_class: id: "NLPReward" - args: { - "intent_model": "rajkumarrrk/roberta-daily-dialog-intent-classifier", + args: { "ref_model": "rajkumarrrk/gpt2-fine-tuned-on-daily-dialog", - "use_deepspeed": true, + "intent_model": "rajkumarrrk/roberta-daily-dialog-intent-classifier", } \ No newline at end of file diff --git a/examples/nlp/nlp_ppo_ds.yaml b/examples/nlp/nlp_ppo_ds.yaml new file mode 100644 index 00000000..fa31fce5 --- /dev/null +++ b/examples/nlp/nlp_ppo_ds.yaml @@ -0,0 +1,38 @@ +seed: 0 +lr: 1e-7 +critic_lr: 1e-7 +run_dir: ./run_results/ +log_interval: 1 +use_valuenorm: true +use_adv_normalize: true +wandb_entity: "openrl-lab" +ppo_epoch: 5 +episode_length: 8 +num_mini_batch: 1 +use_share_model: true + +hidden_size: 1 + +use_deepspeed: true +use_fp16: true +use_offload: false +deepspeed_config: ds_config.json + +model_path: rajkumarrrk/gpt2-fine-tuned-on-daily-dialog +env: + args: { + 'tokenizer_path': 'gpt2', + 'data_path': 'daily_dialog', + } +vec_info_class: + id: "NLPVecInfo" +reward_class: + id: "NLPReward" + args: { + "use_deepspeed": true, + "ref_ds_config": "eval_ds_config.json", + "ref_model": "rajkumarrrk/gpt2-fine-tuned-on-daily-dialog", + "intent_ds_config": "eval_ds_config.json", + "intent_model": "rajkumarrrk/roberta-daily-dialog-intent-classifier", + } + \ No newline at end of file diff --git a/examples/nlp/train_ppo.py b/examples/nlp/train_ppo.py index e2fcc3d6..14cbf0b4 100644 --- a/examples/nlp/train_ppo.py +++ b/examples/nlp/train_ppo.py @@ -12,9 +12,14 @@ def train(): # create environment 
cfg_parser = create_config_parser() + try: + import deepspeed + cfg_parser = deepspeed.add_config_arguments(cfg_parser) + except: + print("choose not to use deepspeed in the nlp task") cfg = cfg_parser.parse_args() - env_num = 5 + env_num = 2 env = make( "daily_dialog", env_num=env_num, @@ -27,7 +32,7 @@ def train(): net = Net(env, device="cuda", cfg=cfg, model_dict=model_dict) # initialize the trainer - agent = Agent(net, use_wandb=True) + agent = Agent(net, use_wandb=False) # start training agent.train(total_time_steps=100000) diff --git a/openrl/configs/config.py b/openrl/configs/config.py index c0fd176d..1137d6e3 100644 --- a/openrl/configs/config.py +++ b/openrl/configs/config.py @@ -1246,11 +1246,17 @@ def create_config_parser(): type=int, help="local_rank", ) + parser.add_argument( + "--use_offload", + default=False, + type=bool, + help="whether to use offload (deepspeed)", + ) parser.add_argument( "--use_fp16", default=False, type=bool, - help="whether to use fp16", + help="whether to use fp16 (deepspeed)", ) return parser diff --git a/openrl/envs/nlp/rewards/intent.py b/openrl/envs/nlp/rewards/intent.py index 812cc5f4..db6cb3a3 100644 --- a/openrl/envs/nlp/rewards/intent.py +++ b/openrl/envs/nlp/rewards/intent.py @@ -9,24 +9,28 @@ from openrl.supports.opengpu.manager import LocalGPUManager -def get_eval_ds_config(offload, stage=0): +def get_default_ds_config(offload=True, stage=0, fp16=True): device = "cpu" if offload else "none" zero_opt_dict = { "stage": stage, "offload_param": {"device": device}, } return { - "train_batch_size": 28, - "train_micro_batch_size_per_gpu": 7, + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 16, "steps_per_print": 10, "zero_optimization": zero_opt_dict, - "fp16": {"enabled": True}, + "fp16": {"enabled": fp16}, } class Intent: def __init__( - self, intent_model: str, intent_coeff: float = 1.0, use_deepspeed: bool = True + self, + intent_model: str, + intent_coeff: float = 1.0, + use_deepspeed: bool = True, + ds_config: str = "default", ) -> None: super().__init__() @@ -64,11 +68,17 @@ def __init__(self, input_ids, attention_mask): if self.use_deepspeed: import deepspeed - + + if ds_config == "default": + ds_config = get_default_ds_config() + else: + import json + with open(ds_config) as file: + ds_config = json.load(file) + + self._device = "cuda" self._model = self._model.to("cuda") - ds_config = get_eval_ds_config(offload=True, stage=0) self._model, *_ = deepspeed.initialize(model=self._model, config=ds_config) - self._device = "cuda" else: if torch.cuda.is_available(): manager = LocalGPUManager() diff --git a/openrl/envs/nlp/rewards/kl_penalty.py b/openrl/envs/nlp/rewards/kl_penalty.py index 643d263d..0438c161 100644 --- a/openrl/envs/nlp/rewards/kl_penalty.py +++ b/openrl/envs/nlp/rewards/kl_penalty.py @@ -10,18 +10,18 @@ from openrl.envs.nlp.utils.distribution import CategoricalDistribution -def get_eval_ds_config(offload, stage=0): +def get_default_ds_config(offload=True, stage=0, fp16=True): device = "cpu" if offload else "none" zero_opt_dict = { "stage": stage, "offload_param": {"device": device}, } return { - "train_batch_size": 28, # - "train_micro_batch_size_per_gpu": 7, + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 16, "steps_per_print": 10, "zero_optimization": zero_opt_dict, - "fp16": {"enabled": True}, + "fp16": {"enabled": fp16}, } @@ -32,10 +32,10 @@ def __init__( ref_model: str, apply_model_parallel: bool = True, use_deepspeed: bool = True, + ds_config: str = "default", ): super().__init__() 
self.use_deepspeed = use_deepspeed - self.use_fp16 = True # reference model self._apply_model_parallel = apply_model_parallel @@ -49,8 +49,19 @@ def __init__( self._ref_net = self._ref_net.eval() if self.use_deepspeed: import deepspeed + + if ds_config == "default": + self.use_fp16 = True + ds_config = get_default_ds_config() + else: + import json + with open(ds_config) as file: + ds_config = json.load(file) + if "fp16" in ds_config: + self.use_fp16 = ds_config["fp16"]["enabled"] + else: + self.use_fp16 = False - ds_config = get_eval_ds_config(offload=True, stage=0) self._ref_engine, *_ = deepspeed.initialize(model=self, config=ds_config) elif torch.cuda.is_available(): if self._apply_model_parallel and self._ref_net.is_parallelizable: @@ -94,11 +105,12 @@ def __call__( self._ref_net, input_ids, past_model_kwargs ) - if self.use_fp16: - for key in ["input_ids", "position_ids"]: - model_inputs[key] = model_inputs[key].half().int() - for key in ["attention_mask"]: - model_inputs[key] = model_inputs[key].half() + if self.use_deepspeed: + if self.use_fp16: + for key in ["input_ids", "position_ids"]: + model_inputs[key] = model_inputs[key].half().int() + for key in ["attention_mask"]: + model_inputs[key] = model_inputs[key].half() with torch.no_grad(): output = self._ref_net(output_hidden_states=True, **model_inputs) diff --git a/openrl/modules/networks/policy_network.py b/openrl/modules/networks/policy_network.py index 280adfc6..343953ff 100644 --- a/openrl/modules/networks/policy_network.py +++ b/openrl/modules/networks/policy_network.py @@ -53,10 +53,12 @@ def __init__( self._influence_layer_N = cfg.influence_layer_N self._use_policy_vhead = cfg.use_policy_vhead self._recurrent_N = cfg.recurrent_N - self._use_fp16 = cfg.use_fp16 and cfg.use_deepspeed self.use_half = use_half self.tpdv = dict(dtype=torch.float32, device=device) + self._use_fp16 = cfg.use_fp16 + assert cfg.use_fp16 and cfg.use_deepspeed + policy_obs_shape = get_policy_obs_space(input_space) if "Dict" in policy_obs_shape.__class__.__name__: diff --git a/openrl/modules/networks/policy_value_network_gpt.py b/openrl/modules/networks/policy_value_network_gpt.py index fdbc15b1..2b9ec08c 100644 --- a/openrl/modules/networks/policy_value_network_gpt.py +++ b/openrl/modules/networks/policy_value_network_gpt.py @@ -43,8 +43,10 @@ def __init__( device=device, ) self.use_half = use_half - self._use_fp16 = cfg.use_fp16 and cfg.use_deepspeed self.tpdv = dict(dtype=torch.float32, device=device) + + self._use_fp16 = cfg.use_fp16 + assert cfg.use_fp16 and cfg.use_deepspeed def get_actor_para(self): return self._policy_model.parameters() diff --git a/openrl/modules/rl_module.py b/openrl/modules/rl_module.py index 47779426..8568d23c 100644 --- a/openrl/modules/rl_module.py +++ b/openrl/modules/rl_module.py @@ -26,20 +26,6 @@ from openrl.modules.model_config import ModelTrainConfig -def get_train_ds_config(offload, use_fp16=False, stage=2): - return { - "train_batch_size": 28, - "train_micro_batch_size_per_gpu": 7, - "steps_per_print": 10, - "zero_optimization": { - "stage": 2, - "reduce_bucket_size": 5e7, - "allgather_bucket_size": 5e7, - }, - "fp16": {"enabled": use_fp16, "loss_scale_window": 100}, - } - - class RLModule(BaseModule): def __init__( self, @@ -100,17 +86,24 @@ def __init__( self.models.update({model_key: model}) self.optimizers.update({model_key: optimizer}) else: + import json import deepspeed from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam from transformers import get_constant_schedule - - self.offload = False - 
ds_config = get_train_ds_config( - offload=self.offload, - use_fp16=cfg.use_fp16, - ) - - AdamOptimizer = DeepSpeedCPUAdam if self.offload else FusedAdam + + self.use_fp16 = cfg.use_fp16 + self.use_offload = cfg.use_offload + + # Check for inconsistencies in configuration files + assert not (self.use_fp16 and not self.use_deepspeed) + assert not (self.use_offload and not self.use_deepspeed) + assert cfg.deepspeed_config is not None + with open(cfg.deepspeed_config) as file: + ds_config = json.load(file) + if "fp16" in ds_config: + assert ds_config["fp16"]["enabled"] == self.use_fp16 + + AdamOptimizer = DeepSpeedCPUAdam if self.use_offload else FusedAdam optim_params = filter(lambda p: p.requires_grad, model.parameters()) optim = AdamOptimizer( optim_params, lr=model_cg["lr"], betas=(0.9, 0.95) @@ -122,10 +115,10 @@ def __init__( ) engine, *_ = deepspeed.initialize( + args=cfg, model=model, optimizer=optim, lr_scheduler=lr_scheduler, - config=ds_config, ) self.models.update({model_key: engine}) self.optimizers.update({model_key: engine}) diff --git a/openrl/rewards/nlp_reward.py b/openrl/rewards/nlp_reward.py index 467f2a16..859eecf7 100644 --- a/openrl/rewards/nlp_reward.py +++ b/openrl/rewards/nlp_reward.py @@ -11,7 +11,13 @@ class NLPReward(BaseReward): def __init__( - self, env: Env, ref_model: str, intent_model: str, use_deepspeed: bool = True + self, + env: Env, + ref_model: str, + intent_model: str, + use_deepspeed: bool = False, + ref_ds_config: str = "default", + intent_ds_config: str = "default", ): self.rew_infos = [] self.env_infos = [] @@ -28,6 +34,7 @@ def __init__( "action_space": env.action_space, "ref_model": ref_model, "use_deepspeed": use_deepspeed, + "ds_config": ref_ds_config, } self.step_rew_funcs = { "kl_pen": KLPenalty(**kl_config), @@ -37,6 +44,7 @@ def __init__( "intent_model": intent_model, "intent_coeff": 0.5, "use_deepspeed": use_deepspeed, + "ds_config": intent_ds_config, } self.batch_rew_funcs = { "intent_acc": Intent(**intent_config), From bd88dcb0d43e4ce956e469ab1451caf972ba536d Mon Sep 17 00:00:00 2001 From: Chen001117 Date: Sat, 4 Nov 2023 16:13:41 -0400 Subject: [PATCH 37/78] update ds_config --- examples/nlp/README.md | 4 ++-- examples/nlp/nlp_ppo.yaml | 4 ++-- examples/nlp/nlp_ppo_ds.yaml | 4 ++-- examples/nlp/train_ppo.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/nlp/README.md b/examples/nlp/README.md index c3632f9c..2fb61de9 100644 --- a/examples/nlp/README.md +++ b/examples/nlp/README.md @@ -6,10 +6,10 @@ Users can train the dialog task via: python train_ppo.py --config nlp_ppo.yaml ``` -Users can train the dialog task with deepspeed: +Users can train the dialog task with deepspeed via: ```shell -deepspeed train_ppo.py --config nlp_ppo.yaml +deepspeed train_ppo.py --config nlp_ppo_ds.yaml ``` diff --git a/examples/nlp/nlp_ppo.yaml b/examples/nlp/nlp_ppo.yaml index 7d2447e5..b46e6211 100644 --- a/examples/nlp/nlp_ppo.yaml +++ b/examples/nlp/nlp_ppo.yaml @@ -7,8 +7,8 @@ use_valuenorm: true use_adv_normalize: true wandb_entity: "openrl-lab" ppo_epoch: 5 -episode_length: 8 -num_mini_batch: 1 +episode_length: 128 +num_mini_batch: 20 use_share_model: true hidden_size: 1 diff --git a/examples/nlp/nlp_ppo_ds.yaml b/examples/nlp/nlp_ppo_ds.yaml index fa31fce5..c1babe3e 100644 --- a/examples/nlp/nlp_ppo_ds.yaml +++ b/examples/nlp/nlp_ppo_ds.yaml @@ -7,8 +7,8 @@ use_valuenorm: true use_adv_normalize: true wandb_entity: "openrl-lab" ppo_epoch: 5 -episode_length: 8 -num_mini_batch: 1 +episode_length: 128 +num_mini_batch: 20 
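The `RLModule` rework shown above (before the README and YAML updates) drops the hard-coded DeepSpeed dict and instead trusts the JSON file named by `cfg.deepspeed_config`, plus a few consistency asserts against the CLI flags. A standalone sketch of that validation step, with an illustrative helper name rather than the actual OpenRL code path:

```python
import json


def load_and_check_ds_config(cfg) -> dict:
    # fp16 and CPU offload only make sense when DeepSpeed is enabled at all.
    assert not (cfg.use_fp16 and not cfg.use_deepspeed)
    assert not (cfg.use_offload and not cfg.use_deepspeed)
    # The JSON file is now the single source of truth for DeepSpeed settings.
    assert cfg.deepspeed_config is not None
    with open(cfg.deepspeed_config) as f:
        ds_config = json.load(f)
    # The fp16 flag must agree with what the JSON enables.
    if "fp16" in ds_config:
        assert ds_config["fp16"]["enabled"] == cfg.use_fp16
    return ds_config
```

In the YAML being edited here, the `deepspeed_config` key points at that JSON file, so the checks above tie the launch config and the DeepSpeed config together.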
use_share_model: true hidden_size: 1 diff --git a/examples/nlp/train_ppo.py b/examples/nlp/train_ppo.py index 14cbf0b4..e30f6d8a 100644 --- a/examples/nlp/train_ppo.py +++ b/examples/nlp/train_ppo.py @@ -19,7 +19,7 @@ def train(): print("choose not to use deepspeed in the nlp task") cfg = cfg_parser.parse_args() - env_num = 2 + env_num = 5 env = make( "daily_dialog", env_num=env_num, From ed0ceb72e5e131cad70cfd1d1b2dce2536526c0d Mon Sep 17 00:00:00 2001 From: WentseChen Date: Mon, 6 Nov 2023 05:47:52 +0800 Subject: [PATCH 38/78] update config --- examples/nlp/ds_config.json | 2 +- examples/nlp/eval_ds_config.json | 2 +- examples/nlp/nlp_ppo_ds.yaml | 2 +- examples/nlp/train_ppo.py | 2 +- openrl/modules/networks/policy_value_network_gpt.py | 3 ++- openrl/modules/networks/utils/nlp/base_policy.py | 9 +++++---- openrl/modules/networks/utils/nlp/causal_policy.py | 2 ++ 7 files changed, 13 insertions(+), 9 deletions(-) diff --git a/examples/nlp/ds_config.json b/examples/nlp/ds_config.json index 8ad017c2..544bc405 100644 --- a/examples/nlp/ds_config.json +++ b/examples/nlp/ds_config.json @@ -7,5 +7,5 @@ "reduce_bucket_size": 5e7, "allgather_bucket_size": 5e7 }, - "fp16": {"enabled": true, "loss_scale_window": 100} + "fp16": {"enabled": false, "loss_scale_window": 100} } \ No newline at end of file diff --git a/examples/nlp/eval_ds_config.json b/examples/nlp/eval_ds_config.json index 65152edf..58c08252 100644 --- a/examples/nlp/eval_ds_config.json +++ b/examples/nlp/eval_ds_config.json @@ -6,5 +6,5 @@ "stage": 0, "offload_param": {"device": "cpu"} }, - "fp16": {"enabled": true} + "fp16": {"enabled": false} } \ No newline at end of file diff --git a/examples/nlp/nlp_ppo_ds.yaml b/examples/nlp/nlp_ppo_ds.yaml index c1babe3e..ab0c0b6c 100644 --- a/examples/nlp/nlp_ppo_ds.yaml +++ b/examples/nlp/nlp_ppo_ds.yaml @@ -14,7 +14,7 @@ use_share_model: true hidden_size: 1 use_deepspeed: true -use_fp16: true +use_fp16: false use_offload: false deepspeed_config: ds_config.json diff --git a/examples/nlp/train_ppo.py b/examples/nlp/train_ppo.py index e30f6d8a..12ac7921 100644 --- a/examples/nlp/train_ppo.py +++ b/examples/nlp/train_ppo.py @@ -32,7 +32,7 @@ def train(): net = Net(env, device="cuda", cfg=cfg, model_dict=model_dict) # initialize the trainer - agent = Agent(net, use_wandb=False) + agent = Agent(net, use_wandb=True) # start training agent.train(total_time_steps=100000) diff --git a/openrl/modules/networks/policy_value_network_gpt.py b/openrl/modules/networks/policy_value_network_gpt.py index 2b9ec08c..28cac41d 100644 --- a/openrl/modules/networks/policy_value_network_gpt.py +++ b/openrl/modules/networks/policy_value_network_gpt.py @@ -37,6 +37,7 @@ def __init__( self.disable_drop_out = disable_drop_out self._use_valuenorm = cfg.use_valuenorm super(CausalLMActorCriticPolicy, self).__init__( + cfg, input_space, action_space, model_name=cfg.model_path, @@ -46,7 +47,7 @@ def __init__( self.tpdv = dict(dtype=torch.float32, device=device) self._use_fp16 = cfg.use_fp16 - assert cfg.use_fp16 and cfg.use_deepspeed + assert not(cfg.use_fp16 and not cfg.use_deepspeed) def get_actor_para(self): return self._policy_model.parameters() diff --git a/openrl/modules/networks/utils/nlp/base_policy.py b/openrl/modules/networks/utils/nlp/base_policy.py index bd5fd6b2..8ae9e49d 100644 --- a/openrl/modules/networks/utils/nlp/base_policy.py +++ b/openrl/modules/networks/utils/nlp/base_policy.py @@ -124,13 +124,14 @@ class GenerationOutputs: class LMActorCriticPolicy(nn.Module): def __init__( self, + cfg: Any, 
observation_space: DictSpace, action_space: Discrete, model_name: str, optimizer_kwargs: Dict[str, Any] = {}, weight_decay: float = 1e-6, use_sde: bool = None, - apply_model_parallel: bool = False, # TODO + # apply_model_parallel: bool = True, optimizer_class: torch.optim.Optimizer = torch.optim.AdamW, generation_kwargs: Dict[str, Any] = {}, prompt_truncation_side: str = "left", @@ -146,15 +147,15 @@ def __init__( optimizer_kwargs (Dict[str, Any], optional): optimizer kwargs. Defaults to {}. weight_decay (float, optional): weight decay. Defaults to 1e-6. use_sde (bool, optional): Use state-dependent exploration. Defaults to None. - apply_model_parallel (bool, optional): whether to apply model parallel. Defaults to True. + apply_model_parallel (bool, optional): default to use model parallel when not using deepspeed. optimizer_class (torch.optim.Optimizer, optional): Optimizer class. Defaults to torch.optim.AdamW. generation_kwargs (Dict[str, Any], optional): generation parameters for rollout. Defaults to {}. prompt_truncation_side (str, optional): truncation side for prompt text. Defaults to "left". """ super().__init__() - self._use_deepspeed = True # TODO + self._use_deepspeed = cfg.use_deepspeed self._action_space = action_space - self._apply_model_parallel = apply_model_parallel + self._apply_model_parallel = not cfg.use_deepspeed # TODO self._build_model_heads(model_name, config, device) self._action_dist = CategoricalDistribution(self._action_space.n) self._generation_kwargs = generation_kwargs diff --git a/openrl/modules/networks/utils/nlp/causal_policy.py b/openrl/modules/networks/utils/nlp/causal_policy.py index 8dfc65dc..f0b86d0d 100644 --- a/openrl/modules/networks/utils/nlp/causal_policy.py +++ b/openrl/modules/networks/utils/nlp/causal_policy.py @@ -21,6 +21,7 @@ class CausalLMActorCriticPolicy(LMActorCriticPolicy): def __init__( self, + cfg: Any, observation_space: DictSpace, action_space: Discrete, model_name: str, @@ -36,6 +37,7 @@ def __init__( device: str = "cpu", ): super().__init__( + cfg, observation_space, action_space, model_name, From d7c974e757c49dc32a1b238edfbf857e88cf9400 Mon Sep 17 00:00:00 2001 From: Chen001117 Date: Mon, 6 Nov 2023 23:30:19 -0500 Subject: [PATCH 39/78] make format --- examples/nlp/train_ppo.py | 1 + openrl/envs/nlp/rewards/intent.py | 11 ++++++----- openrl/envs/nlp/rewards/kl_penalty.py | 3 ++- openrl/modules/networks/policy_network.py | 2 +- openrl/modules/networks/policy_value_network_gpt.py | 4 ++-- openrl/modules/networks/utils/nlp/base_policy.py | 2 +- openrl/modules/rl_module.py | 7 ++++--- openrl/rewards/nlp_reward.py | 8 ++++---- 8 files changed, 21 insertions(+), 17 deletions(-) diff --git a/examples/nlp/train_ppo.py b/examples/nlp/train_ppo.py index 12ac7921..728e4aa5 100644 --- a/examples/nlp/train_ppo.py +++ b/examples/nlp/train_ppo.py @@ -14,6 +14,7 @@ def train(): cfg_parser = create_config_parser() try: import deepspeed + cfg_parser = deepspeed.add_config_arguments(cfg_parser) except: print("choose not to use deepspeed in the nlp task") diff --git a/openrl/envs/nlp/rewards/intent.py b/openrl/envs/nlp/rewards/intent.py index db6cb3a3..0d449d13 100644 --- a/openrl/envs/nlp/rewards/intent.py +++ b/openrl/envs/nlp/rewards/intent.py @@ -26,9 +26,9 @@ def get_default_ds_config(offload=True, stage=0, fp16=True): class Intent: def __init__( - self, - intent_model: str, - intent_coeff: float = 1.0, + self, + intent_model: str, + intent_coeff: float = 1.0, use_deepspeed: bool = True, ds_config: str = "default", ) -> None: @@ -68,14 +68,15 
@@ def __init__(self, input_ids, attention_mask): if self.use_deepspeed: import deepspeed - + if ds_config == "default": ds_config = get_default_ds_config() else: import json + with open(ds_config) as file: ds_config = json.load(file) - + self._device = "cuda" self._model = self._model.to("cuda") self._model, *_ = deepspeed.initialize(model=self._model, config=ds_config) diff --git a/openrl/envs/nlp/rewards/kl_penalty.py b/openrl/envs/nlp/rewards/kl_penalty.py index 0438c161..fe9e9594 100644 --- a/openrl/envs/nlp/rewards/kl_penalty.py +++ b/openrl/envs/nlp/rewards/kl_penalty.py @@ -49,12 +49,13 @@ def __init__( self._ref_net = self._ref_net.eval() if self.use_deepspeed: import deepspeed - + if ds_config == "default": self.use_fp16 = True ds_config = get_default_ds_config() else: import json + with open(ds_config) as file: ds_config = json.load(file) if "fp16" in ds_config: diff --git a/openrl/modules/networks/policy_network.py b/openrl/modules/networks/policy_network.py index 343953ff..875ef4a7 100644 --- a/openrl/modules/networks/policy_network.py +++ b/openrl/modules/networks/policy_network.py @@ -58,7 +58,7 @@ def __init__( self._use_fp16 = cfg.use_fp16 assert cfg.use_fp16 and cfg.use_deepspeed - + policy_obs_shape = get_policy_obs_space(input_space) if "Dict" in policy_obs_shape.__class__.__name__: diff --git a/openrl/modules/networks/policy_value_network_gpt.py b/openrl/modules/networks/policy_value_network_gpt.py index 28cac41d..85daef3a 100644 --- a/openrl/modules/networks/policy_value_network_gpt.py +++ b/openrl/modules/networks/policy_value_network_gpt.py @@ -45,9 +45,9 @@ def __init__( ) self.use_half = use_half self.tpdv = dict(dtype=torch.float32, device=device) - + self._use_fp16 = cfg.use_fp16 - assert not(cfg.use_fp16 and not cfg.use_deepspeed) + assert not (cfg.use_fp16 and not cfg.use_deepspeed) def get_actor_para(self): return self._policy_model.parameters() diff --git a/openrl/modules/networks/utils/nlp/base_policy.py b/openrl/modules/networks/utils/nlp/base_policy.py index 8ae9e49d..dd0e2032 100644 --- a/openrl/modules/networks/utils/nlp/base_policy.py +++ b/openrl/modules/networks/utils/nlp/base_policy.py @@ -155,7 +155,7 @@ def __init__( super().__init__() self._use_deepspeed = cfg.use_deepspeed self._action_space = action_space - self._apply_model_parallel = not cfg.use_deepspeed # TODO + self._apply_model_parallel = not cfg.use_deepspeed # TODO self._build_model_heads(model_name, config, device) self._action_dist = CategoricalDistribution(self._action_space.n) self._generation_kwargs = generation_kwargs diff --git a/openrl/modules/rl_module.py b/openrl/modules/rl_module.py index 8568d23c..7b7e390e 100644 --- a/openrl/modules/rl_module.py +++ b/openrl/modules/rl_module.py @@ -87,14 +87,15 @@ def __init__( self.optimizers.update({model_key: optimizer}) else: import json + import deepspeed from deepspeed.ops.adam import DeepSpeedCPUAdam, FusedAdam from transformers import get_constant_schedule - + self.use_fp16 = cfg.use_fp16 self.use_offload = cfg.use_offload - - # Check for inconsistencies in configuration files + + # Check for inconsistencies in configuration files assert not (self.use_fp16 and not self.use_deepspeed) assert not (self.use_offload and not self.use_deepspeed) assert cfg.deepspeed_config is not None diff --git a/openrl/rewards/nlp_reward.py b/openrl/rewards/nlp_reward.py index 859eecf7..51c76fb3 100644 --- a/openrl/rewards/nlp_reward.py +++ b/openrl/rewards/nlp_reward.py @@ -11,10 +11,10 @@ class NLPReward(BaseReward): def __init__( - self, - env: Env, 
- ref_model: str, - intent_model: str, + self, + env: Env, + ref_model: str, + intent_model: str, use_deepspeed: bool = False, ref_ds_config: str = "default", intent_ds_config: str = "default", From b6e78a2698028db6b0bc25cb2b7d27fe94e46d69 Mon Sep 17 00:00:00 2001 From: Chen001117 Date: Wed, 8 Nov 2023 11:55:39 -0500 Subject: [PATCH 40/78] fix assersion bug --- openrl/modules/networks/policy_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openrl/modules/networks/policy_network.py b/openrl/modules/networks/policy_network.py index 875ef4a7..e3ebb025 100644 --- a/openrl/modules/networks/policy_network.py +++ b/openrl/modules/networks/policy_network.py @@ -57,7 +57,7 @@ def __init__( self.tpdv = dict(dtype=torch.float32, device=device) self._use_fp16 = cfg.use_fp16 - assert cfg.use_fp16 and cfg.use_deepspeed + assert not (cfg.use_fp16 and not cfg.use_deepspeed) policy_obs_shape = get_policy_obs_space(input_space) From 1edd6c636e99788f10d84b4c1fa881fc0d27bee2 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 10 Nov 2023 15:06:30 +0800 Subject: [PATCH 41/78] fix rock paper scissors --- examples/custom_env/pettingzoo_env.py | 3 +++ examples/custom_env/rock_paper_scissors.py | 6 +++--- openrl/selfplay/wrappers/base_multiplayer_wrapper.py | 5 +++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/custom_env/pettingzoo_env.py b/examples/custom_env/pettingzoo_env.py index 8b173449..db05b468 100644 --- a/examples/custom_env/pettingzoo_env.py +++ b/examples/custom_env/pettingzoo_env.py @@ -24,7 +24,10 @@ from openrl.envs.PettingZoo.registration import register from openrl.selfplay.wrappers.random_opponent_wrapper import RandomOpponentWrapper + + register("RockPaperScissors", RockPaperScissors) + env = make( "RockPaperScissors", env_num=10, diff --git a/examples/custom_env/rock_paper_scissors.py b/examples/custom_env/rock_paper_scissors.py index 7d5649d1..ad82830b 100644 --- a/examples/custom_env/rock_paper_scissors.py +++ b/examples/custom_env/rock_paper_scissors.py @@ -54,7 +54,7 @@ class RockPaperScissors(AECEnv): metadata = {"render_modes": ["human"], "name": "rps_v2"} - def __init__(self, render_mode=None): + def __init__(self, id, render_mode=None): """ The init method takes in environment arguments and should define the following attributes: @@ -122,8 +122,8 @@ def observe(self, agent): """ # observation of one agent is the previous state of the other # return np.array(self.observations[agent]) - obs = np.zeros(4, dtype=np.int64) - obs[self.observations[agent]] = 1 + obs = np.zeros([1,4], dtype=np.int64) + obs[0,self.observations[agent]] = 1 return obs def close(self): diff --git a/openrl/selfplay/wrappers/base_multiplayer_wrapper.py b/openrl/selfplay/wrappers/base_multiplayer_wrapper.py index a3de3c0f..525e7078 100644 --- a/openrl/selfplay/wrappers/base_multiplayer_wrapper.py +++ b/openrl/selfplay/wrappers/base_multiplayer_wrapper.py @@ -145,12 +145,13 @@ def _step(self, action): info, ) if termination or truncation: + return ( copy.copy(self.env.observe(self.self_player)), - self.env.rewards[self.self_player], + self.env.rewards[self.self_player] if self.self_player in self.env.rewards else 0, termination, truncation, - self.env.infos[self.self_player], + self.env.infos[self.self_player] if self.self_player in self.env.rewards else {}, ) else: From 3e0d6442d21fc45a8f5b2d6b940486714baab162 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 10 Nov 2023 15:06:52 +0800 Subject: [PATCH 42/78] fix rock paper scissors --- 
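The self-play wrapper fix in patch 41 (reformatted again in the patch that follows) guards the terminal-step lookups so that a missing entry in the PettingZoo `rewards`/`infos` dicts falls back to neutral defaults. A tiny self-contained illustration of the same guard, with hypothetical dict contents:

```python
# Hypothetical end-of-episode state: the opponent's entries are already gone.
rewards = {"player_1": 1.0}
infos = {"player_1": {"winner": "player_1"}}
self_player = "player_2"

reward = rewards[self_player] if self_player in rewards else 0
info = infos[self_player] if self_player in rewards else {}

# dict.get expresses the same fallback more compactly:
assert reward == rewards.get(self_player, 0)
assert info == infos.get(self_player, {})
```

Note that both fallbacks key off membership in `env.rewards`, matching the patch as written; this assumes PettingZoo clears `rewards` and `infos` together at episode end.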
examples/custom_env/pettingzoo_env.py | 2 -- examples/custom_env/rock_paper_scissors.py | 4 ++-- .../selfplay/wrappers/base_multiplayer_wrapper.py | 13 ++++++++++--- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/examples/custom_env/pettingzoo_env.py b/examples/custom_env/pettingzoo_env.py index db05b468..d5644b7b 100644 --- a/examples/custom_env/pettingzoo_env.py +++ b/examples/custom_env/pettingzoo_env.py @@ -24,8 +24,6 @@ from openrl.envs.PettingZoo.registration import register from openrl.selfplay.wrappers.random_opponent_wrapper import RandomOpponentWrapper - - register("RockPaperScissors", RockPaperScissors) env = make( diff --git a/examples/custom_env/rock_paper_scissors.py b/examples/custom_env/rock_paper_scissors.py index ad82830b..71d0fa74 100644 --- a/examples/custom_env/rock_paper_scissors.py +++ b/examples/custom_env/rock_paper_scissors.py @@ -122,8 +122,8 @@ def observe(self, agent): """ # observation of one agent is the previous state of the other # return np.array(self.observations[agent]) - obs = np.zeros([1,4], dtype=np.int64) - obs[0,self.observations[agent]] = 1 + obs = np.zeros([1, 4], dtype=np.int64) + obs[0, self.observations[agent]] = 1 return obs def close(self): diff --git a/openrl/selfplay/wrappers/base_multiplayer_wrapper.py b/openrl/selfplay/wrappers/base_multiplayer_wrapper.py index 525e7078..5cd6116c 100644 --- a/openrl/selfplay/wrappers/base_multiplayer_wrapper.py +++ b/openrl/selfplay/wrappers/base_multiplayer_wrapper.py @@ -145,13 +145,20 @@ def _step(self, action): info, ) if termination or truncation: - return ( copy.copy(self.env.observe(self.self_player)), - self.env.rewards[self.self_player] if self.self_player in self.env.rewards else 0, + ( + self.env.rewards[self.self_player] + if self.self_player in self.env.rewards + else 0 + ), termination, truncation, - self.env.infos[self.self_player] if self.self_player in self.env.rewards else {}, + ( + self.env.infos[self.self_player] + if self.self_player in self.env.rewards + else {} + ), ) else: From 3c34b5e0e2d15bf4d0c874932cf59ae49bd5edd0 Mon Sep 17 00:00:00 2001 From: Chen001117 Date: Sat, 11 Nov 2023 22:38:31 -0500 Subject: [PATCH 43/78] not using shared model --- examples/nlp/ds_config.json | 4 +- examples/nlp/eval_ds_config.json | 4 +- examples/nlp/nlp_ppo_ds.yaml | 4 +- examples/nlp/train_ppo.py | 9 +- openrl/algorithms/ppo.py | 14 +- openrl/modules/networks/policy_network_gpt.py | 169 ++++++++++++++++++ openrl/modules/networks/value_network_gpt.py | 104 +++++++++++ 7 files changed, 294 insertions(+), 14 deletions(-) create mode 100644 openrl/modules/networks/policy_network_gpt.py create mode 100644 openrl/modules/networks/value_network_gpt.py diff --git a/examples/nlp/ds_config.json b/examples/nlp/ds_config.json index 544bc405..d3b68fe1 100644 --- a/examples/nlp/ds_config.json +++ b/examples/nlp/ds_config.json @@ -1,6 +1,6 @@ { - "train_batch_size": 32, - "train_micro_batch_size_per_gpu": 16, + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 4, "steps_per_print": 10, "zero_optimization": { "stage": 2, diff --git a/examples/nlp/eval_ds_config.json b/examples/nlp/eval_ds_config.json index 58c08252..e9429896 100644 --- a/examples/nlp/eval_ds_config.json +++ b/examples/nlp/eval_ds_config.json @@ -1,6 +1,6 @@ { - "train_batch_size": 32, - "train_micro_batch_size_per_gpu": 16, + "train_batch_size": 16, + "train_micro_batch_size_per_gpu": 4, "steps_per_print": 10, "zero_optimization": { "stage": 0, diff --git a/examples/nlp/nlp_ppo_ds.yaml b/examples/nlp/nlp_ppo_ds.yaml index 
ab0c0b6c..3a031ae6 100644 --- a/examples/nlp/nlp_ppo_ds.yaml +++ b/examples/nlp/nlp_ppo_ds.yaml @@ -7,9 +7,9 @@ use_valuenorm: true use_adv_normalize: true wandb_entity: "openrl-lab" ppo_epoch: 5 -episode_length: 128 +episode_length: 64 num_mini_batch: 20 -use_share_model: true +# use_share_model: true hidden_size: 1 diff --git a/examples/nlp/train_ppo.py b/examples/nlp/train_ppo.py index 728e4aa5..384d8f9d 100644 --- a/examples/nlp/train_ppo.py +++ b/examples/nlp/train_ppo.py @@ -3,9 +3,8 @@ from openrl.configs.config import create_config_parser from openrl.envs.common import make from openrl.modules.common import PPONet as Net -from openrl.modules.networks.policy_value_network_gpt import ( - PolicyValueNetworkGPT as PolicyValueNetwork, -) +from openrl.modules.networks.value_network_gpt import ValueNetworkGPT as ValueNetwork +from openrl.modules.networks.policy_network_gpt import PolicyNetworkGPT as PolicyNetwork from openrl.runners.common import PPOAgent as Agent @@ -29,11 +28,11 @@ def train(): ) # create the neural network - model_dict = {"model": PolicyValueNetwork} + model_dict = {"policy": PolicyNetwork, "critic": ValueNetwork} net = Net(env, device="cuda", cfg=cfg, model_dict=model_dict) # initialize the trainer - agent = Agent(net, use_wandb=True) + agent = Agent(net, use_wandb=False) # start training agent.train(total_time_steps=100000) diff --git a/openrl/algorithms/ppo.py b/openrl/algorithms/ppo.py index e72e01bb..fafd657e 100644 --- a/openrl/algorithms/ppo.py +++ b/openrl/algorithms/ppo.py @@ -45,7 +45,8 @@ def __init__( def ppo_update(self, sample, turn_on=True): for optimizer in self.algo_module.optimizers.values(): - optimizer.zero_grad() + if not self.use_deepspeed: + optimizer.zero_grad() ( critic_obs_batch, @@ -152,8 +153,15 @@ def ppo_update(self, sample, turn_on=True): self.algo_module.scaler.update() else: - for optimizer in self.algo_module.optimizers.values(): - optimizer.step() + if self.use_deepspeed: + if self._use_share_model: + self.algo_module.optimizers["model"].step() + else: + self.algo_module.optimizers["policy"].step() + self.algo_module.optimizers["critic"].step() + else: + for optimizer in self.algo_module.optimizers.values(): + optimizer.step() if self.world_size > 1: torch.cuda.synchronize() diff --git a/openrl/modules/networks/policy_network_gpt.py b/openrl/modules/networks/policy_network_gpt.py new file mode 100644 index 00000000..cad3157e --- /dev/null +++ b/openrl/modules/networks/policy_network_gpt.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2021 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" +from typing import Any, Optional, Dict + +import numpy as np +import torch +import torch.nn as nn + +from openrl.buffers.utils.util import get_policy_obs, get_policy_obs_space +from openrl.modules.networks.base_policy_network import BasePolicyNetwork +from openrl.modules.networks.utils.act import ACTLayer +from openrl.modules.networks.utils.cnn import CNNBase +from openrl.modules.networks.utils.mix import MIXBase +from openrl.modules.networks.utils.mlp import MLPBase, MLPLayer +from openrl.modules.networks.utils.popart import PopArt +from openrl.modules.networks.utils.rnn import RNNLayer +from openrl.modules.networks.utils.util import init +from openrl.utils.util import check_v2 as check +from openrl.envs.nlp.utils.distribution import CategoricalDistribution + +from transformers.modeling_utils import unwrap_model + +class PolicyNetworkGPT(BasePolicyNetwork): + def __init__( + self, + cfg, + input_space, + action_space, + device=torch.device("cpu"), + use_half=False, + disable_drop_out: bool = True, + extra_args=None, + ) -> None: + + self.use_half = use_half + self.tpdv = dict(dtype=torch.float32, device=device) + + super(PolicyNetworkGPT, self).__init__(cfg, device) + + self.disable_drop_out = disable_drop_out + + + + self._action_dist = CategoricalDistribution(action_space.n) + + from transformers import AutoConfig, AutoModelForCausalLM + config = AutoConfig.from_pretrained(cfg.model_path) + config_dict = config.to_dict() + for key in config_dict: + if "drop" in key: + config_dict[key] = 0.0 + config = config.from_dict(config_dict) + self._policy_model = AutoModelForCausalLM.from_pretrained( + cfg.model_path, config=config + ) + self._policy_model.config.use_cache = False + + def forward(self, forward_type, *args, **kwargs): + if forward_type == "original": + return self.forward_original(*args, **kwargs) + elif forward_type == "eval_actions": + return self.eval_actions(*args, **kwargs) + else: + raise NotImplementedError + + def _prepare_inputs_for_model( + self, + model: Any, + input_ids: torch.tensor, + model_kwargs: Optional[Dict[str, torch.tensor]] = None, + ): + model_inputs = unwrap_model(model).prepare_inputs_for_generation( + input_ids, **model_kwargs + ) + return model_inputs + + def forward_original( + self, raw_obs, rnn_states, masks, action_masks=None, deterministic=False + ): + for key in raw_obs.keys(): + raw_obs[key] = torch.from_numpy(raw_obs[key]) if type(raw_obs[key]) == np.ndarray else raw_obs[key] + raw_obs[key] = raw_obs[key].to(self._policy_model.device) + # raw_obs[key] = check(raw_obs[key], self.use_half, self.tpdv) + # if self._use_fp16: + # raw_obs[key] = raw_obs[key].half() + rnn_states = check(rnn_states) + + input_ids = raw_obs["input_encoded_pt"].int() + attention_mask = raw_obs["input_attention_mask_pt"] + + past_model_kwargs = None + + if past_model_kwargs is None: + past_model_kwargs = { + "attention_mask": attention_mask, + } + + model_inputs = self._prepare_inputs_for_model( + self._policy_model, input_ids, past_model_kwargs + ) + + # forward pass to transformers + output = self._policy_model(**model_inputs) + + # compute action probs - policy head + next_token_logits = output.logits[:, -1] + dist = self._action_dist.proba_distribution(action_logits=next_token_logits) + + actions = dist.mode() if deterministic else dist.sample() + action_log_probs = dist.log_prob(actions) + + return actions.unsqueeze(-1), action_log_probs.unsqueeze(-1), rnn_states + + def eval_actions( + self, obs, rnn_states, action, masks, action_masks=None, 
active_masks=None + ): + for key in obs.keys(): + obs[key] = torch.from_numpy(obs[key]) if type(obs[key]) == np.ndarray else obs[key] + obs[key] = obs[key].to(self._policy_model.device) + # obs[key] = check(obs[key], self.use_half, self.tpdv) + # if self._use_fp16: + # obs[key] = obs[key].half() + action = check(action).to(self._policy_model.device).squeeze() + rnn_states = check(rnn_states) + + input_ids = obs["input_encoded_pt"].int() + attention_mask = obs["input_attention_mask_pt"] + + past_model_kwargs = None + + if past_model_kwargs is None: + past_model_kwargs = { + "attention_mask": attention_mask, + } + + model_inputs = self._prepare_inputs_for_model( + self._policy_model, input_ids, past_model_kwargs + ) + + # forward pass to transformers + output = self._policy_model(**model_inputs) + + # compute action probs - policy head + next_token_logits = output.logits[:, -1] + dist = self._action_dist.proba_distribution(action_logits=next_token_logits) + + action_log_probs = dist.log_prob(action) + dist_entropy = dist.entropy() + values = None + + return action_log_probs.unsqueeze(-1), dist_entropy.mean(), values + + def get_policy_values(self, obs, rnn_states, masks): + raise NotImplementedError \ No newline at end of file diff --git a/openrl/modules/networks/value_network_gpt.py b/openrl/modules/networks/value_network_gpt.py new file mode 100644 index 00000000..13db87b8 --- /dev/null +++ b/openrl/modules/networks/value_network_gpt.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2021 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
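Taken together, the two new GPT networks (the policy file above and the value file whose body follows) split the previously shared policy-value network into separate actor and critic heads: the causal LM's next-token logits act as action logits, and the critic applies a bias-free linear head to the last token's hidden state. A self-contained sketch of just those two heads, with placeholder tensors standing in for the LM outputs (sizes are illustrative only):

```python
import torch
import torch.nn as nn
from torch.distributions import Categorical

batch, vocab_size, hidden_size = 4, 50257, 768  # GPT-2-like sizes, for illustration

# Stand-ins for what the causal LM returns at the final position of each sequence.
next_token_logits = torch.randn(batch, vocab_size)   # feeds the policy head
last_hidden_state = torch.randn(batch, hidden_size)  # feeds the value head

# Policy head: treat the next-token logits as the action distribution.
dist = Categorical(logits=next_token_logits)
action = dist.sample()                # sampled token id is the RL "action"
log_prob = dist.log_prob(action)
entropy = dist.entropy()

# Value head: a single bias-free linear layer on the last token's hidden state.
value_head = nn.Linear(hidden_size, 1, bias=False)
value = value_head(last_hidden_state)  # shape (batch, 1)
```

The `CategoricalDistribution` helper used in the new files exposes the same `sample`/`log_prob`/`entropy` operations over these logits, plus a mode for deterministic decoding.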
+ +"""""" +from typing import Any, Optional, Dict + +import numpy as np +import torch +import torch.nn as nn + +from openrl.buffers.utils.util import get_critic_obs_space +from openrl.modules.networks.base_value_network import BaseValueNetwork +from openrl.modules.networks.utils.cnn import CNNBase +from openrl.modules.networks.utils.mix import MIXBase +from openrl.modules.networks.utils.mlp import MLPBase, MLPLayer +from openrl.modules.networks.utils.popart import PopArt +from openrl.modules.networks.utils.rnn import RNNLayer +from openrl.modules.networks.utils.util import init +from openrl.modules.utils.valuenorm import ValueNorm +from openrl.utils.util import check_v2 as check + +from transformers.modeling_utils import unwrap_model + +class ValueNetworkGPT(BaseValueNetwork): + def __init__( + self, + cfg, + input_space, + action_space=None, + use_half=False, + device=torch.device("cpu"), + extra_args=None, + ): + + self.use_half = use_half + self.tpdv = dict(dtype=torch.float32, device=device) + + super(ValueNetworkGPT, self).__init__(cfg, device) + + from transformers import AutoModelForCausalLM + + self._value_model = AutoModelForCausalLM.from_pretrained(cfg.model_path) + self._value_model.config.use_cache = False + self._value_head = nn.Linear( + self._value_model.config.hidden_size, 1, bias=False + ) + self.value_normalizer = ( + ValueNorm(1, device=device) if self._use_valuenorm else None + ) + + self._value_head.to(self.device) + + + def _prepare_inputs_for_model( + self, + model: Any, + input_ids: torch.tensor, + model_kwargs: Optional[Dict[str, torch.tensor]] = None, + ): + model_inputs = unwrap_model(model).prepare_inputs_for_generation( + input_ids, **model_kwargs + ) + return model_inputs + + def forward(self, critic_obs, rnn_states, masks): + for key in critic_obs.keys(): + critic_obs[key] = torch.from_numpy(critic_obs[key]) if type(critic_obs[key]) == np.ndarray else critic_obs[key] + critic_obs[key] = critic_obs[key].to(self._value_model.device) + # critic_obs[key] = check(critic_obs[key], self.use_half, self.tpdv) + # if self._use_fp16: + # critic_obs[key] = critic_obs[key].half() + masks = check(masks).to(self._value_model.device) + rnn_states = check(rnn_states) + + input_ids = critic_obs["input_encoded_pt"].int() + attention_mask = critic_obs["input_attention_mask_pt"] + + past_model_kwargs = None + if not past_model_kwargs: + past_model_kwargs = { + "attention_mask": attention_mask, + } + + model_inputs = self._prepare_inputs_for_model( + self._value_model, input_ids, past_model_kwargs + ) + output = self._value_model(output_hidden_states=True, **model_inputs) + last_tokens_hidden = output.hidden_states[-1][:, -1] + values = self._value_head.forward(last_tokens_hidden) + + return values, rnn_states From 8c196b39d1755dee7cca44a581d9d61c6fab8fdb Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Mon, 13 Nov 2023 15:38:17 +0800 Subject: [PATCH 44/78] update --- examples/custom_env/pettingzoo_env.py | 2 +- examples/custom_env/rock_paper_scissors.py | 1 + openrl/envs/vec_env/sync_venv.py | 2 + openrl/envs/wrappers/extra_wrappers.py | 79 +++++++++++++++++++ .../wrappers/base_multiplayer_wrapper.py | 1 + 5 files changed, 84 insertions(+), 1 deletion(-) diff --git a/examples/custom_env/pettingzoo_env.py b/examples/custom_env/pettingzoo_env.py index d5644b7b..64211512 100644 --- a/examples/custom_env/pettingzoo_env.py +++ b/examples/custom_env/pettingzoo_env.py @@ -28,7 +28,7 @@ env = make( "RockPaperScissors", - env_num=10, + env_num=1, opponent_wrappers=[RandomOpponentWrapper], 
) diff --git a/examples/custom_env/rock_paper_scissors.py b/examples/custom_env/rock_paper_scissors.py index 71d0fa74..f18e1841 100644 --- a/examples/custom_env/rock_paper_scissors.py +++ b/examples/custom_env/rock_paper_scissors.py @@ -18,6 +18,7 @@ import functools +import time import gymnasium import numpy as np diff --git a/openrl/envs/vec_env/sync_venv.py b/openrl/envs/vec_env/sync_venv.py index 6a61d489..1e208e4c 100644 --- a/openrl/envs/vec_env/sync_venv.py +++ b/openrl/envs/vec_env/sync_venv.py @@ -15,6 +15,7 @@ # limitations under the License. """""" +import time from copy import deepcopy from typing import Any, Callable, Iterable, List, Optional, Sequence, Union @@ -202,6 +203,7 @@ def _step(self, actions: ActType): self._truncateds[i], info, ) = returns + need_reset = _need_reset and ( all(self._terminateds[i]) or all(self._truncateds[i]) ) diff --git a/openrl/envs/wrappers/extra_wrappers.py b/openrl/envs/wrappers/extra_wrappers.py index da819a87..27359d9e 100644 --- a/openrl/envs/wrappers/extra_wrappers.py +++ b/openrl/envs/wrappers/extra_wrappers.py @@ -21,6 +21,9 @@ import gymnasium as gym import numpy as np from gymnasium import spaces +from gymnasium.utils.step_api_compatibility import ( + convert_to_terminated_truncated_step_api, +) from gymnasium.wrappers import AutoResetWrapper, StepAPICompatibility from openrl.envs.wrappers import BaseObservationWrapper, BaseRewardWrapper, BaseWrapper @@ -46,6 +49,76 @@ def step(self, action): return obs, total_reward, term, trunc, info +def convert_to_done_step_api( + step_returns, + is_vector_env: bool = False, +): + if len(step_returns) == 4: + return step_returns + else: + assert len(step_returns) == 5 + observations, rewards, terminated, truncated, infos = step_returns + + # Cases to handle - info single env / info vector env (list) / info vector env (dict) + # if truncated[0]: + # import pdb; + # pdb.set_trace() + + if is_vector_env is False: + if isinstance(terminated, list): + infos["TimeLimit.truncated"] = truncated[0] and not terminated[0] + done_return = np.logical_or(terminated, truncated) + else: + if truncated or terminated: + infos["TimeLimit.truncated"] = truncated and not terminated + done_return = terminated or truncated + return ( + observations, + rewards, + done_return, + infos, + ) + elif isinstance(infos, list): + for info, env_truncated, env_terminated in zip( + infos, truncated, terminated + ): + if env_truncated or env_terminated: + info["TimeLimit.truncated"] = env_truncated and not env_terminated + return ( + observations, + rewards, + np.logical_or(terminated, truncated), + infos, + ) + elif isinstance(infos, dict): + if np.logical_or(np.any(truncated), np.any(terminated)): + infos["TimeLimit.truncated"] = np.logical_and( + truncated, np.logical_not(terminated) + ) + return ( + observations, + rewards, + np.logical_or(terminated, truncated), + infos, + ) + else: + raise TypeError( + "Unexpected value of infos, as is_vector_envs=False, expects `info` to" + f" be a list or dict, actual type: {type(infos)}" + ) + + +def step_api_compatibility( + step_returns, + output_truncation_bool: bool = True, + is_vector_env: bool = False, +): + if output_truncation_bool: + return convert_to_terminated_truncated_step_api(step_returns, is_vector_env) + else: + return convert_to_done_step_api(step_returns, is_vector_env) + + class RemoveTruncated(StepAPICompatibility, BaseWrapper): def __init__( self, @@ -54,6 +127,12 @@ def __init__( output_truncation_bool = False super().__init__(env, 
output_truncation_bool=output_truncation_bool) + def step(self, action): + step_returns = self.env.step(action) + return step_api_compatibility( + step_returns, self.output_truncation_bool, self.is_vector_env + ) + class FlattenObservation(BaseObservationWrapper): def __init__(self, env: gym.Env): diff --git a/openrl/selfplay/wrappers/base_multiplayer_wrapper.py b/openrl/selfplay/wrappers/base_multiplayer_wrapper.py index 5cd6116c..ca8d1e95 100644 --- a/openrl/selfplay/wrappers/base_multiplayer_wrapper.py +++ b/openrl/selfplay/wrappers/base_multiplayer_wrapper.py @@ -104,6 +104,7 @@ def reset(self, *, seed: Optional[int] = None, **kwargs): action = self.get_opponent_action( player_name, observation, reward, termination, truncation, info ) + self.env.step(action) def on_episode_end( From 6e9ce0f6a81309acecaee6f09924d1b788d49219 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Mon, 13 Nov 2023 15:41:47 +0800 Subject: [PATCH 45/78] fix petting zoo --- examples/custom_env/pettingzoo_env.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/custom_env/pettingzoo_env.py b/examples/custom_env/pettingzoo_env.py index 64211512..d5644b7b 100644 --- a/examples/custom_env/pettingzoo_env.py +++ b/examples/custom_env/pettingzoo_env.py @@ -28,7 +28,7 @@ env = make( "RockPaperScissors", - env_num=1, + env_num=10, opponent_wrappers=[RandomOpponentWrapper], ) From f1ecdef34808a0ed9a95ee4ce3954f2f6b44f01b Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Wed, 15 Nov 2023 15:02:14 +0800 Subject: [PATCH 46/78] add MAT network test --- .../test_networks/test_MAT_network.py | 59 +++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 tests/test_modules/test_networks/test_MAT_network.py diff --git a/tests/test_modules/test_networks/test_MAT_network.py b/tests/test_modules/test_networks/test_MAT_network.py new file mode 100644 index 00000000..25b4fd00 --- /dev/null +++ b/tests/test_modules/test_networks/test_MAT_network.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" + +import os +import sys + +import numpy as np +import pytest +from gymnasium import spaces + +from openrl.configs.config import create_config_parser +from openrl.modules.networks.MAT_network import MultiAgentTransformer + + +@pytest.fixture(scope="module", params=[""]) +def config(request): + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +@pytest.mark.unittest +def test_MAT_network(config): + net = MultiAgentTransformer( + config, + input_space=spaces.Discrete(2), + action_space=spaces.Discrete(2), + ) + net.get_actor_para() + net.get_critic_para() + + obs = np.zeros([1, 2]) + rnn_states = np.zeros(2) + masks = np.zeros(2) + action = np.zeros(1) + net.get_actions(obs=obs, masks=masks) + net.eval_actions( + obs=obs, rnn_states=rnn_states, action=action, masks=masks, action_masks=None + ) + net.get_values(critic_obs=obs, masks=masks) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 58c022f86ca58639f9504ea4515835c248238bcc Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 23 Nov 2023 14:07:51 +0800 Subject: [PATCH 47/78] update README.md --- README.md | 6 +++--- README_zh.md | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7c5cfbbe..2b175f86 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@
--- @@ -25,7 +25,7 @@ [![Contributors](https://img.shields.io/github/contributors/OpenRL-Lab/openrl)](https://github.com/OpenRL-Lab/openrl/graphs/contributors) [![GitHub license](https://img.shields.io/github/license/OpenRL-Lab/openrl)](https://github.com/OpenRL-Lab/openrl/blob/master/LICENSE) -[![Embark](https://img.shields.io/badge/discord-OpenRL-%237289da.svg?logo=discord)](https://discord.gg/guvAS2up) +[![Embark](https://img.shields.io/badge/discord-OpenRL-%237289da.svg?logo=discord)](https://discord.gg/qMbVT2qBhr) [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://join.slack.com/t/openrlhq/shared_invite/zt-1tqwpvthd-Eeh0IxQ~DIaGqYXoW2IUQg) OpenRL-v0.1.10 is updated on Oct 27, 2023 @@ -333,7 +333,7 @@ If you are using OpenRL in your research project, you are also welcome to join t - Join the [slack](https://join.slack.com/t/openrlhq/shared_invite/zt-1tqwpvthd-Eeh0IxQ~DIaGqYXoW2IUQg) group to discuss OpenRL usage and development with us. -- Join the [Discord](https://discord.gg/guvAS2up) group to discuss OpenRL usage and development with us. +- Join the [Discord](https://discord.gg/qMbVT2qBhr) group to discuss OpenRL usage and development with us. - Send an E-mail to: [huangshiyu@4paradigm.com](huangshiyu@4paradigm.com) - Join the [GitHub Discussion](https://github.com/orgs/OpenRL-Lab/discussions). diff --git a/README_zh.md b/README_zh.md index aa193ed7..91cd7642 100644 --- a/README_zh.md +++ b/README_zh.md @@ -1,5 +1,5 @@
@@ -26,7 +26,7 @@ [![Contributors](https://img.shields.io/github/contributors/OpenRL-Lab/openrl)](https://github.com/OpenRL-Lab/openrl/graphs/contributors) [![GitHub license](https://img.shields.io/github/license/OpenRL-Lab/openrl)](https://github.com/OpenRL-Lab/openrl/blob/master/LICENSE) -[![Embark](https://img.shields.io/badge/discord-OpenRL-%237289da.svg?logo=discord)](https://discord.gg/guvAS2up) +[![Embark](https://img.shields.io/badge/discord-OpenRL-%237289da.svg?logo=discord)](https://discord.gg/qMbVT2qBhr) [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://join.slack.com/t/openrlhq/shared_invite/zt-1tqwpvthd-Eeh0IxQ~DIaGqYXoW2IUQg) OpenRL-v0.1.10 is updated on Oct 27, 2023 @@ -293,7 +293,7 @@ openrl --mode train --env CartPole-v1 - 加入 [slack](https://join.slack.com/t/openrlhq/shared_invite/zt-1tqwpvthd-Eeh0IxQ~DIaGqYXoW2IUQg) 群组,与我们一起讨论OpenRL的使用和开发。 -- 加入 [Discord](https://discord.gg/guvAS2up) 群组,与我们一起讨论OpenRL的使用和开发。 +- 加入 [Discord](https://discord.gg/qMbVT2qBhr) 群组,与我们一起讨论OpenRL的使用和开发。 - 发送邮件到: [huangshiyu@4paradigm.com](huangshiyu@4paradigm.com) - 加入 [GitHub Discussion](https://github.com/orgs/OpenRL-Lab/discussions) From b08d096bfa69e2a7bb8c903c3a4e8fb466f8f11d Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 23 Nov 2023 14:51:34 +0800 Subject: [PATCH 48/78] add selfplay test --- openrl/selfplay/opponents/utils.py | 3 + openrl/selfplay/strategies/__init__.py | 41 -- openrl/selfplay/strategies/base_strategy.py | 39 -- openrl/selfplay/strategies/strategies.py | 413 ------------------ tests/test_selfplay/test_selfplay_strategy.py | 91 ---- tests/test_selfplay/test_train_selfplay.py | 120 +++++ 6 files changed, 123 insertions(+), 584 deletions(-) delete mode 100644 openrl/selfplay/strategies/__init__.py delete mode 100644 openrl/selfplay/strategies/base_strategy.py delete mode 100644 openrl/selfplay/strategies/strategies.py delete mode 100644 tests/test_selfplay/test_selfplay_strategy.py create mode 100644 tests/test_selfplay/test_train_selfplay.py diff --git a/openrl/selfplay/opponents/utils.py b/openrl/selfplay/opponents/utils.py index d1d983d5..42ddbb2b 100644 --- a/openrl/selfplay/opponents/utils.py +++ b/openrl/selfplay/opponents/utils.py @@ -28,6 +28,9 @@ def check_opponent_template(opponent_template: Union[str, Path]): + assert isinstance(opponent_template, Path) or isinstance( + opponent_template, str + ), f"opponent_template {opponent_template} must be a Path or str" if isinstance(opponent_template, str): opponent_template = Path(opponent_template) assert ( diff --git a/openrl/selfplay/strategies/__init__.py b/openrl/selfplay/strategies/__init__.py deleted file mode 100644 index 2908f8b4..00000000 --- a/openrl/selfplay/strategies/__init__.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright 2023 The OpenRL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -"""""" -from openrl.selfplay.strategies.strategies import ( - NaiveSelfplayStrategy, - OnlyLatestSelfplayStrategy, - VarExistEnemySelfplayStrategy, - WeightExistEnemySelfplayStrategy, - WeightSelfplayStrategy, - WinRateSelfplayStrategy, -) - - -def make_strategy(strategy_name): - if strategy_name == "Naive": - selfplay_strategy = NaiveSelfplayStrategy - elif strategy_name == "OnlyLatest": - selfplay_strategy = OnlyLatestSelfplayStrategy - elif strategy_name == "Weight": - selfplay_strategy = WeightSelfplayStrategy - elif strategy_name == "WinRate": - selfplay_strategy = WinRateSelfplayStrategy - elif strategy_name == "VarExistEnemy": - selfplay_strategy = VarExistEnemySelfplayStrategy - elif strategy_name == "WeightExistEnemy": - selfplay_strategy = WeightExistEnemySelfplayStrategy - return selfplay_strategy diff --git a/openrl/selfplay/strategies/base_strategy.py b/openrl/selfplay/strategies/base_strategy.py deleted file mode 100644 index 4e280b13..00000000 --- a/openrl/selfplay/strategies/base_strategy.py +++ /dev/null @@ -1,39 +0,0 @@ -from abc import abstractmethod - - -class BaseSelfplayStrategy: - @abstractmethod - def __init__(self, all_args, nenvs, exist_enemy_num): - raise NotImplementedError - - @abstractmethod - def getcnt(self): - raise NotImplementedError - - @abstractmethod - def update_enemy_ids(self, new_enemy_ids): - raise NotImplementedError - - @abstractmethod - def restore(self, model_dir): - raise NotImplementedError - - @abstractmethod - def get_qlist(self): - raise NotImplementedError - - @abstractmethod - def update_weight(self, enemy_loses): - raise NotImplementedError - - @abstractmethod - def update_win_rate(self, dones, enemy_wins): - raise NotImplementedError - - @abstractmethod - def push_newone(self): - raise NotImplementedError - - @abstractmethod - def get_plist(self): - raise NotImplementedError diff --git a/openrl/selfplay/strategies/strategies.py b/openrl/selfplay/strategies/strategies.py deleted file mode 100644 index 28e492ec..00000000 --- a/openrl/selfplay/strategies/strategies.py +++ /dev/null @@ -1,413 +0,0 @@ -import json - -import numpy as np - -from openrl.selfplay.strategies.base_strategy import BaseSelfplayStrategy - - -class SelfplayStrategy(BaseSelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - # qlist和history_cnt的数据结构 - self.all_args = all_args - self.qlist = [] - self.history_cnt = 0 - self.enemy_ids = [0] * nenvs - self.length = nenvs - - def getcnt(self): - return self.history_cnt - - def update_enemy_ids(self, new_enemy_ids): - self.enemy_ids = new_enemy_ids - - def restore(self, model_dir): - with open(model_dir + "/enemy_history_info.json") as f_obj: - enemy_info = json.load(f_obj) - self.qlist = enemy_info["qlist"] - self.history_cnt = enemy_info["history_cnt"] - - def get_qlist(self): - return self.qlist - - def update_weight(self, enemy_loses): - pass - - def update_win_rate(self, dones, enemy_wins): - pass - - def push_newone(self): - pass - - -class RatioSelfplayStrategy(SelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(RatioSelfplayStrategy, self).__init__(all_args, nenvs) - - def push_newone(self): - self.history_cnt += 1 - - def get_plist(self): - if self.history_cnt == 1: - return [1] - temp_plist = np.logspace( - 0, self.history_cnt - 1, self.history_cnt, endpoint=True, base=1.5 - ) - temp_plist[-1] = sum(temp_plist[:-1]) * 4 - temp_plist /= sum(temp_plist) - return temp_plist - - -class NaiveSelfplayStrategy(SelfplayStrategy): - def __init__(self, all_args, nenvs, 
exist_enemy_num): - super(NaiveSelfplayStrategy, self).__init__(all_args, nenvs, exist_enemy_num) - - def push_newone(self): - self.history_cnt += 1 - - def get_plist(self): - return [1] * (self.history_cnt - 1) + [4 * (self.history_cnt - 1)] - - def save_new_one(self): - return True - - -class OnlyLatestSelfplayStrategy(SelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(OnlyLatestSelfplayStrategy, self).__init__( - all_args, nenvs, exist_enemy_num - ) - self.play_list = [] - self.max_play_num = all_args.max_play_num - self.least_win_rate = all_args.least_win_rate - - def push_newone(self): - self.play_list.append([]) - self.history_cnt += 1 - - def get_plist(self): - return [0] * (self.history_cnt - 1) + [1] - - def save_new_one(self, least_win_rate): - if sum(np.array(self.play_list[-1]) == -1) >= least_win_rate * ( - len(self.play_list[-1]) + 1 - ) and len(self.play_list[-1]) >= (self.max_play_num - 10): - return True - - def update_play_list(self, win_enemy_ids, tie_enemy_ids, lose_enemy_ids): - for win_enemy_id in win_enemy_ids: - self.play_list[win_enemy_id].append(1) - for tie_enemy_id in tie_enemy_ids: - self.play_list[tie_enemy_id].append(0) - for lose_enemy_id in lose_enemy_ids: - self.play_list[lose_enemy_id].append(-1) - self.cut_overflow() - - def update_win_rate(self, enemy_wins, enemy_ties, enemy_loses): - win_enemy_ids = np.array(self.enemy_ids)[enemy_wins] - tie_enemy_ids = np.array(self.enemy_ids)[enemy_ties] - lose_enemy_ids = np.array(self.enemy_ids)[enemy_loses] - self.update_play_list(win_enemy_ids, tie_enemy_ids, lose_enemy_ids) - - def cut_overflow(self): - for index in range(len(self.play_list)): - if len(self.play_list[index]) > self.max_play_num: - self.play_list[index] = self.play_list[index][ - (-1) * self.max_play_num : - ] - - def get_info_list(self, info_list): - return_info = [] - for info in info_list: - if info == "win": - equal_num = 1 - elif info == "tie": - equal_num = 0 - elif info == "lose": - equal_num = -1 - num_list = [] - for enemy_play_list in self.play_list: - if info == "play": - num_list.append(len(enemy_play_list)) - else: - num_list.append(int(sum(np.array(enemy_play_list) == equal_num))) - return_info.append(num_list) - return tuple(return_info) - - def get_enemy_play_dict(self): - win_num_list, tie_num_list, lose_num_list, play_num_list = self.get_info_list( - ["win", "tie", "lose", "play"] - ) - return { - "win_num_list": list(win_num_list), - "tie_num_list": list(tie_num_list), - "lose_num_list": list(lose_num_list), - "play_num_list": list(play_num_list), - } - - -class WeightSelfplayStrategy(SelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(WeightSelfplayStrategy, self).__init__(all_args, nenvs, exist_enemy_num) - self.recent_weight = 0.8 - self.recent_num = 3 - self.gama = 1 / (nenvs) - - def push_newone(self): - self.history_cnt += 1 - if self.history_cnt <= self.recent_num: - return - elif self.history_cnt == self.recent_num + 1: - self.qlist = [1] - else: - self.qlist.append(max(self.qlist)) - - def get_plist(self): - temp_plist = np.zeros([self.history_cnt]) - temp_plist[: (-1 * self.recent_num)] = ( - np.exp(self.qlist) / sum(np.exp(self.qlist)) * (1 - self.recent_weight) - ) - temp_plist[(-1 * self.recent_num) :] = self.recent_weight / self.recent_num - return temp_plist - - def update_weight(self, enemy_loses): - if self.history_cnt < self.recent_num + 2: - return - lose_enemy_ids = np.array(self.enemy_ids)[ - enemy_loses - ] # 输了的enemy_ids,进行更新,其中可能有重复的enemy_id 
- for enemy_id in lose_enemy_ids: - if enemy_id <= len(self.qlist) - 1: - divide_num = ( - len(self.qlist) - * np.exp(self.qlist[enemy_id]) - / sum(np.exp(self.qlist)) - ) - next_weight = self.qlist[enemy_id] - self.gama / divide_num - self.qlist[enemy_id] = next_weight - - -class WinRateSelfplayStrategy(SelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(WinRateSelfplayStrategy, self).__init__(all_args, nenvs, exist_enemy_num) - self.max_play_num = all_args.max_play_num - self.play_list = ( - [] - ) # 在该list中,每个对手维护一个长度不超过max_play_num的列表,1为该对手获胜, 0为平, -1为我方获胜 - self.recent_list = [] - self.recent_list_max_len = all_args.recent_list_max_len - self.latest_weight = all_args.latest_weight - self.least_win_rate = all_args.least_win_rate - self.stage2_least_win_rate = all_args.least_win_rate - self.stage = 1 - self.newest_pos = all_args.newest_pos - self.newest_weight = all_args.newest_weight - - def push_newone(self): - self.play_list.append([]) - self.history_cnt += 1 - - def get_info_list(self, info_list): - return_info = [] - for info in info_list: - if info == "win": - equal_num = 1 - elif info == "tie": - equal_num = 0 - elif info == "lose": - equal_num = -1 - num_list = [] - for enemy_play_list in self.play_list: - if info == "play": - num_list.append(len(enemy_play_list)) - else: - num_list.append(int(sum(np.array(enemy_play_list) == equal_num))) - return_info.append(num_list) - return tuple(return_info) - - def get_plist(self): - def f_hard(win_rate_list): - p = 1 - return win_rate_list**p - - def f_var(win_rate_list): - return (1 - win_rate_list) * win_rate_list - - win_num_list, tie_num_list, play_num_list = self.get_info_list( - ["win", "tie", "play"] - ) - win_rate_list = ( - np.array(win_num_list) + 0.5 * np.array(tie_num_list) + 0.5 - ) / (np.array(play_num_list) + 1) - return f_hard(win_rate_list) - - def update_play_list(self, win_enemy_ids, tie_enemy_ids, lose_enemy_ids): - if self.stage == 2: - win_enemy_num = (np.array(win_enemy_ids) != self.newest_pos).sum() - tie_enemy_num = (np.array(tie_enemy_ids) != self.newest_pos).sum() - lose_enemy_num = (np.array(lose_enemy_ids) != self.newest_pos).sum() - self.recent_list += ( - [1] * win_enemy_num + [0] * tie_enemy_num + [-1] * lose_enemy_num - ) - for win_enemy_id in win_enemy_ids: - self.play_list[win_enemy_id].append(1) - for tie_enemy_id in tie_enemy_ids: - self.play_list[tie_enemy_id].append(0) - for lose_enemy_id in lose_enemy_ids: - self.play_list[lose_enemy_id].append(-1) - self.cut_overflow() - - def update_win_rate(self, enemy_wins, enemy_ties, enemy_loses): - win_enemy_ids = np.array(self.enemy_ids)[enemy_wins] - tie_enemy_ids = np.array(self.enemy_ids)[enemy_ties] - lose_enemy_ids = np.array(self.enemy_ids)[enemy_loses] - self.update_play_list(win_enemy_ids, tie_enemy_ids, lose_enemy_ids) - - def restore(self, model_dir): - with open(model_dir + "/enemy_history_info.json") as f_obj: - enemy_info = json.load(f_obj) - self.history_cnt = enemy_info["history_cnt"] - self.play_list = enemy_info["play_list"] - - def get_enemy_play_dict(self): - win_num_list, tie_num_list, lose_num_list, play_num_list = self.get_info_list( - ["win", "tie", "lose", "play"] - ) - return { - "win_num_list": list(win_num_list), - "tie_num_list": list(tie_num_list), - "lose_num_list": list(lose_num_list), - "play_num_list": list(play_num_list), - } - - def update_win_info(self, data): - win_enemy_ids, tie_enemy_ids, lose_enemy_ids = ( - data["win_enemy_ids"], - data["tie_enemy_ids"], - data["lose_enemy_ids"], - ) - 
self.update_play_list(win_enemy_ids, tie_enemy_ids, lose_enemy_ids) - - def cut_overflow(self): - for index in range(len(self.play_list)): - if len(self.play_list[index]) > self.max_play_num: - self.play_list[index] = self.play_list[index][ - (-1) * self.max_play_num : - ] - if len(self.recent_list) > self.recent_list_max_len: - self.recent_list = self.recent_list[(-1) * self.recent_list_max_len :] - - def save_new_one(self, least_win_rate): - if self.stage == 1: - if sum(np.array(self.play_list[-1]) == -1) >= least_win_rate * ( - len(self.play_list[-1]) + 1 - ) and len(self.play_list[-1]) >= (self.max_play_num - 10): - if self.getcnt() - self.all_args.exist_enemy_num == 1: - return True - self.stage = 2 - print("switch to stage 2") - if self.stage == 2: - if sum(np.array(self.recent_list) == -1) >= self.stage2_least_win_rate * ( - len(self.recent_list) + 1 - ) and len(self.recent_list) >= (self.recent_list_max_len - 10): - self.stage = 1 - self.recent_list = [] - return True - return False - - -class ExistEnemySelfplayStrategy(WinRateSelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(ExistEnemySelfplayStrategy, self).__init__( - all_args, nenvs, exist_enemy_num - ) - self.all_args = all_args - self.enemy_ids = [0] * nenvs # 第一个step就会更新,所以初始化无所谓 - # 列表的前exist_enemy_num个为已存在的对手 - if exist_enemy_num > 0: - self.play_list = [[]] * exist_enemy_num - self.history_cnt = exist_enemy_num - self.exist_enemy_num = exist_enemy_num - self.max_enemy_num = all_args.max_enemy_num - - def get_final_plist(self, f_hard, f_var): - raise NotImplementedError - - def get_plist(self): - def f_hard(win_rate_list): - p = 2 - return win_rate_list**p - - def f_var(win_rate_list): - return (1 - win_rate_list) * win_rate_list - - plist = self.get_final_plist(f_hard, f_var) - if self.max_enemy_num != -1: - if self.history_cnt - self.exist_enemy_num > self.max_enemy_num: - mask_index = np.array( - list( - range( - self.exist_enemy_num, self.history_cnt - self.max_enemy_num - ) - ) - ) - zero_vec = np.zeros( - self.history_cnt - self.exist_enemy_num - self.max_enemy_num - ) - plist[mask_index] = zero_vec - - return plist - - -class VarExistEnemySelfplayStrategy(ExistEnemySelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(VarExistEnemySelfplayStrategy, self).__init__( - all_args, nenvs, exist_enemy_num - ) - - def get_final_plist(self, f_hard, f_var): - win_num_list, tie_num_list, play_num_list = self.get_info_list( - ["win", "tie", "play"] - ) - win_rate_list = ( - np.array(win_num_list) + 0.5 * np.array(tie_num_list) + 0.5 - ) / (np.array(play_num_list) + 1) - win_rate_list = f_var(win_rate_list) - - return win_rate_list - - -class WeightExistEnemySelfplayStrategy(ExistEnemySelfplayStrategy): - def __init__(self, all_args, nenvs, exist_enemy_num): - super(WeightExistEnemySelfplayStrategy, self).__init__( - all_args, nenvs, exist_enemy_num - ) - - def get_final_plist(self, f_hard, f_var): - win_num_list, tie_num_list, play_num_list = self.get_info_list( - ["win", "tie", "play"] - ) - win_rate_list = ( - np.array(win_num_list) + 0.5 * np.array(tie_num_list) + 0.5 - ) / (np.array(play_num_list) + 1) - - if self.stage == 1: - win_rate_list = f_hard(win_rate_list)[:-1] - # if self.newest_pos != -1: - # win_rate_list[self.newest_pos] = 0 - win_rate_list = ( - win_rate_list / (sum(win_rate_list) + 1e-8) * (1 - self.latest_weight) - ) - return list(win_rate_list) + [self.latest_weight] - elif self.stage == 2: - win_rate_list = f_hard(win_rate_list) - if 
self.newest_pos != -1: - win_rate_list[self.newest_pos] = self.newest_weight - index_without_newest = list(range(self.history_cnt)) - index_without_newest.remove(self.newest_pos) - win_rate_list[index_without_newest] /= sum( - win_rate_list[index_without_newest] - ) - win_rate_list[index_without_newest] *= 1 - self.newest_weight - else: - win_rate_list /= sum(win_rate_list) - return win_rate_list diff --git a/tests/test_selfplay/test_selfplay_strategy.py b/tests/test_selfplay/test_selfplay_strategy.py deleted file mode 100644 index 61b04052..00000000 --- a/tests/test_selfplay/test_selfplay_strategy.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright 2023 The OpenRL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""""" -import os -import sys - -import pytest - -from openrl.selfplay.strategies import ( - NaiveSelfplayStrategy, - OnlyLatestSelfplayStrategy, - VarExistEnemySelfplayStrategy, - WeightExistEnemySelfplayStrategy, - WeightSelfplayStrategy, - WinRateSelfplayStrategy, -) - - -@pytest.fixture(scope="module", params=[""]) -def config(request): - from openrl.configs.config import create_config_parser - - cfg_parser = create_config_parser() - cfg = cfg_parser.parse_args(request.param.split()) - return cfg - - -@pytest.mark.unittest -def test_naive_selfplay(config): - strategy = NaiveSelfplayStrategy(config, 1, 1) - strategy.get_plist() - strategy.update_weight(enemy_loses=1) - strategy.update_win_rate(dones=True, enemy_wins=1) - strategy.push_newone() - - -@pytest.mark.unittest -def test_only_latest_selfplay(config): - strategy = OnlyLatestSelfplayStrategy(config, 1, 1) - strategy.get_plist() - strategy.update_weight(enemy_loses=1) - strategy.push_newone() - - -@pytest.mark.unittest -def test_weight_selfplay(config): - strategy = WeightSelfplayStrategy(config, 1, 1) - strategy.get_plist() - strategy.update_weight(enemy_loses=1) - strategy.push_newone() - - -@pytest.mark.unittest -def test_win_rate_selfplay(config): - strategy = WinRateSelfplayStrategy(config, 1, 1) - strategy.get_plist() - strategy.update_weight(enemy_loses=1) - - -@pytest.mark.unittest -def test_var_exist_enemy_selfplay(config): - strategy = VarExistEnemySelfplayStrategy(config, 1, 1) - strategy.get_plist() - strategy.update_weight(enemy_loses=1) - strategy.push_newone() - - -@pytest.mark.unittest -def test_weight_exist_enemy_selfplay(config): - strategy = WeightExistEnemySelfplayStrategy(config, 1, 1) - strategy.get_plist() - strategy.update_weight(enemy_loses=1) - strategy.push_newone() - - -if __name__ == "__main__": - sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py new file mode 100644 index 00000000..9e7b501f --- /dev/null +++ b/tests/test_selfplay/test_train_selfplay.py @@ -0,0 +1,120 @@ +import os +import sys + +import numpy as np +import pytest +import torch + +from openrl.configs.config import create_config_parser +from openrl.envs.common import 
make +from openrl.envs.wrappers import FlattenObservation +from openrl.envs.wrappers.pettingzoo_wrappers import RecordWinner +from openrl.modules.common import PPONet as Net +from openrl.runners.common import PPOAgent as Agent +from openrl.selfplay.wrappers.opponent_pool_wrapper import OpponentPoolWrapper +from openrl.selfplay.wrappers.random_opponent_wrapper import RandomOpponentWrapper + + +@pytest.fixture( + scope="module", + params=[ + "RandomOpponent", + "LastOpponent", + ], +) +def config(request): + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(["--config", "./examples/selfplay/selfplay.yaml"]) + for i, c in enumerate(cfg.callbacks): + if c["id"] == "SelfplayCallback": + c["args"][ + "opponent_template" + ] = "./examples/selfplay/opponent_templates/tictactoe_opponent" + cfg.callbacks[i] = c + elif c["id"] == "SelfplayAPI": + c["args"]["sample_strategy"] = request.param + cfg.callbacks[i] = c + else: + pass + + return cfg + + +def train(cfg): + # Create environment + env_num = 2 + render_model = None + env = make( + "tictactoe_v3", + render_mode=render_model, + env_num=env_num, + asynchronous=True, + opponent_wrappers=[RecordWinner, OpponentPoolWrapper], + env_wrappers=[FlattenObservation], + cfg=cfg, + ) + # Create neural network + + net = Net(env, cfg=cfg, device="cuda" if torch.cuda.is_available() else "cpu") + # Create agent + agent = Agent(net) + # Begin training + agent.train(total_time_steps=100) + env.close() + agent.save("./selfplay_agent/") + return agent + + +def evaluation(): + from examples.selfplay.tictactoe_utils.tictactoe_render import TictactoeRender + + print("Evaluation...") + env_num = 1 + env = make( + "tictactoe_v3", + env_num=env_num, + asynchronous=True, + opponent_wrappers=[TictactoeRender, RandomOpponentWrapper], + env_wrappers=[FlattenObservation], + auto_reset=False, + ) + + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args([]) + net = Net(env, cfg=cfg, device="cuda" if torch.cuda.is_available() else "cpu") + + agent = Agent(net) + + agent.load("./selfplay_agent/") + agent.set_env(env) + env.reset(seed=0) + + total_reward = 0.0 + ep_num = 2 + for ep_now in range(ep_num): + obs, info = env.reset() + done = False + step = 0 + + while not np.any(done): + # predict next action based on the observation + action, _ = agent.act(obs, info, deterministic=True) + obs, r, done, info = env.step(action) + step += 1 + + if np.any(done): + total_reward += np.mean(r) > 0 + print(f"{ep_now}/{ep_num}: reward: {np.mean(r)}") + print(f"win rate: {total_reward/ep_num}") + env.close() + print("Evaluation finished.") + + +@pytest.mark.unittest +def test_train_selfplay(config): + train(config) + evaluation() + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 7373b044228d9b34b621730076973dde5e55c98d Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 23 Nov 2023 15:16:45 +0800 Subject: [PATCH 49/78] add selfplay test --- tests/test_selfplay/test_train_selfplay.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py index 9e7b501f..c2ae29be 100644 --- a/tests/test_selfplay/test_train_selfplay.py +++ b/tests/test_selfplay/test_train_selfplay.py @@ -59,22 +59,20 @@ def train(cfg): # Create agent agent = Agent(net) # Begin training - agent.train(total_time_steps=100) + agent.train(total_time_steps=20) env.close() agent.save("./selfplay_agent/") return agent def evaluation(): - from 
examples.selfplay.tictactoe_utils.tictactoe_render import TictactoeRender - print("Evaluation...") env_num = 1 env = make( "tictactoe_v3", env_num=env_num, asynchronous=True, - opponent_wrappers=[TictactoeRender, RandomOpponentWrapper], + opponent_wrappers=[RandomOpponentWrapper], env_wrappers=[FlattenObservation], auto_reset=False, ) From 7ded5d55c8b2793bbf9624edebbe9d2d64857a4d Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 23 Nov 2023 16:25:25 +0800 Subject: [PATCH 50/78] add selfplay test --- examples/selfplay/selfplay.yaml | 2 +- openrl/selfplay/callbacks/selfplay_api.py | 6 +++++- setup.py | 10 ++++++++-- tests/test_selfplay/test_train_selfplay.py | 14 +++++++++++--- 4 files changed, 25 insertions(+), 7 deletions(-) diff --git a/examples/selfplay/selfplay.yaml b/examples/selfplay/selfplay.yaml index 7a7c1bbe..8a05611d 100644 --- a/examples/selfplay/selfplay.yaml +++ b/examples/selfplay/selfplay.yaml @@ -1,6 +1,6 @@ globals: selfplay_api_host: 127.0.0.1 - selfplay_api_port: 10086 + selfplay_api_port: 13486 seed: 0 selfplay_api: diff --git a/openrl/selfplay/callbacks/selfplay_api.py b/openrl/selfplay/callbacks/selfplay_api.py index 3d148749..cdf9d04d 100644 --- a/openrl/selfplay/callbacks/selfplay_api.py +++ b/openrl/selfplay/callbacks/selfplay_api.py @@ -57,7 +57,10 @@ def _init_callback(self) -> None: success = self.api_client.set_sample_strategy(self.sample_strategy) try_time -= 1 if try_time <= 0: - raise RuntimeError("Failed to set sample strategy.") + raise RuntimeError( + f"Failed to set sample strategy: {self.sample_strategy}. host:" + f" {self.host}, port: {self.port}" + ) def _on_step(self) -> bool: # print("To send request to API server.") @@ -72,5 +75,6 @@ def _on_training_end(self) -> None: print(f"deleting {application_name}") serve.delete(application_name) del self.bind + serve.shutdown() if self.verbose >= 2: print(f"delete {application_name} done!") diff --git a/setup.py b/setup.py index 84da342e..7c8f31a5 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def get_install_requires() -> list: return [ "setuptools>=67.0", - "gymnasium", + "gymnasium>=0.29", "click", "termcolor", "gym", @@ -71,7 +71,13 @@ def get_extra_requires() -> dict: "evaluate", ], "selfplay": ["ray[default]", "ray[serve]", "pettingzoo[classic]", "trueskill"], - "selfplay_test": ["pettingzoo[mpe]", "pettingzoo[butterfly]"], + "selfplay_test": [ + "ray[default]", + "ray[serve]", + "fastapi", + "pettingzoo[mpe]", + "pettingzoo[butterfly]", + ], "retro": ["gym-retro"], "super_mario": ["gym-super-mario-bros"], "atari": ["gymnasium[atari]", "gymnasium[accept-rom-license]"], diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py index c2ae29be..7e440bad 100644 --- a/tests/test_selfplay/test_train_selfplay.py +++ b/tests/test_selfplay/test_train_selfplay.py @@ -3,6 +3,7 @@ import numpy as np import pytest +import ray import torch from openrl.configs.config import create_config_parser @@ -18,22 +19,29 @@ @pytest.fixture( scope="module", params=[ - "RandomOpponent", - "LastOpponent", + {"port": 13486, "strategy": "RandomOpponent"}, + {"port": 13487, "strategy": "LastOpponent"}, ], ) def config(request): cfg_parser = create_config_parser() cfg = cfg_parser.parse_args(["--config", "./examples/selfplay/selfplay.yaml"]) + cfg.selfplay_api.port = request.param["port"] for i, c in enumerate(cfg.callbacks): if c["id"] == "SelfplayCallback": c["args"][ "opponent_template" ] = "./examples/selfplay/opponent_templates/tictactoe_opponent" + port = 
c["args"]["api_address"].split(":")[-1].split("/")[0] + c["args"]["api_address"] = c["args"]["api_address"].replace( + port, str(request.param["port"]) + ) cfg.callbacks[i] = c elif c["id"] == "SelfplayAPI": - c["args"]["sample_strategy"] = request.param + c["args"]["sample_strategy"] = request.param["strategy"] + c["args"]["port"] = request.param["port"] cfg.callbacks[i] = c + else: pass From 2bd465841e915f1fc0b2a462f93790ed9b411369 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 23 Nov 2023 16:36:24 +0800 Subject: [PATCH 51/78] add selfplay test --- tests/test_selfplay/test_train_selfplay.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py index 7e440bad..9af75a1c 100644 --- a/tests/test_selfplay/test_train_selfplay.py +++ b/tests/test_selfplay/test_train_selfplay.py @@ -3,7 +3,6 @@ import numpy as np import pytest -import ray import torch from openrl.configs.config import create_config_parser @@ -20,7 +19,6 @@ scope="module", params=[ {"port": 13486, "strategy": "RandomOpponent"}, - {"port": 13487, "strategy": "LastOpponent"}, ], ) def config(request): @@ -67,7 +65,7 @@ def train(cfg): # Create agent agent = Agent(net) # Begin training - agent.train(total_time_steps=20) + agent.train(total_time_steps=10) env.close() agent.save("./selfplay_agent/") return agent From 17cf742192511b40550e48265e3a2468df586150 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 23 Nov 2023 16:54:04 +0800 Subject: [PATCH 52/78] add selfplay test --- tests/test_selfplay/test_train_selfplay.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py index 9af75a1c..bdeb40c1 100644 --- a/tests/test_selfplay/test_train_selfplay.py +++ b/tests/test_selfplay/test_train_selfplay.py @@ -1,4 +1,5 @@ import os +import socket import sys import numpy as np @@ -15,10 +16,17 @@ from openrl.selfplay.wrappers.random_opponent_wrapper import RandomOpponentWrapper +def find_free_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", 0)) + s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + return s.getsockname()[1] + + @pytest.fixture( scope="module", params=[ - {"port": 13486, "strategy": "RandomOpponent"}, + {"port": find_free_port(), "strategy": "RandomOpponent"}, ], ) def config(request): @@ -54,7 +62,7 @@ def train(cfg): "tictactoe_v3", render_mode=render_model, env_num=env_num, - asynchronous=True, + asynchronous=False, opponent_wrappers=[RecordWinner, OpponentPoolWrapper], env_wrappers=[FlattenObservation], cfg=cfg, From 434495499708752f0d32fb519e6fbf9f2ad63110 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 24 Nov 2023 13:02:32 +0800 Subject: [PATCH 53/78] add selfplay test --- .github/workflows/unit_test.yml | 2 +- tests/test_selfplay/test_train_selfplay.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit_test.yml b/.github/workflows/unit_test.yml index e327cdf5..9671f935 100644 --- a/.github/workflows/unit_test.yml +++ b/.github/workflows/unit_test.yml @@ -31,7 +31,7 @@ jobs: - name: do_unittest timeout-minutes: 40 run: | - xvfb-run -s "-screen 0 1400x900x24" python3 -m pytest tests --cov=openrl --cov-report=xml -m unittest --cov-report=term-missing --durations=0 -v --color=yes + xvfb-run -s "-screen 0 1400x900x24" python3 -m pytest tests --cov=openrl --cov-report=xml -m unittest --cov-report=term-missing 
--durations=0 -v --color=yes -s - name: Upload coverage reports to Codecov with GitHub Action uses: codecov/codecov-action@v3 with: diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py index bdeb40c1..34d28fc3 100644 --- a/tests/test_selfplay/test_train_selfplay.py +++ b/tests/test_selfplay/test_train_selfplay.py @@ -27,12 +27,14 @@ def find_free_port(): scope="module", params=[ {"port": find_free_port(), "strategy": "RandomOpponent"}, + {"port": find_free_port(), "strategy": "LastOpponent"}, ], ) def config(request): cfg_parser = create_config_parser() cfg = cfg_parser.parse_args(["--config", "./examples/selfplay/selfplay.yaml"]) cfg.selfplay_api.port = request.param["port"] + print("port:",request.param["port"]) for i, c in enumerate(cfg.callbacks): if c["id"] == "SelfplayCallback": c["args"][ From a702e8d93f9e9f2a6b064ec32b7e92d7efd850b1 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 24 Nov 2023 13:28:51 +0800 Subject: [PATCH 54/78] add selfplay test --- openrl/selfplay/callbacks/selfplay_api.py | 2 +- openrl/selfplay/selfplay_api/selfplay_api.py | 2 +- setup.py | 9 +++++++-- tests/test_selfplay/test_train_selfplay.py | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/openrl/selfplay/callbacks/selfplay_api.py b/openrl/selfplay/callbacks/selfplay_api.py index cdf9d04d..e2214ecb 100644 --- a/openrl/selfplay/callbacks/selfplay_api.py +++ b/openrl/selfplay/callbacks/selfplay_api.py @@ -50,7 +50,7 @@ def _init_callback(self) -> None: ) self.bind = SelfplayAPIServer.bind() - serve.run(self.bind) + serve.run(self.bind, route_prefix="/selfplay") success = False try_time = 10 while not success: diff --git a/openrl/selfplay/selfplay_api/selfplay_api.py b/openrl/selfplay/selfplay_api/selfplay_api.py index 2c346b46..307c4fcc 100644 --- a/openrl/selfplay/selfplay_api/selfplay_api.py +++ b/openrl/selfplay/selfplay_api/selfplay_api.py @@ -33,7 +33,7 @@ from openrl.selfplay.selfplay_api.opponent_model import BattleResult -@serve.deployment(route_prefix="/selfplay") +@serve.deployment() @serve.ingress(app) class SelfplayAPIServer(BaseSelfplayAPIServer): @app.post("/set_sample_strategy") diff --git a/setup.py b/setup.py index 7c8f31a5..043a7267 100644 --- a/setup.py +++ b/setup.py @@ -70,9 +70,14 @@ def get_extra_requires() -> dict: "datasets==2.13", "evaluate", ], - "selfplay": ["ray[default]", "ray[serve]", "pettingzoo[classic]", "trueskill"], + "selfplay": [ + "ray[default]>=2.7", + "ray[serve]", + "pettingzoo[classic]", + "trueskill", + ], "selfplay_test": [ - "ray[default]", + "ray[default]>=2.7", "ray[serve]", "fastapi", "pettingzoo[mpe]", diff --git a/tests/test_selfplay/test_train_selfplay.py b/tests/test_selfplay/test_train_selfplay.py index 34d28fc3..a67ea964 100644 --- a/tests/test_selfplay/test_train_selfplay.py +++ b/tests/test_selfplay/test_train_selfplay.py @@ -34,7 +34,7 @@ def config(request): cfg_parser = create_config_parser() cfg = cfg_parser.parse_args(["--config", "./examples/selfplay/selfplay.yaml"]) cfg.selfplay_api.port = request.param["port"] - print("port:",request.param["port"]) + print("port:", request.param["port"]) for i, c in enumerate(cfg.callbacks): if c["id"] == "SelfplayCallback": c["args"][ From 714a9ec6eae5b988d2a978836a2fea4d92aa37ce Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Fri, 24 Nov 2023 14:27:53 +0800 Subject: [PATCH 55/78] add selfplay test --- setup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/setup.py b/setup.py index 043a7267..28cffd3c 100644 --- a/setup.py +++ 
b/setup.py @@ -73,12 +73,14 @@ def get_extra_requires() -> dict: "selfplay": [ "ray[default]>=2.7", "ray[serve]", + "async_timeout", "pettingzoo[classic]", "trueskill", ], "selfplay_test": [ "ray[default]>=2.7", "ray[serve]", + "async_timeout", "fastapi", "pettingzoo[mpe]", "pettingzoo[butterfly]", From 2a1cf9cac6ed9e514081f5670e17fdb16245e6d5 Mon Sep 17 00:00:00 2001 From: Chen001117 Date: Tue, 28 Nov 2023 11:50:10 +0800 Subject: [PATCH 56/78] update data parallel and model parallel --- examples/nlp/ds_config.json | 4 +- examples/nlp/eval_ds_config.json | 4 +- examples/nlp/nlp_ppo.yaml | 12 ++--- examples/nlp/nlp_ppo_ds.yaml | 13 +++-- examples/nlp/train_ppo.py | 2 +- openrl/envs/nlp/rewards/kl_penalty.py | 27 +++++++--- openrl/modules/networks/policy_network_gpt.py | 54 ++++++++++++++----- openrl/modules/networks/value_network_gpt.py | 44 ++++++++++++--- openrl/modules/utils/valuenorm.py | 18 +++---- openrl/utils/logger.py | 42 +++++++++------ 10 files changed, 151 insertions(+), 69 deletions(-) diff --git a/examples/nlp/ds_config.json b/examples/nlp/ds_config.json index d3b68fe1..544bc405 100644 --- a/examples/nlp/ds_config.json +++ b/examples/nlp/ds_config.json @@ -1,6 +1,6 @@ { - "train_batch_size": 16, - "train_micro_batch_size_per_gpu": 4, + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 16, "steps_per_print": 10, "zero_optimization": { "stage": 2, diff --git a/examples/nlp/eval_ds_config.json b/examples/nlp/eval_ds_config.json index e9429896..58c08252 100644 --- a/examples/nlp/eval_ds_config.json +++ b/examples/nlp/eval_ds_config.json @@ -1,6 +1,6 @@ { - "train_batch_size": 16, - "train_micro_batch_size_per_gpu": 4, + "train_batch_size": 32, + "train_micro_batch_size_per_gpu": 16, "steps_per_print": 10, "zero_optimization": { "stage": 0, diff --git a/examples/nlp/nlp_ppo.yaml b/examples/nlp/nlp_ppo.yaml index b46e6211..1ba77379 100644 --- a/examples/nlp/nlp_ppo.yaml +++ b/examples/nlp/nlp_ppo.yaml @@ -9,23 +9,21 @@ wandb_entity: "openrl-lab" ppo_epoch: 5 episode_length: 128 num_mini_batch: 20 -use_share_model: true hidden_size: 1 - -model_path: rajkumarrrk/gpt2-fine-tuned-on-daily-dialog +model_path: /home/chenwenze/data_server/huggingface/models/facebook/opt-125m env: args: { - 'tokenizer_path': 'gpt2', - 'data_path': 'daily_dialog', + 'tokenizer_path': '/home/chenwenze/data_server/huggingface/models/facebook/opt-125m', + 'data_path': '/home/chenwenze/data_server/huggingface/datasets/daily_dialog', } vec_info_class: id: "NLPVecInfo" reward_class: id: "NLPReward" args: { - "ref_model": "rajkumarrrk/gpt2-fine-tuned-on-daily-dialog", - "intent_model": "rajkumarrrk/roberta-daily-dialog-intent-classifier", + "ref_model": "/home/chenwenze/data_server/huggingface/models/facebook/opt-125m", + "intent_model": "/home/chenwenze/data_server/huggingface/models/rajkumarrrk/roberta-daily-dialog-intent-classifier", } \ No newline at end of file diff --git a/examples/nlp/nlp_ppo_ds.yaml b/examples/nlp/nlp_ppo_ds.yaml index 3a031ae6..c9d4ad60 100644 --- a/examples/nlp/nlp_ppo_ds.yaml +++ b/examples/nlp/nlp_ppo_ds.yaml @@ -7,9 +7,8 @@ use_valuenorm: true use_adv_normalize: true wandb_entity: "openrl-lab" ppo_epoch: 5 -episode_length: 64 +episode_length: 128 num_mini_batch: 20 -# use_share_model: true hidden_size: 1 @@ -18,11 +17,11 @@ use_fp16: false use_offload: false deepspeed_config: ds_config.json -model_path: rajkumarrrk/gpt2-fine-tuned-on-daily-dialog +model_path: /home/chenwenze/data_server/huggingface/models/facebook/opt-125m env: args: { - 'tokenizer_path': 'gpt2', - 
'data_path': 'daily_dialog', + 'tokenizer_path': '/home/chenwenze/data_server/huggingface/models/gpt2', + 'data_path': '/home/chenwenze/data_server/huggingface/datasets/daily_dialog', } vec_info_class: id: "NLPVecInfo" @@ -31,8 +30,8 @@ reward_class: args: { "use_deepspeed": true, "ref_ds_config": "eval_ds_config.json", - "ref_model": "rajkumarrrk/gpt2-fine-tuned-on-daily-dialog", + "ref_model": "/home/chenwenze/data_server/huggingface/models/facebook/opt-125m", "intent_ds_config": "eval_ds_config.json", - "intent_model": "rajkumarrrk/roberta-daily-dialog-intent-classifier", + "intent_model": "/home/chenwenze/data_server/huggingface/models/rajkumarrrk/roberta-daily-dialog-intent-classifier", } \ No newline at end of file diff --git a/examples/nlp/train_ppo.py b/examples/nlp/train_ppo.py index 384d8f9d..18347a6b 100644 --- a/examples/nlp/train_ppo.py +++ b/examples/nlp/train_ppo.py @@ -32,7 +32,7 @@ def train(): net = Net(env, device="cuda", cfg=cfg, model_dict=model_dict) # initialize the trainer - agent = Agent(net, use_wandb=False) + agent = Agent(net, use_wandb=True) # start training agent.train(total_time_steps=100000) diff --git a/openrl/envs/nlp/rewards/kl_penalty.py b/openrl/envs/nlp/rewards/kl_penalty.py index fe9e9594..7f5a6426 100644 --- a/openrl/envs/nlp/rewards/kl_penalty.py +++ b/openrl/envs/nlp/rewards/kl_penalty.py @@ -35,10 +35,16 @@ def __init__( ds_config: str = "default", ): super().__init__() + + self.device = "cuda" + self.use_data_parallel = False + self.use_model_parallel = False self.use_deepspeed = use_deepspeed + assert not (self.use_deepspeed and self.use_data_parallel) + assert not (self.use_deepspeed and self.use_model_parallel) + assert not (self.use_data_parallel and self.use_model_parallel) # reference model - self._apply_model_parallel = apply_model_parallel if ref_model == "builtin_ref": from transformers import GPT2Config, GPT2LMHeadModel @@ -65,10 +71,11 @@ def __init__( self._ref_engine, *_ = deepspeed.initialize(model=self, config=ds_config) elif torch.cuda.is_available(): - if self._apply_model_parallel and self._ref_net.is_parallelizable: + if self.use_model_parallel: self._ref_net.parallelize() - else: # else defaults to data parallel + elif self.use_data_parallel: # else defaults to data parallel self._ref_net = torch.nn.DataParallel(self._ref_net) + self._ref_net = self._ref_net.to(self.device) # alpha adjustment self._alpha = 0.2 @@ -144,7 +151,7 @@ def _prepare_inputs_for_model( input_ids, **model_kwargs ) - if self._apply_model_parallel and unwrap_model(model).is_parallelizable: + if self.use_model_parallel: # if model is in parallel mode, move the tensors to the first device model_inputs = { key: ( @@ -155,8 +162,16 @@ def _prepare_inputs_for_model( ) for key, value in model_inputs.items() } - - if self.use_deepspeed: + elif self.use_data_parallel: + model_inputs = { + key: ( + value.to(self.device) + if isinstance(value, torch.Tensor) + else value + ) + for key, value in model_inputs.items() + } + elif self.use_deepspeed: model_inputs = { key: value.to("cuda") if isinstance(value, torch.Tensor) else value for key, value in model_inputs.items() diff --git a/openrl/modules/networks/policy_network_gpt.py b/openrl/modules/networks/policy_network_gpt.py index cad3157e..5c97feef 100644 --- a/openrl/modules/networks/policy_network_gpt.py +++ b/openrl/modules/networks/policy_network_gpt.py @@ -47,15 +47,21 @@ def __init__( extra_args=None, ) -> None: + self.device = device self.use_half = use_half - self.tpdv = dict(dtype=torch.float32, 
device=device) + + self.use_data_parallel = False + self.use_model_parallel = False + self.use_deepspeed = cfg.use_deepspeed + + assert not (self.use_deepspeed and self.use_data_parallel) + assert not (self.use_deepspeed and self.use_model_parallel) + assert not (self.use_data_parallel and self.use_model_parallel) super(PolicyNetworkGPT, self).__init__(cfg, device) self.disable_drop_out = disable_drop_out - - self._action_dist = CategoricalDistribution(action_space.n) from transformers import AutoConfig, AutoModelForCausalLM @@ -70,6 +76,14 @@ def __init__( ) self._policy_model.config.use_cache = False + if torch.cuda.is_available(): + if self.use_model_parallel: + self._policy_model.parallelize() + elif self.use_data_parallel: + self._policy_model = torch.nn.DataParallel(self._policy_model) + self._policy_model = self._policy_model.to(self.device) + + def forward(self, forward_type, *args, **kwargs): if forward_type == "original": return self.forward_original(*args, **kwargs) @@ -87,6 +101,18 @@ def _prepare_inputs_for_model( model_inputs = unwrap_model(model).prepare_inputs_for_generation( input_ids, **model_kwargs ) + + if self.use_model_parallel: + model_inputs = { + key: ( + value.to(model.transformer.first_device) + if isinstance(value, torch.Tensor) + and hasattr(model.transformer, "first_device") + else value + ) + for key, value in model_inputs.items() + } + return model_inputs def forward_original( @@ -94,10 +120,11 @@ def forward_original( ): for key in raw_obs.keys(): raw_obs[key] = torch.from_numpy(raw_obs[key]) if type(raw_obs[key]) == np.ndarray else raw_obs[key] - raw_obs[key] = raw_obs[key].to(self._policy_model.device) - # raw_obs[key] = check(raw_obs[key], self.use_half, self.tpdv) - # if self._use_fp16: - # raw_obs[key] = raw_obs[key].half() + if self.use_data_parallel: + raw_obs[key] = raw_obs[key].to(self.device) + else: + raw_obs[key] = raw_obs[key].to(self._policy_model.device) + rnn_states = check(rnn_states) input_ids = raw_obs["input_encoded_pt"].int() @@ -131,11 +158,14 @@ def eval_actions( ): for key in obs.keys(): obs[key] = torch.from_numpy(obs[key]) if type(obs[key]) == np.ndarray else obs[key] - obs[key] = obs[key].to(self._policy_model.device) - # obs[key] = check(obs[key], self.use_half, self.tpdv) - # if self._use_fp16: - # obs[key] = obs[key].half() - action = check(action).to(self._policy_model.device).squeeze() + if self.use_data_parallel: + obs[key] = obs[key].to(self.device) + else: + obs[key] = obs[key].to(self._policy_model.device) + if self.use_data_parallel: + action = check(action).to(self.device).squeeze() + else: + action = check(action).to(self._policy_model.device).squeeze() rnn_states = check(rnn_states) input_ids = obs["input_encoded_pt"].int() diff --git a/openrl/modules/networks/value_network_gpt.py b/openrl/modules/networks/value_network_gpt.py index 13db87b8..4815cff7 100644 --- a/openrl/modules/networks/value_network_gpt.py +++ b/openrl/modules/networks/value_network_gpt.py @@ -45,8 +45,15 @@ def __init__( extra_args=None, ): + self.device = device self.use_half = use_half - self.tpdv = dict(dtype=torch.float32, device=device) + + self.use_data_parallel = False + self.use_model_parallel = False + self.use_deepspeed = cfg.use_deepspeed + assert not (self.use_deepspeed and self.use_data_parallel) + assert not (self.use_deepspeed and self.use_model_parallel) + assert not (self.use_data_parallel and self.use_model_parallel) super(ValueNetworkGPT, self).__init__(cfg, device) @@ -63,6 +70,15 @@ def __init__( 
self._value_head.to(self.device) + if torch.cuda.is_available(): + if self.use_model_parallel: + self._value_model.parallelize() + elif self.use_data_parallel: + self._value_model = torch.nn.DataParallel(self._value_model) + self._value_model = self._value_model.to(self.device) + self._value_head = torch.nn.DataParallel(self._value_head) + self._value_head = self._value_head.to(self.device) + def _prepare_inputs_for_model( self, @@ -73,16 +89,28 @@ def _prepare_inputs_for_model( model_inputs = unwrap_model(model).prepare_inputs_for_generation( input_ids, **model_kwargs ) + + if self.use_model_parallel: + model_inputs = { + key: ( + value.to(model.transformer.first_device) + if isinstance(value, torch.Tensor) + and hasattr(model.transformer, "first_device") + else value + ) + for key, value in model_inputs.items() + } + return model_inputs def forward(self, critic_obs, rnn_states, masks): for key in critic_obs.keys(): critic_obs[key] = torch.from_numpy(critic_obs[key]) if type(critic_obs[key]) == np.ndarray else critic_obs[key] - critic_obs[key] = critic_obs[key].to(self._value_model.device) - # critic_obs[key] = check(critic_obs[key], self.use_half, self.tpdv) - # if self._use_fp16: - # critic_obs[key] = critic_obs[key].half() - masks = check(masks).to(self._value_model.device) + if self.use_data_parallel: + critic_obs[key] = critic_obs[key].to(self.device) + else: + critic_obs[key] = critic_obs[key].to(self._value_model.device) + rnn_states = check(rnn_states) input_ids = critic_obs["input_encoded_pt"].int() @@ -99,6 +127,10 @@ def forward(self, critic_obs, rnn_states, masks): ) output = self._value_model(output_hidden_states=True, **model_inputs) last_tokens_hidden = output.hidden_states[-1][:, -1] + + if self.use_model_parallel: + last_tokens_hidden = last_tokens_hidden.to(self.device) + values = self._value_head.forward(last_tokens_hidden) return values, rnn_states diff --git a/openrl/modules/utils/valuenorm.py b/openrl/modules/utils/valuenorm.py index bed1d705..0367084a 100644 --- a/openrl/modules/utils/valuenorm.py +++ b/openrl/modules/utils/valuenorm.py @@ -24,15 +24,15 @@ def __init__( self.per_element_update = per_element_update self.tpdv = dict(dtype=torch.float32, device=device) - # self.running_mean = nn.Parameter(torch.zeros(input_shape), requires_grad=False).to(**self.tpdv) - # self.running_mean_sq = nn.Parameter(torch.zeros(input_shape), requires_grad=False).to(**self.tpdv) - # self.debiasing_term = nn.Parameter(torch.tensor(0.0), requires_grad=False).to(**self.tpdv) - - self.running_mean = nn.Parameter(torch.zeros(input_shape), requires_grad=False) - self.running_mean_sq = nn.Parameter( - torch.zeros(input_shape), requires_grad=False - ) - self.debiasing_term = nn.Parameter(torch.tensor(0.0), requires_grad=False) + self.running_mean = nn.Parameter(torch.zeros(input_shape), requires_grad=False).to(**self.tpdv) + self.running_mean_sq = nn.Parameter(torch.zeros(input_shape), requires_grad=False).to(**self.tpdv) + self.debiasing_term = nn.Parameter(torch.tensor(0.0), requires_grad=False).to(**self.tpdv) + + # self.running_mean = nn.Parameter(torch.zeros(input_shape), requires_grad=False) + # self.running_mean_sq = nn.Parameter( + # torch.zeros(input_shape), requires_grad=False + # ) + # self.debiasing_term = nn.Parameter(torch.tensor(0.0), requires_grad=False) self.reset_parameters() diff --git a/openrl/utils/logger.py b/openrl/utils/logger.py index 3fe61b53..d9c49f34 100644 --- a/openrl/utils/logger.py +++ b/openrl/utils/logger.py @@ -46,6 +46,10 @@ def __init__( 
self.use_wandb = use_wandb self.use_tensorboard = use_tensorboard + self.skip_logging = False + if cfg.use_deepspeed and cfg.local_rank != 0: + self.skip_logging = True + self.log_level = log_level self.log_path = log_path self.project_name = project_name @@ -126,20 +130,21 @@ def _init(self) -> None: ) if self.use_wandb: - wandb.init( - config=self.cfg, - project=self.project_name, - entity=self.wandb_entity, - notes=socket.gethostname(), - name=self.scenario_name - + "_" - + str(self.exp_name) - + "_seed" - + str(self.cfg.seed), - dir=str(run_dir), - job_type="training", - reinit=True, - ) + if not self.skip_logging: + wandb.init( + config=self.cfg, + project=self.project_name, + entity=self.wandb_entity, + notes=socket.gethostname(), + name=self.scenario_name + + "_" + + str(self.exp_name) + + "_seed" + + str(self.cfg.seed), + dir=str(run_dir), + job_type="training", + reinit=True, + ) elif self.use_tensorboard: from tensorboardX import SummaryWriter @@ -152,7 +157,8 @@ def _init(self) -> None: def close(self): if self.use_wandb: - wandb.finish() + if not self.skip_logging: + wandb.finish() def info(self, msg: str): logging.info(msg) @@ -167,7 +173,8 @@ def log_learner_info( return for k, v in infos.items(): if self.use_wandb: - wandb.log({"Learner_{}/{}".format(leaner_id, k): v}, step=step) + if not self.skip_logging: + wandb.log({"Learner_{}/{}".format(leaner_id, k): v}, step=step) elif self.use_tensorboard: self.writter.add_scalars( "Learner_{}/{}".format(leaner_id, k), @@ -192,7 +199,8 @@ def log_info( logging_info_str += f"\t{k}: {v}\n" if self.use_wandb: - wandb.log({k: v}, step=step) + if not self.skip_logging: + wandb.log({k: v}, step=step) elif self.use_tensorboard: self.writter.add_scalars(k, {k: v}, step) if self.log_to_terminal: From 0c2ea2b3cabc88dd243cb55e6e10df8d29f77044 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 28 Nov 2023 15:21:06 +0800 Subject: [PATCH 57/78] - add net, gail test - fix bugs: remove cfg when make gymnasium env --- openrl/envs/common/build_envs.py | 3 +- openrl/modules/vdn_module.py | 2 + tests/test_examples/test_train_gail.py | 75 +++++++++++++++++++ .../test_modules/test_common/test_ddpg_net.py | 61 +++++++++++++++ .../test_modules/test_common/test_dqn_net.py | 61 +++++++++++++++ .../test_modules/test_common/test_sac_net.py | 61 +++++++++++++++ .../test_modules/test_common/test_vdn_net.py | 57 ++++++++++++++ 7 files changed, 319 insertions(+), 1 deletion(-) create mode 100644 tests/test_examples/test_train_gail.py create mode 100644 tests/test_modules/test_common/test_ddpg_net.py create mode 100644 tests/test_modules/test_common/test_dqn_net.py create mode 100644 tests/test_modules/test_common/test_sac_net.py create mode 100644 tests/test_modules/test_common/test_vdn_net.py diff --git a/openrl/envs/common/build_envs.py b/openrl/envs/common/build_envs.py index 0893400a..76f4b35b 100644 --- a/openrl/envs/common/build_envs.py +++ b/openrl/envs/common/build_envs.py @@ -2,6 +2,7 @@ import inspect from typing import Callable, Iterable, List, Optional, Union +import gymnasium as gym from gymnasium import Env from openrl.envs.wrappers.base_wrapper import BaseWrapper @@ -33,7 +34,7 @@ def _make_env() -> Env: if need_env_id: new_kwargs["env_id"] = env_id new_kwargs["env_num"] = env_num - if id.startswith("ALE/"): + if id.startswith("ALE/") or id in gym.envs.registry.keys(): new_kwargs.pop("cfg", None) env = make( diff --git a/openrl/modules/vdn_module.py b/openrl/modules/vdn_module.py index 32987372..10a9b541 100644 --- a/openrl/modules/vdn_module.py 
+++ b/openrl/modules/vdn_module.py @@ -68,6 +68,8 @@ def __init__( device=device, ) self.cfg = cfg + self.obs_space = input_space + self.act_space = act_space def lr_decay(self, episode, episodes): update_linear_schedule(self.optimizers["q_net"], episode, episodes, self.lr) diff --git a/tests/test_examples/test_train_gail.py b/tests/test_examples/test_train_gail.py new file mode 100644 index 00000000..656ff2d0 --- /dev/null +++ b/tests/test_examples/test_train_gail.py @@ -0,0 +1,75 @@ +"""""" + +import os +import sys + +import pytest + +from openrl.configs.config import create_config_parser +from openrl.envs.common import make +from openrl.envs.vec_env.wrappers.gen_data import GenDataWrapper +from openrl.envs.wrappers.extra_wrappers import ZeroRewardWrapper +from openrl.envs.wrappers.monitor import Monitor +from openrl.modules.common import GAILNet as Net +from openrl.modules.common import PPONet +from openrl.runners.common import GAILAgent as Agent +from openrl.runners.common import PPOAgent + + +@pytest.fixture(scope="function") +def gen_data(tmpdir): + tmp_data_path = os.path.join(tmpdir, "data.pkl") + env_wrappers = [ + Monitor, + ] + print("generate data....") + env = make( + "CartPole-v1", + env_num=2, + asynchronous=True, + env_wrappers=env_wrappers, + ) + agent = PPOAgent(PPONet(env)) + env = GenDataWrapper(env, data_save_path=tmp_data_path, total_episode=5) + obs, info = env.reset() + done = False + while not done: + # Based on environmental observation input, predict next action. + action, _ = agent.act(obs, deterministic=True) + obs, r, done, info = env.step(action) + env.close() + print("generate data done!") + return tmp_data_path + + +@pytest.fixture( + scope="function", params=[" --gail_use_action false", " --gail_use_action true"] +) +def config(request, gen_data): + input_str = ( + "--episode_length 5 --use_recurrent_policy true --use_joint_action_loss true" + " --use_valuenorm true --use_adv_normalize true --reward_class.id GAILReward" + ) + input_str += request.param + input_str += " --expert_data " + gen_data + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(input_str.split()) + return cfg + + +@pytest.mark.unittest +def test_train_gail(config): + env = make("CartPole-v1", env_num=2, cfg=config, env_wrappers=[ZeroRewardWrapper]) + + net = Net( + env, + cfg=config, + ) + # initialize the trainer + agent = Agent(net) + agent.train(total_time_steps=200) + env.close() + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_modules/test_common/test_ddpg_net.py b/tests/test_modules/test_common/test_ddpg_net.py new file mode 100644 index 00000000..a4c03354 --- /dev/null +++ b/tests/test_modules/test_common/test_ddpg_net.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" + +import os +import sys + +import pytest + +from openrl.configs.config import create_config_parser +from openrl.envs.common import make +from openrl.envs.wrappers.extra_wrappers import AddStep +from openrl.modules.common import DDPGNet as Net +from openrl.runners.common import DDPGAgent as Agent + +env_wrappers = [AddStep] + + +@pytest.fixture(scope="module", params=[""]) +def config(request): + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +def train(Agent, Net, env_name, env_num, total_time_steps, config): + cfg = config + env = make(env_name, env_num=env_num, cfg=cfg, env_wrappers=env_wrappers) + + net = Net( + env, + cfg=cfg, + ) + # initialize the trainer + agent = Agent(net) + # start training, set total number of training steps to 20000 + agent.train(total_time_steps=total_time_steps) + env.close() + + +@pytest.mark.unittest +def test_ddpg_net(config): + train(Agent, Net, "IdentityEnvcontinuous", 2, 100, config) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_modules/test_common/test_dqn_net.py b/tests/test_modules/test_common/test_dqn_net.py new file mode 100644 index 00000000..292c08b4 --- /dev/null +++ b/tests/test_modules/test_common/test_dqn_net.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" + +import os +import sys + +import pytest + +from openrl.configs.config import create_config_parser +from openrl.envs.common import make +from openrl.envs.wrappers.extra_wrappers import AddStep +from openrl.modules.common import DQNNet as Net +from openrl.runners.common import DQNAgent as Agent + +env_wrappers = [AddStep] + + +@pytest.fixture(scope="module", params=[""]) +def config(request): + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +def train(Agent, Net, env_name, env_num, total_time_steps, config): + cfg = config + env = make(env_name, env_num=env_num, cfg=cfg, env_wrappers=env_wrappers) + + net = Net( + env, + cfg=cfg, + ) + # initialize the trainer + agent = Agent(net) + # start training, set total number of training steps to 20000 + agent.train(total_time_steps=total_time_steps) + env.close() + + +@pytest.mark.unittest +def test_dqn_net(config): + train(Agent, Net, "IdentityEnv", 2, 100, config) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_modules/test_common/test_sac_net.py b/tests/test_modules/test_common/test_sac_net.py new file mode 100644 index 00000000..8839986e --- /dev/null +++ b/tests/test_modules/test_common/test_sac_net.py @@ -0,0 +1,61 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" + +import os +import sys + +import pytest + +from openrl.configs.config import create_config_parser +from openrl.envs.common import make +from openrl.envs.wrappers.extra_wrappers import AddStep +from openrl.modules.common import SACNet as Net +from openrl.runners.common import SACAgent as Agent + +env_wrappers = [AddStep] + + +@pytest.fixture(scope="module", params=[""]) +def config(request): + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +def train(Agent, Net, env_name, env_num, total_time_steps, config): + cfg = config + env = make(env_name, env_num=env_num, cfg=cfg, env_wrappers=env_wrappers) + + net = Net( + env, + cfg=cfg, + ) + # initialize the trainer + agent = Agent(net) + # start training, set total number of training steps to 20000 + agent.train(total_time_steps=total_time_steps) + env.close() + + +@pytest.mark.unittest +def test_sac_net(config): + train(Agent, Net, "IdentityEnvcontinuous", 2, 100, config) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_modules/test_common/test_vdn_net.py b/tests/test_modules/test_common/test_vdn_net.py new file mode 100644 index 00000000..29f1f58f --- /dev/null +++ b/tests/test_modules/test_common/test_vdn_net.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" + +import os +import sys + +import pytest + +from openrl.configs.config import create_config_parser +from openrl.envs.common import make +from openrl.envs.wrappers.mat_wrapper import MATWrapper +from openrl.modules.common import VDNNet +from openrl.runners.common import VDNAgent as Agent + + +@pytest.fixture(scope="module", params=[""]) +def config(request): + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +@pytest.mark.unittest +def test_vdn_net(config): + env_num = 2 + env = make( + "simple_spread", + env_num=env_num, + asynchronous=True, + ) + env = MATWrapper(env) + + net = VDNNet(env, cfg=config) + # initialize the trainer + agent = Agent(net) + # start training + agent.train(total_time_steps=100) + env.close() + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 467989469a076184d30b26fd49250eed91f5d9fe Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 5 Dec 2023 19:46:49 +0800 Subject: [PATCH 58/78] - fix bugs: AttributeError: module 'openrl.envs' has no attribute 'PettingZoo' --- openrl/envs/common/registration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/openrl/envs/common/registration.py b/openrl/envs/common/registration.py index ec9756b3..576eb1cd 100644 --- a/openrl/envs/common/registration.py +++ b/openrl/envs/common/registration.py @@ -29,7 +29,7 @@ ) from openrl.envs.vec_env.vec_info import VecInfoFactory from openrl.rewards import RewardFactory - +from openrl.envs.PettingZoo.registration import pettingzoo_env_dict def make( id: str, @@ -150,7 +150,7 @@ def make( ) elif ( id in openrl.envs.pettingzoo_all_envs - or id in openrl.envs.PettingZoo.registration.pettingzoo_env_dict.keys() + or id in pettingzoo_env_dict.keys() ): from openrl.envs.PettingZoo import make_PettingZoo_envs From 9a05e6fe760fcefecd5205ef98b07aca5daf5a31 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 5 Dec 2023 19:48:06 +0800 Subject: [PATCH 59/78] - fix bugs: AttributeError: module 'openrl.envs' has no attribute 'PettingZoo' --- openrl/envs/common/registration.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/openrl/envs/common/registration.py b/openrl/envs/common/registration.py index 576eb1cd..5d1ed645 100644 --- a/openrl/envs/common/registration.py +++ b/openrl/envs/common/registration.py @@ -20,6 +20,7 @@ import gymnasium as gym import openrl +from openrl.envs.PettingZoo.registration import pettingzoo_env_dict from openrl.envs.vec_env import ( AsyncVectorEnv, BaseVecEnv, @@ -29,7 +30,7 @@ ) from openrl.envs.vec_env.vec_info import VecInfoFactory from openrl.rewards import RewardFactory -from openrl.envs.PettingZoo.registration import pettingzoo_env_dict + def make( id: str, @@ -148,10 +149,7 @@ def make( render_mode=convert_render_mode, **kwargs, ) - elif ( - id in openrl.envs.pettingzoo_all_envs - or id in pettingzoo_env_dict.keys() - ): + elif id in openrl.envs.pettingzoo_all_envs or id in pettingzoo_env_dict.keys(): from openrl.envs.PettingZoo import make_PettingZoo_envs env_fns = make_PettingZoo_envs( From dac28047bb94d344e54632649d7ab11d2d296e29 Mon Sep 17 00:00:00 2001 From: Geo Jolly Date: Thu, 7 Dec 2023 14:25:09 +0530 Subject: [PATCH 60/78] Add envpool to openrl --- examples/envpool/test_model.py | 78 ++++++++++ examples/envpool/train_ppo.py | 86 +++++++++++ openrl/envs/common/build_envs.py | 24 ++- openrl/envs/common/registration.py | 14 +- openrl/envs/envpool/__init__.py | 47 ++++++ 
openrl/envs/wrappers/envpool_wrappers.py | 182 +++++++++++++++++++++++ setup.py | 2 + 7 files changed, 425 insertions(+), 8 deletions(-) create mode 100644 examples/envpool/test_model.py create mode 100644 examples/envpool/train_ppo.py create mode 100644 openrl/envs/envpool/__init__.py create mode 100644 openrl/envs/wrappers/envpool_wrappers.py diff --git a/examples/envpool/test_model.py b/examples/envpool/test_model.py new file mode 100644 index 00000000..c0b4ddfb --- /dev/null +++ b/examples/envpool/test_model.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" + +# Use OpenRL to load stable-baselines's model for testing + +import numpy as np +import torch + +from openrl.configs.config import create_config_parser +from openrl.envs.common import make +from openrl.modules.common.ppo_net import PPONet as Net +from openrl.modules.networks.policy_value_network_sb3 import ( + PolicyValueNetworkSB3 as PolicyValueNetwork, +) +from openrl.runners.common import PPOAgent as Agent + + +def evaluation(local_trained_file_path=None): + # begin to test + + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(["--config", "ppo.yaml"]) + + # Create an environment for testing and set the number of environments to interact with to 9. Set rendering mode to group_human. + render_mode = "group_human" + render_mode = None + env = make("CartPole-v1", render_mode=render_mode, env_num=9, asynchronous=True) + model_dict = {"model": PolicyValueNetwork} + net = Net( + env, + cfg=cfg, + model_dict=model_dict, + device="cuda" if torch.cuda.is_available() else "cpu", + ) + # initialize the trainer + agent = Agent( + net, + ) + if local_trained_file_path is not None: + agent.load(local_trained_file_path) + # The trained agent sets up the interactive environment it needs. + agent.set_env(env) + # Initialize the environment and get initial observations and environmental information. + obs, info = env.reset() + done = False + + total_step = 0 + total_reward = 0.0 + while not np.any(done): + # Based on environmental observation input, predict next action. + action, _ = agent.act(obs, deterministic=True) + obs, r, done, info = env.step(action) + total_step += 1 + total_reward += np.mean(r) + if total_step % 50 == 0: + print(f"{total_step}: reward:{np.mean(r)}") + env.close() + print("total step:", total_step) + print("total reward:", total_reward) + + +if __name__ == "__main__": + evaluation() diff --git a/examples/envpool/train_ppo.py b/examples/envpool/train_ppo.py new file mode 100644 index 00000000..4120ee4a --- /dev/null +++ b/examples/envpool/train_ppo.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" +import numpy as np +from test_model import evaluation + +from openrl.configs.config import create_config_parser +from openrl.envs.common import make +from openrl.envs.wrappers.envpool_wrappers import VecAdapter, VecMonitor +from openrl.modules.common import PPONet as Net +from openrl.modules.common.ppo_net import PPONet as Net +from openrl.runners.common import PPOAgent as Agent + + +def train(): + # create the neural network + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args() + + # create environment, set environment parallelism to 9 + env = make( + "envpool:Adventure-v5", + render_mode=None, + env_num=9, + asynchronous=False, + env_wrappers=[VecAdapter, VecMonitor], + env_type="gym", + ) + + net = Net( + env, + cfg=cfg, + ) + # initialize the trainer + agent = Agent(net, use_wandb=False, project_name="envpool:Adventure-v5") + # start training, set total number of training steps to 20000 + agent.train(total_time_steps=20000) + + env.close() + return agent + + +def evaluation(agent): + # begin to test + # Create an environment for testing and set the number of environments to interact with to 9. Set rendering mode to group_human. + render_mode = "group_human" + render_mode = None + env = make("CartPole-v1", render_mode=render_mode, env_num=9, asynchronous=True) + # The trained agent sets up the interactive environment it needs. + agent.set_env(env) + # Initialize the environment and get initial observations and environmental information. + obs, info = env.reset() + done = False + step = 0 + total_step, total_reward = 0, 0 + while not np.any(done): + # Based on environmental observation input, predict next action. 
+ action, _ = agent.act(obs, deterministic=True) + obs, r, done, info = env.step(action) + step += 1 + total_step += 1 + total_reward += np.mean(r) + if step % 50 == 0: + print(f"{step}: reward:{np.mean(r)}") + env.close() + print("total step:", total_step) + print("total reward:", total_reward) + + +if __name__ == "__main__": + agent = train() + evaluation(agent) diff --git a/openrl/envs/common/build_envs.py b/openrl/envs/common/build_envs.py index 76f4b35b..386c4adc 100644 --- a/openrl/envs/common/build_envs.py +++ b/openrl/envs/common/build_envs.py @@ -6,6 +6,7 @@ from gymnasium import Env from openrl.envs.wrappers.base_wrapper import BaseWrapper +from openrl.envs.wrappers.envpool_wrappers import VecEnvWrapper, VecMonitor def build_envs( @@ -36,13 +37,22 @@ def _make_env() -> Env: new_kwargs["env_num"] = env_num if id.startswith("ALE/") or id in gym.envs.registry.keys(): new_kwargs.pop("cfg", None) - - env = make( - id, - render_mode=env_render_mode, - disable_env_checker=_disable_env_checker, - **new_kwargs, - ) + if "envpool" in new_kwargs: + # for now envpool doesnt support any render mode + # envpool also doesnt stores the id anywhere + new_kwargs.pop("envpool") + env = make( + id, + **new_kwargs, + ) + env.unwrapped.spec.id = id + else: + env = make( + id, + render_mode=env_render_mode, + disable_env_checker=_disable_env_checker, + **new_kwargs, + ) if wrappers is not None: if callable(wrappers): diff --git a/openrl/envs/common/registration.py b/openrl/envs/common/registration.py index 5d1ed645..053dd104 100644 --- a/openrl/envs/common/registration.py +++ b/openrl/envs/common/registration.py @@ -17,6 +17,7 @@ """""" from typing import Callable, Optional +import envpool import gymnasium as gym import openrl @@ -72,7 +73,6 @@ def make( env_fns = make_single_agent_drone_envs( id=id, env_num=env_num, render_mode=convert_render_mode, **kwargs ) - elif id.startswith("snakes_"): from openrl.envs.snake import make_snake_envs @@ -155,6 +155,18 @@ def make( env_fns = make_PettingZoo_envs( id=id, env_num=env_num, render_mode=convert_render_mode, **kwargs ) + elif ( + "envpool:" in id + and id.split(":")[-1] in envpool.registration.list_all_envs() + ): + from openrl.envs.envpool import make_envpool_envs + + env_fns = make_envpool_envs( + id=id.split(":")[-1], + env_num=env_num, + render_mode=convert_render_mode, + **kwargs, + ) else: raise NotImplementedError(f"env {id} is not supported.") diff --git a/openrl/envs/envpool/__init__.py b/openrl/envs/envpool/__init__.py new file mode 100644 index 00000000..48fbd1f5 --- /dev/null +++ b/openrl/envs/envpool/__init__.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+""""""
+from typing import List, Optional, Union
+
+import envpool
+
+from openrl.envs.common import build_envs
+
+
+def make_envpool_envs(
+    id: str,
+    env_num: int = 1,
+    render_mode: Optional[Union[str, List[str]]] = None,
+    **kwargs,
+):
+    assert "env_type" in kwargs
+    assert kwargs.get("env_type") in ["gym", "dm", "gymnasium"]
+    # Since render_mode is not supported, we set envpool to True
+    # so that we can remove render_mode keyword argument from build_envs
+    assert render_mode is None, "envpool does not support render_mode yet"
+    kwargs["envpool"] = True
+
+    env_wrappers = kwargs.pop("env_wrappers")
+    env_fns = build_envs(
+        make=envpool.make,
+        id=id,
+        env_num=env_num,
+        render_mode=render_mode,
+        wrappers=env_wrappers,
+        **kwargs,
+    )
+    return env_fns
diff --git a/openrl/envs/wrappers/envpool_wrappers.py b/openrl/envs/wrappers/envpool_wrappers.py
new file mode 100644
index 00000000..d0da090a
--- /dev/null
+++ b/openrl/envs/wrappers/envpool_wrappers.py
@@ -0,0 +1,182 @@
+import time
+import warnings
+from typing import Optional
+
+import gym
+import gymnasium
+import numpy as np
+from envpool.python.protocol import EnvPool
+from packaging import version
+from stable_baselines3.common.vec_env import VecEnvWrapper as BaseWrapper
+from stable_baselines3.common.vec_env import VecMonitor
+from stable_baselines3.common.vec_env.base_vec_env import (VecEnvObs,
+                                                           VecEnvStepReturn)
+
+is_legacy_gym = version.parse(gym.__version__) < version.parse("0.26.0")
+
+
+class VecEnvWrapper(BaseWrapper):
+    @property
+    def agent_num(self):
+        if self.is_original_envpool_env():
+            return 1
+        else:
+            return self.env.agent_num
+
+    def is_original_envpool_env(self):
+        return not hasattr(self.venv, "agent_num")
+
+
+class VecAdapter(VecEnvWrapper):
+    """
+    Convert EnvPool object to a Stable-Baselines3 (SB3) VecEnv.
+
+    :param venv: The envpool object.
+ """ + + def __init__(self, venv: EnvPool): + venv.num_envs = venv.spec.config.num_envs + observation_space = venv.observation_space + new_observation_space = gymnasium.spaces.Box( + low=observation_space.low, + high=observation_space.high, + dtype=observation_space.dtype, + ) + action_space = venv.action_space + if isinstance(action_space, gym.spaces.Discrete): + new_action_space = gymnasium.spaces.Discrete(action_space.n) + elif isinstance(action_space, gym.spaces.MultiDiscrete): + new_action_space = gymnasium.spaces.MultiDiscrete(action_space.nvec) + elif isinstance(action_space, gym.spaces.MultiBinary): + new_action_space = gymnasium.spaces.MultiBinary(action_space.n) + elif isinstance(action_space, gym.spaces.Box): + new_action_space = gymnasium.spaces.Box( + low=action_space.low, + high=action_space.high, + dtype=action_space.dtype, + ) + else: + raise NotImplementedError(f"Action space {action_space} is not supported") + super().__init__( + venv=venv, + observation_space=new_observation_space, + action_space=new_action_space, + ) + + def step_async(self, actions: np.ndarray) -> None: + self.actions = actions + + def reset(self) -> VecEnvObs: + if is_legacy_gym: + return self.venv.reset(), {} + else: + return self.venv.reset() + + def step_wait(self) -> VecEnvStepReturn: + if is_legacy_gym: + obs, rewards, dones, info_dict = self.venv.step(self.actions) + else: + obs, rewards, terms, truncs, info_dict = self.venv.step(self.actions) + dones = terms + truncs + rewards = rewards + infos = [] + for i in range(self.num_envs): + infos.append( + { + key: info_dict[key][i] + for key in info_dict.keys() + if isinstance(info_dict[key], np.ndarray) + } + ) + if dones[i]: + infos[i]["terminal_observation"] = obs[i] + if is_legacy_gym: + obs[i] = self.venv.reset(np.array([i])) + else: + obs[i] = self.venv.reset(np.array([i]))[0] + return obs, rewards, dones, infos + + +class VecMonitor(VecEnvWrapper): + def __init__( + self, + venv, + filename: Optional[str] = None, + info_keywords=(), + ): + # Avoid circular import + from stable_baselines3.common.monitor import Monitor, ResultsWriter + + try: + is_wrapped_with_monitor = venv.env_is_wrapped(Monitor)[0] + except AttributeError: + is_wrapped_with_monitor = False + + if is_wrapped_with_monitor: + warnings.warn( + "The environment is already wrapped with a `Monitor` wrapper" + "but you are wrapping it with a `VecMonitor` wrapper, the `Monitor` statistics will be" + "overwritten by the `VecMonitor` ones.", + UserWarning, + ) + + VecEnvWrapper.__init__(self, venv) + self.episode_count = 0 + self.t_start = time.time() + + env_id = None + if hasattr(venv, "spec") and venv.spec is not None: + env_id = venv.spec.id + + self.results_writer: Optional[ResultsWriter] = None + if filename: + self.results_writer = ResultsWriter( + filename, + header={"t_start": self.t_start, "env_id": str(env_id)}, + extra_keys=info_keywords, + ) + + self.info_keywords = info_keywords + self.episode_returns = np.zeros(self.num_envs, dtype=np.float32) + self.episode_lengths = np.zeros(self.num_envs, dtype=np.int32) + + def reset(self, **kwargs) -> VecEnvObs: + obs, info = self.venv.reset() + self.episode_returns = np.zeros(self.num_envs, dtype=np.float32) + self.episode_lengths = np.zeros(self.num_envs, dtype=np.int32) + return obs, info + + def step_wait(self) -> VecEnvStepReturn: + obs, rewards, dones, infos = self.venv.step_wait() + self.episode_returns += rewards + self.episode_lengths += 1 + new_infos = list(infos[:]) + for i in range(len(dones)): + if dones[i]: + info = 
infos[i].copy() + episode_return = self.episode_returns[i] + episode_length = self.episode_lengths[i] + episode_info = { + "r": episode_return, + "l": episode_length, + "t": round(time.time() - self.t_start, 6), + } + for key in self.info_keywords: + episode_info[key] = info[key] + info["episode"] = episode_info + self.episode_count += 1 + self.episode_returns[i] = 0 + self.episode_lengths[i] = 0 + if self.results_writer: + self.results_writer.write_row(episode_info) + new_infos[i] = info + rewards = np.expand_dims(rewards, 1) + return obs, rewards, dones, new_infos + + def close(self) -> None: + if self.results_writer: + self.results_writer.close() + return self.venv.close() + + +__all__ = ["VecAdapter", "VecMonitor"] diff --git a/setup.py b/setup.py index 28cffd3c..faffbe84 100644 --- a/setup.py +++ b/setup.py @@ -76,6 +76,7 @@ def get_extra_requires() -> dict: "async_timeout", "pettingzoo[classic]", "trueskill", + "envpool", ], "selfplay_test": [ "ray[default]>=2.7", @@ -84,6 +85,7 @@ def get_extra_requires() -> dict: "fastapi", "pettingzoo[mpe]", "pettingzoo[butterfly]", + "envpool", ], "retro": ["gym-retro"], "super_mario": ["gym-super-mario-bros"], From 3c31b5a48a35830af2d1cfd633d04bdf05fae06a Mon Sep 17 00:00:00 2001 From: Geo Jolly Date: Thu, 7 Dec 2023 15:49:19 +0530 Subject: [PATCH 61/78] Remove unwanted test for envpool --- examples/envpool/test_model.py | 78 ---------------------------------- examples/envpool/train_ppo.py | 1 - 2 files changed, 79 deletions(-) delete mode 100644 examples/envpool/test_model.py diff --git a/examples/envpool/test_model.py b/examples/envpool/test_model.py deleted file mode 100644 index c0b4ddfb..00000000 --- a/examples/envpool/test_model.py +++ /dev/null @@ -1,78 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright 2023 The OpenRL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""""" - -# Use OpenRL to load stable-baselines's model for testing - -import numpy as np -import torch - -from openrl.configs.config import create_config_parser -from openrl.envs.common import make -from openrl.modules.common.ppo_net import PPONet as Net -from openrl.modules.networks.policy_value_network_sb3 import ( - PolicyValueNetworkSB3 as PolicyValueNetwork, -) -from openrl.runners.common import PPOAgent as Agent - - -def evaluation(local_trained_file_path=None): - # begin to test - - cfg_parser = create_config_parser() - cfg = cfg_parser.parse_args(["--config", "ppo.yaml"]) - - # Create an environment for testing and set the number of environments to interact with to 9. Set rendering mode to group_human. 
- render_mode = "group_human" - render_mode = None - env = make("CartPole-v1", render_mode=render_mode, env_num=9, asynchronous=True) - model_dict = {"model": PolicyValueNetwork} - net = Net( - env, - cfg=cfg, - model_dict=model_dict, - device="cuda" if torch.cuda.is_available() else "cpu", - ) - # initialize the trainer - agent = Agent( - net, - ) - if local_trained_file_path is not None: - agent.load(local_trained_file_path) - # The trained agent sets up the interactive environment it needs. - agent.set_env(env) - # Initialize the environment and get initial observations and environmental information. - obs, info = env.reset() - done = False - - total_step = 0 - total_reward = 0.0 - while not np.any(done): - # Based on environmental observation input, predict next action. - action, _ = agent.act(obs, deterministic=True) - obs, r, done, info = env.step(action) - total_step += 1 - total_reward += np.mean(r) - if total_step % 50 == 0: - print(f"{total_step}: reward:{np.mean(r)}") - env.close() - print("total step:", total_step) - print("total reward:", total_reward) - - -if __name__ == "__main__": - evaluation() diff --git a/examples/envpool/train_ppo.py b/examples/envpool/train_ppo.py index 4120ee4a..49de50f4 100644 --- a/examples/envpool/train_ppo.py +++ b/examples/envpool/train_ppo.py @@ -16,7 +16,6 @@ """""" import numpy as np -from test_model import evaluation from openrl.configs.config import create_config_parser from openrl.envs.common import make From 990f8c5fd3e448f5594a20f05a48a3d8ed65696c Mon Sep 17 00:00:00 2001 From: Geo Jolly Date: Thu, 7 Dec 2023 15:52:42 +0530 Subject: [PATCH 62/78] Fix a typo: envpool/train-ppo --- examples/envpool/train_ppo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/envpool/train_ppo.py b/examples/envpool/train_ppo.py index 49de50f4..49eb4456 100644 --- a/examples/envpool/train_ppo.py +++ b/examples/envpool/train_ppo.py @@ -32,7 +32,7 @@ def train(): # create environment, set environment parallelism to 9 env = make( - "envpool:Adventure-v5", + "envpool:CartPole-v1", render_mode=None, env_num=9, asynchronous=False, @@ -45,7 +45,7 @@ def train(): cfg=cfg, ) # initialize the trainer - agent = Agent(net, use_wandb=False, project_name="envpool:Adventure-v5") + agent = Agent(net, use_wandb=False, project_name="envpool:CartPole-v1") # start training, set total number of training steps to 20000 agent.train(total_time_steps=20000) From 448cc6f1189ca0ed46aa57b23c87b96e23f99e21 Mon Sep 17 00:00:00 2001 From: Geo Jolly Date: Thu, 7 Dec 2023 16:25:03 +0530 Subject: [PATCH 63/78] Fix dependency error: stablebaseline3 --- openrl/envs/common/build_envs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openrl/envs/common/build_envs.py b/openrl/envs/common/build_envs.py index 386c4adc..37c17c01 100644 --- a/openrl/envs/common/build_envs.py +++ b/openrl/envs/common/build_envs.py @@ -6,7 +6,6 @@ from gymnasium import Env from openrl.envs.wrappers.base_wrapper import BaseWrapper -from openrl.envs.wrappers.envpool_wrappers import VecEnvWrapper, VecMonitor def build_envs( From 48edf3211275181716405509534d7697a7a9ed3c Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 12 Dec 2023 14:06:26 +0800 Subject: [PATCH 64/78] update readme --- Project.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.md b/Project.md index d7c455f5..38a8d1c7 100644 --- a/Project.md +++ b/Project.md @@ -18,7 +18,7 @@ However, in many practical applications, it is important to develop reasonable a In this paper, we propose an on-policy 
framework for discovering multiple strategies for the same task. Experimental results show that our method efficiently finds diverse strategies in a wide variety of reinforcement learning tasks. -- Paper: [DGPO: Discovering Multiple Strategies with Diversity-Guided Policy Optimization](https://arxiv.org/abs/2207.05631)(AAMAS Extended Abstract 2023) -- Authors: Wenze Chen, Shiyu Huang, Yuan Chiang, Ting Chen, Jun Zhu +- Paper: [DGPO: Discovering Multiple Strategies with Diversity-Guided Policy Optimization](https://arxiv.org/abs/2207.05631)(AAAAI 2024) +- Authors: Wenze Chen, Shiyu Huang, Yuan Chiang, Tim Pearce, Wei-Wei Tu, Ting Chen, Jun Zhu From e29fd9031476875e4b10c7ee47461795f57f7745 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 12 Dec 2023 14:52:29 +0800 Subject: [PATCH 65/78] improve test --- openrl/buffers/offpolicy_replay_data.py | 92 +++--- openrl/buffers/replay_data.py | 312 +++++++++--------- openrl/configs/config.py | 7 +- tests/test_buffer/test_generator.py | 88 +++++ tests/test_buffer/test_offpolicy_generator.py | 68 ++++ 5 files changed, 362 insertions(+), 205 deletions(-) create mode 100644 tests/test_buffer/test_generator.py create mode 100644 tests/test_buffer/test_offpolicy_generator.py diff --git a/openrl/buffers/offpolicy_replay_data.py b/openrl/buffers/offpolicy_replay_data.py index 4d62d53f..31e52e85 100644 --- a/openrl/buffers/offpolicy_replay_data.py +++ b/openrl/buffers/offpolicy_replay_data.py @@ -97,52 +97,52 @@ def __init__( ) self.first_insert_flag = True - def dict_insert(self, data): - if self._mixed_obs: - for key in self.critic_obs.keys(): - self.critic_obs[key][self.step + 1] = data["critic_obs"][key].copy() - for key in self.policy_obs.keys(): - self.policy_obs[key][self.step + 1] = data["policy_obs"][key].copy() - for key in self.next_policy_obs.keys(): - self.next_policy_obs[key][self.step + 1] = data["next_policy_obs"][ - key - ].copy() - for key in self.next_critic_obs.keys(): - self.next_critic_obs[key][self.step + 1] = data["next_critic_obs"][ - key - ].copy() - else: - self.critic_obs[self.step + 1] = data["critic_obs"].copy() - self.policy_obs[self.step + 1] = data["policy_obs"].copy() - self.next_policy_obs[self.step + 1] = data["next_policy_obs"].copy() - self.next_critic_obs[self.step + 1] = data["next_critic_obs"].copy() - - if "rnn_states" in data: - self.rnn_states[self.step + 1] = data["rnn_states"].copy() - if "rnn_states_critic" in data: - self.rnn_states_critic[self.step + 1] = data["rnn_states_critic"].copy() - if "actions" in data: - self.actions[self.step + 1] = data["actions"].copy() - if "action_log_probs" in data: - self.action_log_probs[self.step] = data["action_log_probs"].copy() - - if "value_preds" in data: - self.value_preds[self.step] = data["value_preds"].copy() - if "rewards" in data: - self.rewards[self.step + 1] = data["rewards"].copy() - if "masks" in data: - self.masks[self.step + 1] = data["masks"].copy() - - if "bad_masks" in data: - self.bad_masks[self.step + 1] = data["bad_masks"].copy() - if "active_masks" in data: - self.active_masks[self.step + 1] = data["active_masks"].copy() - if "action_masks" in data: - self.action_masks[self.step + 1] = data["action_masks"].copy() - - if (self.step + 1) % self.episode_length != 0: - self.first_insert_flag = False - self.step = (self.step + 1) % self.episode_length + # def dict_insert(self, data): + # if self._mixed_obs: + # for key in self.critic_obs.keys(): + # self.critic_obs[key][self.step + 1] = data["critic_obs"][key].copy() + # for key in 
self.policy_obs.keys(): + # self.policy_obs[key][self.step + 1] = data["policy_obs"][key].copy() + # for key in self.next_policy_obs.keys(): + # self.next_policy_obs[key][self.step + 1] = data["next_policy_obs"][ + # key + # ].copy() + # for key in self.next_critic_obs.keys(): + # self.next_critic_obs[key][self.step + 1] = data["next_critic_obs"][ + # key + # ].copy() + # else: + # self.critic_obs[self.step + 1] = data["critic_obs"].copy() + # self.policy_obs[self.step + 1] = data["policy_obs"].copy() + # self.next_policy_obs[self.step + 1] = data["next_policy_obs"].copy() + # self.next_critic_obs[self.step + 1] = data["next_critic_obs"].copy() + # + # if "rnn_states" in data: + # self.rnn_states[self.step + 1] = data["rnn_states"].copy() + # if "rnn_states_critic" in data: + # self.rnn_states_critic[self.step + 1] = data["rnn_states_critic"].copy() + # if "actions" in data: + # self.actions[self.step + 1] = data["actions"].copy() + # if "action_log_probs" in data: + # self.action_log_probs[self.step] = data["action_log_probs"].copy() + # + # if "value_preds" in data: + # self.value_preds[self.step] = data["value_preds"].copy() + # if "rewards" in data: + # self.rewards[self.step + 1] = data["rewards"].copy() + # if "masks" in data: + # self.masks[self.step + 1] = data["masks"].copy() + # + # if "bad_masks" in data: + # self.bad_masks[self.step + 1] = data["bad_masks"].copy() + # if "active_masks" in data: + # self.active_masks[self.step + 1] = data["active_masks"].copy() + # if "action_masks" in data: + # self.action_masks[self.step + 1] = data["action_masks"].copy() + # + # if (self.step + 1) % self.episode_length != 0: + # self.first_insert_flag = False + # self.step = (self.step + 1) % self.episode_length def init_buffer(self, raw_obs, action_masks=None): critic_obs = get_critic_obs(raw_obs) diff --git a/openrl/buffers/replay_data.py b/openrl/buffers/replay_data.py index 40a4b383..8d092d7d 100644 --- a/openrl/buffers/replay_data.py +++ b/openrl/buffers/replay_data.py @@ -198,49 +198,49 @@ def get_batch_data( else: return np.concatenate(data[step]) - def all_batch_data(self, data_name: str, min=None, max=None): - assert hasattr(self, data_name) - data = getattr(self, data_name) - - if isinstance(data, ObsData): - return data.all_batch(min, max) - else: - return data[min:max].reshape((-1, *data.shape[3:])) - - def dict_insert(self, data): - if self._mixed_obs: - for key in self.critic_obs.keys(): - self.critic_obs[key][self.step + 1] = data["critic_obs"][key].copy() - for key in self.policy_obs.keys(): - self.policy_obs[key][self.step + 1] = data["policy_obs"][key].copy() - else: - self.critic_obs[self.step + 1] = data["critic_obs"].copy() - self.policy_obs[self.step + 1] = data["policy_obs"].copy() - - if "rnn_states" in data: - self.rnn_states[self.step + 1] = data["rnn_states"].copy() - if "rnn_states_critic" in data: - self.rnn_states_critic[self.step + 1] = data["rnn_states_critic"].copy() - if "actions" in data: - self.actions[self.step] = data["actions"].copy() - if "action_log_probs" in data: - self.action_log_probs[self.step] = data["action_log_probs"].copy() - - if "value_preds" in data: - self.value_preds[self.step] = data["value_preds"].copy() - if "rewards" in data: - self.rewards[self.step] = data["rewards"].copy() - if "masks" in data: - self.masks[self.step + 1] = data["masks"].copy() - - if "bad_masks" in data: - self.bad_masks[self.step + 1] = data["bad_masks"].copy() - if "active_masks" in data: - self.active_masks[self.step + 1] = data["active_masks"].copy() - if 
"action_masks" in data: - self.action_masks[self.step + 1] = data["action_masks"].copy() - - self.step = (self.step + 1) % self.episode_length + # def all_batch_data(self, data_name: str, min=None, max=None): + # assert hasattr(self, data_name) + # data = getattr(self, data_name) + # + # if isinstance(data, ObsData): + # return data.all_batch(min, max) + # else: + # return data[min:max].reshape((-1, *data.shape[3:])) + + # def dict_insert(self, data): + # if self._mixed_obs: + # for key in self.critic_obs.keys(): + # self.critic_obs[key][self.step + 1] = data["critic_obs"][key].copy() + # for key in self.policy_obs.keys(): + # self.policy_obs[key][self.step + 1] = data["policy_obs"][key].copy() + # else: + # self.critic_obs[self.step + 1] = data["critic_obs"].copy() + # self.policy_obs[self.step + 1] = data["policy_obs"].copy() + # + # if "rnn_states" in data: + # self.rnn_states[self.step + 1] = data["rnn_states"].copy() + # if "rnn_states_critic" in data: + # self.rnn_states_critic[self.step + 1] = data["rnn_states_critic"].copy() + # if "actions" in data: + # self.actions[self.step] = data["actions"].copy() + # if "action_log_probs" in data: + # self.action_log_probs[self.step] = data["action_log_probs"].copy() + # + # if "value_preds" in data: + # self.value_preds[self.step] = data["value_preds"].copy() + # if "rewards" in data: + # self.rewards[self.step] = data["rewards"].copy() + # if "masks" in data: + # self.masks[self.step + 1] = data["masks"].copy() + # + # if "bad_masks" in data: + # self.bad_masks[self.step + 1] = data["bad_masks"].copy() + # if "active_masks" in data: + # self.active_masks[self.step + 1] = data["active_masks"].copy() + # if "action_masks" in data: + # self.action_masks[self.step + 1] = data["action_masks"].copy() + # + # self.step = (self.step + 1) % self.episode_length def insert( self, @@ -947,119 +947,119 @@ def naive_recurrent_generator(self, advantages, num_mini_batch): yield critic_obs_batch, policy_obs_batch, rnn_states_batch, rnn_states_critic_batch, actions_batch, value_preds_batch, return_batch, masks_batch, active_masks_batch, old_action_log_probs_batch, adv_targ, action_masks_batch - def recurrent_generator_v2( - self, advantages, num_mini_batch=None, mini_batch_size=None - ): - """ - Yield training data for MLP policies. - :param advantages: (np.ndarray) advantage estimates. - :param num_mini_batch: (int) number of minibatches to split the batch into. - :param mini_batch_size: (int) number of samples in each minibatch. - """ - episode_length, n_rollout_threads, num_agents = self.rewards.shape[0:3] - batch_size = n_rollout_threads * episode_length - - if mini_batch_size is None: - assert ( - batch_size >= num_mini_batch - ), ( - "PPO requires the number of processes ({}) " - "* number of steps ({}) = {} " - "to be greater than or equal to the number of PPO mini batches ({})." 
- "".format( - n_rollout_threads, - episode_length, - n_rollout_threads * episode_length, - num_mini_batch, - ) - ) - mini_batch_size = batch_size // num_mini_batch - - rand = torch.randperm(batch_size).numpy() - sampler = [ - rand[i * mini_batch_size : (i + 1) * mini_batch_size] - for i in range(num_mini_batch) - ] - - # keep (num_agent, dim) - critic_obs = self.critic_obs[:-1].reshape(-1, *self.critic_obs.shape[2:]) - - policy_obs = self.policy_obs[:-1].reshape(-1, *self.policy_obs.shape[2:]) - - rnn_states = self.rnn_states[:-1].reshape(-1, *self.rnn_states.shape[2:]) - - rnn_states_critic = self.rnn_states_critic[:-1].reshape( - -1, *self.rnn_states_critic.shape[2:] - ) - - actions = self.actions.reshape(-1, *self.actions.shape[2:]) - - if self.action_masks is not None: - action_masks = self.action_masks[:-1].reshape( - -1, *self.action_masks.shape[2:] - ) - - value_preds = self.value_preds[:-1].reshape(-1, *self.value_preds.shape[2:]) - - returns = self.returns[:-1].reshape(-1, *self.returns.shape[2:]) - - masks = self.masks[:-1].reshape(-1, *self.masks.shape[2:]) - - active_masks = self.active_masks[:-1].reshape(-1, *self.active_masks.shape[2:]) - - action_log_probs = self.action_log_probs.reshape( - -1, *self.action_log_probs.shape[2:] - ) - - advantages = advantages.reshape(-1, *advantages.shape[2:]) - - shuffle = False - if shuffle: - rows, cols = _shuffle_agent_grid(batch_size, num_agents) - - if self.action_masks is not None: - action_masks = action_masks[rows, cols] - critic_obs = critic_obs[rows, cols] - policy_obs = policy_obs[rows, cols] - rnn_states = rnn_states[rows, cols] - rnn_states_critic = rnn_states_critic[rows, cols] - actions = actions[rows, cols] - value_preds = value_preds[rows, cols] - returns = returns[rows, cols] - masks = masks[rows, cols] - active_masks = active_masks[rows, cols] - action_log_probs = action_log_probs[rows, cols] - advantages = advantages[rows, cols] - - for indices in sampler: - # [L,T,N,Dim]-->[L*T,N,Dim]-->[index,N,Dim]-->[index*N, Dim] - critic_obs_batch = critic_obs[indices].reshape(-1, *critic_obs.shape[2:]) - policy_obs_batch = policy_obs[indices].reshape(-1, *policy_obs.shape[2:]) - rnn_states_batch = rnn_states[indices].reshape(-1, *rnn_states.shape[2:]) - rnn_states_critic_batch = rnn_states_critic[indices].reshape( - -1, *rnn_states_critic.shape[2:] - ) - actions_batch = actions[indices].reshape(-1, *actions.shape[2:]) - if self.action_masks is not None: - action_masks_batch = action_masks[indices].reshape( - -1, *action_masks.shape[2:] - ) - else: - action_masks_batch = None - value_preds_batch = value_preds[indices].reshape(-1, *value_preds.shape[2:]) - return_batch = returns[indices].reshape(-1, *returns.shape[2:]) - masks_batch = masks[indices].reshape(-1, *masks.shape[2:]) - active_masks_batch = active_masks[indices].reshape( - -1, *active_masks.shape[2:] - ) - old_action_log_probs_batch = action_log_probs[indices].reshape( - -1, *action_log_probs.shape[2:] - ) - if advantages is None: - adv_targ = None - else: - adv_targ = advantages[indices].reshape(-1, *advantages.shape[2:]) - yield critic_obs_batch, policy_obs_batch, rnn_states_batch, rnn_states_critic_batch, actions_batch, value_preds_batch, return_batch, masks_batch, active_masks_batch, old_action_log_probs_batch, adv_targ, action_masks_batch + # def recurrent_generator_v2( + # self, advantages, num_mini_batch=None, mini_batch_size=None + # ): + # """ + # Yield training data for MLP policies. + # :param advantages: (np.ndarray) advantage estimates. 
+ # :param num_mini_batch: (int) number of minibatches to split the batch into. + # :param mini_batch_size: (int) number of samples in each minibatch. + # """ + # episode_length, n_rollout_threads, num_agents = self.rewards.shape[0:3] + # batch_size = n_rollout_threads * episode_length + # + # if mini_batch_size is None: + # assert ( + # batch_size >= num_mini_batch + # ), ( + # "PPO requires the number of processes ({}) " + # "* number of steps ({}) = {} " + # "to be greater than or equal to the number of PPO mini batches ({})." + # "".format( + # n_rollout_threads, + # episode_length, + # n_rollout_threads * episode_length, + # num_mini_batch, + # ) + # ) + # mini_batch_size = batch_size // num_mini_batch + # + # rand = torch.randperm(batch_size).numpy() + # sampler = [ + # rand[i * mini_batch_size : (i + 1) * mini_batch_size] + # for i in range(num_mini_batch) + # ] + # + # # keep (num_agent, dim) + # critic_obs = self.critic_obs[:-1].reshape(-1, *self.critic_obs.shape[2:]) + # + # policy_obs = self.policy_obs[:-1].reshape(-1, *self.policy_obs.shape[2:]) + # + # rnn_states = self.rnn_states[:-1].reshape(-1, *self.rnn_states.shape[2:]) + # + # rnn_states_critic = self.rnn_states_critic[:-1].reshape( + # -1, *self.rnn_states_critic.shape[2:] + # ) + # + # actions = self.actions.reshape(-1, *self.actions.shape[2:]) + # + # if self.action_masks is not None: + # action_masks = self.action_masks[:-1].reshape( + # -1, *self.action_masks.shape[2:] + # ) + # + # value_preds = self.value_preds[:-1].reshape(-1, *self.value_preds.shape[2:]) + # + # returns = self.returns[:-1].reshape(-1, *self.returns.shape[2:]) + # + # masks = self.masks[:-1].reshape(-1, *self.masks.shape[2:]) + # + # active_masks = self.active_masks[:-1].reshape(-1, *self.active_masks.shape[2:]) + # + # action_log_probs = self.action_log_probs.reshape( + # -1, *self.action_log_probs.shape[2:] + # ) + # + # advantages = advantages.reshape(-1, *advantages.shape[2:]) + # + # shuffle = False + # if shuffle: + # rows, cols = _shuffle_agent_grid(batch_size, num_agents) + # + # if self.action_masks is not None: + # action_masks = action_masks[rows, cols] + # critic_obs = critic_obs[rows, cols] + # policy_obs = policy_obs[rows, cols] + # rnn_states = rnn_states[rows, cols] + # rnn_states_critic = rnn_states_critic[rows, cols] + # actions = actions[rows, cols] + # value_preds = value_preds[rows, cols] + # returns = returns[rows, cols] + # masks = masks[rows, cols] + # active_masks = active_masks[rows, cols] + # action_log_probs = action_log_probs[rows, cols] + # advantages = advantages[rows, cols] + # + # for indices in sampler: + # # [L,T,N,Dim]-->[L*T,N,Dim]-->[index,N,Dim]-->[index*N, Dim] + # critic_obs_batch = critic_obs[indices].reshape(-1, *critic_obs.shape[2:]) + # policy_obs_batch = policy_obs[indices].reshape(-1, *policy_obs.shape[2:]) + # rnn_states_batch = rnn_states[indices].reshape(-1, *rnn_states.shape[2:]) + # rnn_states_critic_batch = rnn_states_critic[indices].reshape( + # -1, *rnn_states_critic.shape[2:] + # ) + # actions_batch = actions[indices].reshape(-1, *actions.shape[2:]) + # if self.action_masks is not None: + # action_masks_batch = action_masks[indices].reshape( + # -1, *action_masks.shape[2:] + # ) + # else: + # action_masks_batch = None + # value_preds_batch = value_preds[indices].reshape(-1, *value_preds.shape[2:]) + # return_batch = returns[indices].reshape(-1, *returns.shape[2:]) + # masks_batch = masks[indices].reshape(-1, *masks.shape[2:]) + # active_masks_batch = active_masks[indices].reshape( + # -1, 
*active_masks.shape[2:] + # ) + # old_action_log_probs_batch = action_log_probs[indices].reshape( + # -1, *action_log_probs.shape[2:] + # ) + # if advantages is None: + # adv_targ = None + # else: + # adv_targ = advantages[indices].reshape(-1, *advantages.shape[2:]) + # yield critic_obs_batch, policy_obs_batch, rnn_states_batch, rnn_states_critic_batch, actions_batch, value_preds_batch, return_batch, masks_batch, active_masks_batch, old_action_log_probs_batch, adv_targ, action_masks_batch def recurrent_generator(self, advantages, num_mini_batch, data_chunk_length): episode_length, n_rollout_threads, num_agents = self.rewards.shape[0:3] diff --git a/openrl/configs/config.py b/openrl/configs/config.py index 1137d6e3..49bdae79 100644 --- a/openrl/configs/config.py +++ b/openrl/configs/config.py @@ -498,13 +498,14 @@ def create_config_parser(): ) parser.add_argument( "--use_popart", - action="store_true", default=False, + type=bool, help="by default False, use PopArt to normalize rewards.", ) parser.add_argument( "--dual_clip_ppo", default=False, + type=bool, help="by default False, use dual-clip ppo.", ) parser.add_argument( @@ -730,8 +731,8 @@ def create_config_parser(): ) parser.add_argument( "--use_gae", - action="store_false", default=True, + type=bool, help="use generalized advantage estimation", ) parser.add_argument( @@ -748,8 +749,8 @@ def create_config_parser(): ) parser.add_argument( "--use_proper_time_limits", - action="store_true", default=False, + type=bool, help="compute returns taking into account time limits", ) parser.add_argument( diff --git a/tests/test_buffer/test_generator.py b/tests/test_buffer/test_generator.py new file mode 100644 index 00000000..4de33c02 --- /dev/null +++ b/tests/test_buffer/test_generator.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" +import os +import sys + +import pytest + +from openrl.envs.common import make +from openrl.modules.common import PPONet as Net +from openrl.runners.common import PPOAgent as Agent + + +@pytest.fixture( + scope="module", + params=[ + "--use_recurrent_policy true --use_joint_action_loss true", + "--use_recurrent_policy true --use_joint_action_loss false", + "--use_recurrent_policy false --use_naive_recurrent true", + "--use_recurrent_policy false --use_naive_recurrent false", + ], +) +def generator_type(request): + return request.param + + +@pytest.fixture(scope="module", params=["--use_gae true", "--use_gae false"]) +def use_gae(request): + return request.param + + +@pytest.fixture( + scope="module", + params=["--use_proper_time_limits true", "--use_proper_time_limits false"], +) +def use_proper_time_limits(request): + return request.param + + +@pytest.fixture( + scope="module", + params=[ + "--use_popart true --use_valuenorm false", + "--use_popart false --use_valuenorm true", + "--use_popart false --use_valuenorm false", + ], +) +def use_popart(request): + return request.param + + +@pytest.fixture(scope="module") +def config(use_proper_time_limits, use_popart, use_gae, generator_type): + config_str = ( + use_proper_time_limits + " " + use_popart + " " + use_gae + " " + generator_type + ) + + from openrl.configs.config import create_config_parser + + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(config_str.split()) + return cfg + + +@pytest.mark.unittest +def test_buffer_generator(config): + env = make("CartPole-v1", env_num=2) + agent = Agent(Net(env, cfg=config)) + agent.train(total_time_steps=200) + env.close() + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_buffer/test_offpolicy_generator.py b/tests/test_buffer/test_offpolicy_generator.py new file mode 100644 index 00000000..5e5da276 --- /dev/null +++ b/tests/test_buffer/test_offpolicy_generator.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" +import os +import sys + +import pytest + +from openrl.envs.common import make +from openrl.modules.common import DQNNet as Net +from openrl.runners.common import DQNAgent as Agent + + +@pytest.fixture( + scope="module", + params=[ + "--use_recurrent_policy false --use_joint_action_loss false", + ], +) +def generator_type(request): + return request.param + + +@pytest.fixture(scope="module", params=["--use_proper_time_limits false"]) +def use_proper_time_limits(request): + return request.param + + +@pytest.fixture(scope="module", params=["--use_popart false --use_valuenorm false"]) +def use_popart(request): + return request.param + + +@pytest.fixture(scope="module") +def config(use_proper_time_limits, use_popart, generator_type): + config_str = use_proper_time_limits + " " + use_popart + " " + generator_type + + from openrl.configs.config import create_config_parser + + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(config_str.split()) + return cfg + + +@pytest.mark.unittest +def test_buffer_generator(config): + env = make("CartPole-v1", env_num=2) + agent = Agent(Net(env, cfg=config)) + agent.train(total_time_steps=200) + env.close() + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 40d8304581bfec6964a85ac4d410a4b86ba5cddb Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Tue, 12 Dec 2023 15:22:28 +0800 Subject: [PATCH 66/78] improve test --- openrl/algorithms/ppo.py | 11 +++++++---- openrl/buffers/replay_data.py | 8 ++++++-- tests/test_buffer/test_generator.py | 19 ++++++++++++++++--- tests/test_buffer/test_offpolicy_generator.py | 19 ++++++++++++++++--- 4 files changed, 45 insertions(+), 12 deletions(-) diff --git a/openrl/algorithms/ppo.py b/openrl/algorithms/ppo.py index e72e01bb..80e9f23f 100644 --- a/openrl/algorithms/ppo.py +++ b/openrl/algorithms/ppo.py @@ -179,7 +179,7 @@ def cal_value_loss( -self.clip_param, self.clip_param ) - if self._use_popart or self._use_valuenorm: + if (self._use_popart or self._use_valuenorm) and value_normalizer is not None: value_normalizer.update(return_batch) error_clipped = ( value_normalizer.normalize(return_batch) - value_pred_clipped @@ -382,9 +382,12 @@ def train_ppo(self, buffer, turn_on): ].module.value_normalizer else: value_normalizer = self.algo_module.get_critic_value_normalizer() - advantages = buffer.returns[:-1] - value_normalizer.denormalize( - buffer.value_preds[:-1] - ) + if value_normalizer is not None: + advantages = buffer.returns[:-1] - value_normalizer.denormalize( + buffer.value_preds[:-1] + ) + else: + advantages = buffer.returns[:-1] - buffer.value_preds[:-1] else: advantages = buffer.returns[:-1] - buffer.value_preds[:-1] diff --git a/openrl/buffers/replay_data.py b/openrl/buffers/replay_data.py index 8d092d7d..a8f4c1b7 100644 --- a/openrl/buffers/replay_data.py +++ b/openrl/buffers/replay_data.py @@ -323,7 +323,9 @@ def compute_returns(self, next_value, value_normalizer=None): self.value_preds[-1] = next_value gae = 0 for step in reversed(range(self.rewards.shape[0])): - if self._use_popart or self._use_valuenorm: + if ( + self._use_popart or self._use_valuenorm + ) and value_normalizer is not None: # step + 1 delta = ( self.rewards[step] @@ -357,7 +359,9 @@ def compute_returns(self, next_value, value_normalizer=None): else: self.returns[-1] = next_value for step in reversed(range(self.rewards.shape[0])): - if self._use_popart or self._use_valuenorm: + if ( + self._use_popart or self._use_valuenorm + ) and value_normalizer is not None: 
self.returns[step] = ( self.returns[step + 1] * self.gamma * self.masks[step + 1] + self.rewards[step] diff --git a/tests/test_buffer/test_generator.py b/tests/test_buffer/test_generator.py index 4de33c02..27763635 100644 --- a/tests/test_buffer/test_generator.py +++ b/tests/test_buffer/test_generator.py @@ -25,6 +25,11 @@ from openrl.runners.common import PPOAgent as Agent +@pytest.fixture(scope="module", params=["--episode_length 10"]) +def episode_length(request): + return request.param + + @pytest.fixture( scope="module", params=[ @@ -64,9 +69,17 @@ def use_popart(request): @pytest.fixture(scope="module") -def config(use_proper_time_limits, use_popart, use_gae, generator_type): +def config(use_proper_time_limits, use_popart, use_gae, generator_type, episode_length): config_str = ( - use_proper_time_limits + " " + use_popart + " " + use_gae + " " + generator_type + use_proper_time_limits + + " " + + use_popart + + " " + + use_gae + + " " + + generator_type + + " " + + episode_length ) from openrl.configs.config import create_config_parser @@ -80,7 +93,7 @@ def config(use_proper_time_limits, use_popart, use_gae, generator_type): def test_buffer_generator(config): env = make("CartPole-v1", env_num=2) agent = Agent(Net(env, cfg=config)) - agent.train(total_time_steps=200) + agent.train(total_time_steps=50) env.close() diff --git a/tests/test_buffer/test_offpolicy_generator.py b/tests/test_buffer/test_offpolicy_generator.py index 5e5da276..ec960973 100644 --- a/tests/test_buffer/test_offpolicy_generator.py +++ b/tests/test_buffer/test_offpolicy_generator.py @@ -25,6 +25,11 @@ from openrl.runners.common import DQNAgent as Agent +@pytest.fixture(scope="module", params=["--episode_length 10"]) +def episode_length(request): + return request.param + + @pytest.fixture( scope="module", params=[ @@ -46,8 +51,16 @@ def use_popart(request): @pytest.fixture(scope="module") -def config(use_proper_time_limits, use_popart, generator_type): - config_str = use_proper_time_limits + " " + use_popart + " " + generator_type +def config(use_proper_time_limits, use_popart, generator_type, episode_length): + config_str = ( + use_proper_time_limits + + " " + + use_popart + + " " + + generator_type + + " " + + episode_length + ) from openrl.configs.config import create_config_parser @@ -60,7 +73,7 @@ def config(use_proper_time_limits, use_popart, generator_type): def test_buffer_generator(config): env = make("CartPole-v1", env_num=2) agent = Agent(Net(env, cfg=config)) - agent.train(total_time_steps=200) + agent.train(total_time_steps=50) env.close() From 38da73a3ac4cbd99ee1b1ed8830040d419e074a3 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Thu, 14 Dec 2023 14:34:23 +0800 Subject: [PATCH 67/78] improve test --- tests/test_algorithm/test_a2c_algorithm.py | 95 +++++++++++++++++++++ tests/test_algorithm/test_bc_algorithm.py | 84 ++++++++++++++++++ tests/test_algorithm/test_ddpg_algorithm.py | 91 ++++++++++++++++++++ tests/test_algorithm/test_ppo_algorithm.py | 4 +- tests/test_algorithm/test_sac_algorithm.py | 91 ++++++++++++++++++++ 5 files changed, 364 insertions(+), 1 deletion(-) create mode 100644 tests/test_algorithm/test_a2c_algorithm.py create mode 100644 tests/test_algorithm/test_bc_algorithm.py create mode 100644 tests/test_algorithm/test_ddpg_algorithm.py create mode 100644 tests/test_algorithm/test_sac_algorithm.py diff --git a/tests/test_algorithm/test_a2c_algorithm.py b/tests/test_algorithm/test_a2c_algorithm.py new file mode 100644 index 00000000..0f4f7226 --- /dev/null +++ 
b/tests/test_algorithm/test_a2c_algorithm.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""""" +import os +import sys + +import numpy as np +import pytest +from gymnasium import spaces + + +@pytest.fixture +def obs_space(): + return spaces.Box(low=-np.inf, high=+np.inf, shape=(1,), dtype=np.float32) + + +@pytest.fixture +def act_space(): + return spaces.Discrete(2) + + +@pytest.fixture( + scope="module", params=["--use_share_model false", "--use_share_model true"] +) +def config(request): + from openrl.configs.config import create_config_parser + + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +@pytest.fixture +def amp_config(): + from openrl.configs.config import create_config_parser + + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args("") + return cfg + + +@pytest.fixture +def init_module(config, obs_space, act_space): + from openrl.modules.ppo_module import PPOModule + + module = PPOModule( + config, + policy_input_space=obs_space, + critic_input_space=obs_space, + act_space=act_space, + share_model=config.use_share_model, + ) + return module + + +@pytest.fixture +def buffer_data(config, obs_space, act_space): + from openrl.buffers.normal_buffer import NormalReplayBuffer + + buffer = NormalReplayBuffer( + config, + num_agents=1, + obs_space=obs_space, + act_space=act_space, + data_client=None, + episode_length=100, + ) + return buffer.data + + +@pytest.mark.unittest +def test_a2c_algorithm(config, init_module, buffer_data): + from openrl.algorithms.a2c import A2CAlgorithm + + a2c_algo = A2CAlgorithm(config, init_module) + + a2c_algo.train(buffer_data) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_algorithm/test_bc_algorithm.py b/tests/test_algorithm/test_bc_algorithm.py new file mode 100644 index 00000000..fa073174 --- /dev/null +++ b/tests/test_algorithm/test_bc_algorithm.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" +import os +import sys + +import numpy as np +import pytest +from gymnasium import spaces + + +@pytest.fixture +def obs_space(): + return spaces.Box(low=-np.inf, high=+np.inf, shape=(1,), dtype=np.float32) + + +@pytest.fixture +def act_space(): + return spaces.Discrete(2) + + +@pytest.fixture(scope="module", params=["", "--use_share_model true"]) +def config(request): + from openrl.configs.config import create_config_parser + + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +@pytest.fixture +def init_module(config, obs_space, act_space): + from openrl.modules.bc_module import BCModule + + module = BCModule( + config, + policy_input_space=obs_space, + critic_input_space=obs_space, + act_space=act_space, + share_model=config.use_share_model, + ) + return module + + +@pytest.fixture +def buffer_data(config, obs_space, act_space): + from openrl.buffers.normal_buffer import NormalReplayBuffer + + buffer = NormalReplayBuffer( + config, + num_agents=1, + obs_space=obs_space, + act_space=act_space, + data_client=None, + episode_length=100, + ) + return buffer.data + + +@pytest.mark.unittest +def test_bc_algorithm(config, init_module, buffer_data): + from openrl.algorithms.behavior_cloning import BCAlgorithm + + bc_algo = BCAlgorithm(config, init_module) + + bc_algo.train(buffer_data) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_algorithm/test_ddpg_algorithm.py b/tests/test_algorithm/test_ddpg_algorithm.py new file mode 100644 index 00000000..b31a56df --- /dev/null +++ b/tests/test_algorithm/test_ddpg_algorithm.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" +import os +import sys + +import numpy as np +import pytest +from gymnasium import spaces + + +@pytest.fixture +def obs_space(): + return spaces.Box(low=-np.inf, high=+np.inf, shape=(1,), dtype=np.float32) + + +@pytest.fixture +def act_space(): + return spaces.box.Box(low=-np.inf, high=+np.inf, shape=(1,), dtype=np.float32) + + +@pytest.fixture(scope="module", params=[""]) +def config(request): + from openrl.configs.config import create_config_parser + + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +@pytest.fixture +def amp_config(): + from openrl.configs.config import create_config_parser + + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args("") + return cfg + + +@pytest.fixture +def init_module(config, obs_space, act_space): + from openrl.modules.ddpg_module import DDPGModule + + module = DDPGModule( + config, + input_space=obs_space, + act_space=act_space, + ) + return module + + +@pytest.fixture +def buffer_data(config, obs_space, act_space): + from openrl.buffers.offpolicy_buffer import OffPolicyReplayBuffer + + buffer = OffPolicyReplayBuffer( + config, + num_agents=1, + obs_space=obs_space, + act_space=act_space, + data_client=None, + episode_length=5000, + ) + return buffer.data + + +@pytest.mark.unittest +def test_ddpg_algorithm(config, init_module, buffer_data): + from openrl.algorithms.ddpg import DDPGAlgorithm + + ddpg_algo = DDPGAlgorithm(config, init_module) + + ddpg_algo.train(buffer_data) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) diff --git a/tests/test_algorithm/test_ppo_algorithm.py b/tests/test_algorithm/test_ppo_algorithm.py index 8ac5c865..98a8a5d4 100644 --- a/tests/test_algorithm/test_ppo_algorithm.py +++ b/tests/test_algorithm/test_ppo_algorithm.py @@ -33,7 +33,9 @@ def act_space(): return spaces.Discrete(2) -@pytest.fixture(scope="module", params=["", "--use_share_model true"]) +@pytest.fixture( + scope="module", params=["--use_share_model false", "--use_share_model true"] +) def config(request): from openrl.configs.config import create_config_parser diff --git a/tests/test_algorithm/test_sac_algorithm.py b/tests/test_algorithm/test_sac_algorithm.py new file mode 100644 index 00000000..80447a3a --- /dev/null +++ b/tests/test_algorithm/test_sac_algorithm.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# Copyright 2023 The OpenRL Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""""" +import os +import sys + +import numpy as np +import pytest +from gymnasium import spaces + + +@pytest.fixture +def obs_space(): + return spaces.Box(low=-np.inf, high=+np.inf, shape=(1,), dtype=np.float32) + + +@pytest.fixture +def act_space(): + return spaces.box.Box(low=-np.inf, high=+np.inf, shape=(1,), dtype=np.float32) + + +@pytest.fixture(scope="module", params=[""]) +def config(request): + from openrl.configs.config import create_config_parser + + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args(request.param.split()) + return cfg + + +@pytest.fixture +def amp_config(): + from openrl.configs.config import create_config_parser + + cfg_parser = create_config_parser() + cfg = cfg_parser.parse_args("") + return cfg + + +@pytest.fixture +def init_module(config, obs_space, act_space): + from openrl.modules.sac_module import SACModule + + module = SACModule( + config, + input_space=obs_space, + act_space=act_space, + ) + return module + + +@pytest.fixture +def buffer_data(config, obs_space, act_space): + from openrl.buffers.offpolicy_buffer import OffPolicyReplayBuffer + + buffer = OffPolicyReplayBuffer( + config, + num_agents=1, + obs_space=obs_space, + act_space=act_space, + data_client=None, + episode_length=5000, + ) + return buffer.data + + +@pytest.mark.unittest +def test_sac_algorithm(config, init_module, buffer_data): + from openrl.algorithms.sac import SACAlgorithm + + sac_algo = SACAlgorithm(config, init_module) + + sac_algo.train(buffer_data) + + +if __name__ == "__main__": + sys.exit(pytest.main(["-sv", os.path.basename(__file__)])) From 05295b87dacc0ebc0a4b8a062beca77b04039429 Mon Sep 17 00:00:00 2001 From: Chen001117 Date: Tue, 19 Dec 2023 13:08:30 +0800 Subject: [PATCH 68/78] ds_support --- examples/nlp/ds_config.json | 4 +- examples/nlp/nlp_ppo.yaml | 10 ++--- examples/nlp/nlp_ppo_ds.yaml | 10 ++--- openrl/envs/nlp/rewards/intent.py | 29 +++++++++---- openrl/envs/nlp/rewards/kl_penalty.py | 29 ++++++++----- openrl/modules/networks/policy_network_gpt.py | 41 ++++++++++++------- openrl/modules/networks/value_network_gpt.py | 29 ++++++++----- 7 files changed, 96 insertions(+), 56 deletions(-) diff --git a/examples/nlp/ds_config.json b/examples/nlp/ds_config.json index 544bc405..3de0eb2d 100644 --- a/examples/nlp/ds_config.json +++ b/examples/nlp/ds_config.json @@ -3,9 +3,7 @@ "train_micro_batch_size_per_gpu": 16, "steps_per_print": 10, "zero_optimization": { - "stage": 2, - "reduce_bucket_size": 5e7, - "allgather_bucket_size": 5e7 + "stage": 2 }, "fp16": {"enabled": false, "loss_scale_window": 100} } \ No newline at end of file diff --git a/examples/nlp/nlp_ppo.yaml b/examples/nlp/nlp_ppo.yaml index 1ba77379..caf97bb2 100644 --- a/examples/nlp/nlp_ppo.yaml +++ b/examples/nlp/nlp_ppo.yaml @@ -12,18 +12,18 @@ num_mini_batch: 20 hidden_size: 1 -model_path: /home/chenwenze/data_server/huggingface/models/facebook/opt-125m +model_path: /rajkumarrrk/gpt2-fine-tuned-on-daily-dialog env: args: { - 'tokenizer_path': '/home/chenwenze/data_server/huggingface/models/facebook/opt-125m', - 'data_path': '/home/chenwenze/data_server/huggingface/datasets/daily_dialog', + 'tokenizer_path': 'gpt2', + 'data_path': 'daily_dialog', } vec_info_class: id: "NLPVecInfo" reward_class: id: "NLPReward" args: { - "ref_model": "/home/chenwenze/data_server/huggingface/models/facebook/opt-125m", - "intent_model": "/home/chenwenze/data_server/huggingface/models/rajkumarrrk/roberta-daily-dialog-intent-classifier", + "ref_model": "/rajkumarrrk/gpt2-fine-tuned-on-daily-dialog", + 
"intent_model": "/rajkumarrrk/roberta-daily-dialog-intent-classifier", } \ No newline at end of file diff --git a/examples/nlp/nlp_ppo_ds.yaml b/examples/nlp/nlp_ppo_ds.yaml index c9d4ad60..1f2ad7f8 100644 --- a/examples/nlp/nlp_ppo_ds.yaml +++ b/examples/nlp/nlp_ppo_ds.yaml @@ -17,11 +17,11 @@ use_fp16: false use_offload: false deepspeed_config: ds_config.json -model_path: /home/chenwenze/data_server/huggingface/models/facebook/opt-125m +model_path: /rajkumarrrk/gpt2-fine-tuned-on-daily-dialog/ env: args: { - 'tokenizer_path': '/home/chenwenze/data_server/huggingface/models/gpt2', - 'data_path': '/home/chenwenze/data_server/huggingface/datasets/daily_dialog', + 'tokenizer_path': 'gpt2', + 'data_path': 'daily_dialog', } vec_info_class: id: "NLPVecInfo" @@ -30,8 +30,8 @@ reward_class: args: { "use_deepspeed": true, "ref_ds_config": "eval_ds_config.json", - "ref_model": "/home/chenwenze/data_server/huggingface/models/facebook/opt-125m", + "ref_model": /rajkumarrrk/gpt2-fine-tuned-on-daily-dialog/, "intent_ds_config": "eval_ds_config.json", - "intent_model": "/home/chenwenze/data_server/huggingface/models/rajkumarrrk/roberta-daily-dialog-intent-classifier", + "intent_model": "/rajkumarrrk/roberta-daily-dialog-intent-classifier", } \ No newline at end of file diff --git a/openrl/envs/nlp/rewards/intent.py b/openrl/envs/nlp/rewards/intent.py index 0d449d13..f0929932 100644 --- a/openrl/envs/nlp/rewards/intent.py +++ b/openrl/envs/nlp/rewards/intent.py @@ -36,6 +36,10 @@ def __init__( self._intent_coeff = intent_coeff self.use_deepspeed = use_deepspeed + self.use_half = False + self.use_data_parallel = not use_deepspeed # default to use data parallel + self.use_model_parallel = False + if intent_model == "builtin_intent": from transformers import GPT2Config, GPT2LMHeadModel @@ -80,16 +84,16 @@ def __init__(self, input_ids, attention_mask): self._device = "cuda" self._model = self._model.to("cuda") self._model, *_ = deepspeed.initialize(model=self._model, config=ds_config) + self.use_fp16 = ds_config["fp16"]["enabled"] else: - if torch.cuda.is_available(): - manager = LocalGPUManager() - manager.log_info() - self._device = f"cuda:{manager.get_gpu()}" - else: - self._device = "cpu" - print("Intent Model choose to use device:{}".format(self._device)) - - self._model = self._model.to(self._device) + self._device = "cuda" + if self.use_model_parallel: + self._model.parallelize() + elif self.use_data_parallel: + if self.use_half: + self._model = self._model.half() + self._model = torch.nn.DataParallel(self._model) + self._model = self._model.to(self._device) def __call__( self, @@ -120,6 +124,13 @@ def get_input_for_classifier(prompt, generated_text): input_texts, return_tensors="pt", truncation=True, padding=True ) + if self.use_half: + encoded.input_ids = encoded.input_ids.int() + encoded.attention_mask = encoded.attention_mask.int() + else: + encoded.input_ids = encoded.input_ids.long() + encoded.attention_mask = encoded.attention_mask.long() + with torch.no_grad(): outputs = self._model( input_ids=encoded.input_ids.to(self._device), diff --git a/openrl/envs/nlp/rewards/kl_penalty.py b/openrl/envs/nlp/rewards/kl_penalty.py index 7f5a6426..406c9215 100644 --- a/openrl/envs/nlp/rewards/kl_penalty.py +++ b/openrl/envs/nlp/rewards/kl_penalty.py @@ -37,9 +37,10 @@ def __init__( super().__init__() self.device = "cuda" - self.use_data_parallel = False - self.use_model_parallel = False self.use_deepspeed = use_deepspeed + self.use_half = False + self.use_data_parallel = not use_deepspeed + 
self.use_model_parallel = False assert not (self.use_deepspeed and self.use_data_parallel) assert not (self.use_deepspeed and self.use_model_parallel) assert not (self.use_data_parallel and self.use_model_parallel) @@ -70,10 +71,12 @@ def __init__( self.use_fp16 = False self._ref_engine, *_ = deepspeed.initialize(model=self, config=ds_config) - elif torch.cuda.is_available(): + else: if self.use_model_parallel: self._ref_net.parallelize() elif self.use_data_parallel: # else defaults to data parallel + if self.use_half: + self._ref_net = self._ref_net.half() self._ref_net = torch.nn.DataParallel(self._ref_net) self._ref_net = self._ref_net.to(self.device) @@ -113,24 +116,30 @@ def __call__( self._ref_net, input_ids, past_model_kwargs ) - if self.use_deepspeed: - if self.use_fp16: - for key in ["input_ids", "position_ids"]: - model_inputs[key] = model_inputs[key].half().int() - for key in ["attention_mask"]: - model_inputs[key] = model_inputs[key].half() + if self.use_half: + for key in ["input_ids", "position_ids", "attention_mask"]: + if key in model_inputs: + model_inputs[key] = model_inputs[key].int() + else: + for key in ["input_ids", "position_ids", "attention_mask"]: + if key in model_inputs: + model_inputs[key] = model_inputs[key].long() + with torch.no_grad(): output = self._ref_net(output_hidden_states=True, **model_inputs) output["past_key_values"] = None next_token_logits = output.logits[:, -1, :] + if self.use_deepspeed and self.use_fp16: + next_token_logits = next_token_logits.double() dist = self._action_dist.proba_distribution(action_logits=next_token_logits) action_input = actions.to(next_token_logits.device) ref_log_prob = dist.log_prob(action_input) ref_log_prob = ref_log_prob.reshape(action_log_probs.shape) + kl_div = action_log_probs.copy() - ref_log_prob.detach().cpu().numpy() - rew = -self._alpha * kl_div + rew = -self._alpha * kl_div infos = [] for kl in kl_div: infos.append( diff --git a/openrl/modules/networks/policy_network_gpt.py b/openrl/modules/networks/policy_network_gpt.py index 5c97feef..0cda244e 100644 --- a/openrl/modules/networks/policy_network_gpt.py +++ b/openrl/modules/networks/policy_network_gpt.py @@ -48,11 +48,11 @@ def __init__( ) -> None: self.device = device - self.use_half = use_half - - self.use_data_parallel = False - self.use_model_parallel = False + self.use_fp16 = cfg.use_fp16 self.use_deepspeed = cfg.use_deepspeed + self.use_half = False + self.use_data_parallel = not cfg.use_deepspeed # default to use data parallel + self.use_model_parallel = False assert not (self.use_deepspeed and self.use_data_parallel) assert not (self.use_deepspeed and self.use_model_parallel) @@ -80,6 +80,8 @@ def __init__( if self.use_model_parallel: self._policy_model.parallelize() elif self.use_data_parallel: + if self.use_half: + self._policy_model = self._policy_model.half() self._policy_model = torch.nn.DataParallel(self._policy_model) self._policy_model = self._policy_model.to(self.device) @@ -120,15 +122,22 @@ def forward_original( ): for key in raw_obs.keys(): raw_obs[key] = torch.from_numpy(raw_obs[key]) if type(raw_obs[key]) == np.ndarray else raw_obs[key] + rnn_states = check(rnn_states) + + if self.use_half: + input_ids = raw_obs["input_encoded_pt"].int() + attention_mask = raw_obs["input_attention_mask_pt"].int() + else: + input_ids = raw_obs["input_encoded_pt"].long() + attention_mask = raw_obs["input_attention_mask_pt"].long() + + for key in raw_obs.keys(): if self.use_data_parallel: - raw_obs[key] = raw_obs[key].to(self.device) + input_ids = 
input_ids.to(self.device) + attention_mask = attention_mask.to(self.device) else: - raw_obs[key] = raw_obs[key].to(self._policy_model.device) - - rnn_states = check(rnn_states) - - input_ids = raw_obs["input_encoded_pt"].int() - attention_mask = raw_obs["input_attention_mask_pt"] + input_ids = input_ids.to(self._policy_model.device) + attention_mask = attention_mask.to(self._policy_model.device) past_model_kwargs = None @@ -145,7 +154,7 @@ def forward_original( output = self._policy_model(**model_inputs) # compute action probs - policy head - next_token_logits = output.logits[:, -1] + next_token_logits = output.logits[:, -1] dist = self._action_dist.proba_distribution(action_logits=next_token_logits) actions = dist.mode() if deterministic else dist.sample() @@ -168,8 +177,12 @@ def eval_actions( action = check(action).to(self._policy_model.device).squeeze() rnn_states = check(rnn_states) - input_ids = obs["input_encoded_pt"].int() - attention_mask = obs["input_attention_mask_pt"] + if self.half: + input_ids = obs["input_encoded_pt"].int() + attention_mask = obs["input_attention_mask_pt"].int() + else: + input_ids = obs["input_encoded_pt"].long() + attention_mask = obs["input_attention_mask_pt"].long() past_model_kwargs = None diff --git a/openrl/modules/networks/value_network_gpt.py b/openrl/modules/networks/value_network_gpt.py index 4815cff7..b4ed9b1c 100644 --- a/openrl/modules/networks/value_network_gpt.py +++ b/openrl/modules/networks/value_network_gpt.py @@ -46,11 +46,12 @@ def __init__( ): self.device = device - self.use_half = use_half - self.use_data_parallel = False - self.use_model_parallel = False + self.use_fp16 = cfg.use_fp16 self.use_deepspeed = cfg.use_deepspeed + self.use_half = False + self.use_data_parallel = not cfg.use_deepspeed + self.use_model_parallel = False assert not (self.use_deepspeed and self.use_data_parallel) assert not (self.use_deepspeed and self.use_model_parallel) assert not (self.use_data_parallel and self.use_model_parallel) @@ -62,18 +63,22 @@ def __init__( self._value_model = AutoModelForCausalLM.from_pretrained(cfg.model_path) self._value_model.config.use_cache = False self._value_head = nn.Linear( - self._value_model.config.hidden_size, 1, bias=False + self._value_model.config.n_embd, 1, bias=False # gpt2 + # self._value_model.config.word_embed_proj_dim, 1, bias=False # opt-x ) self.value_normalizer = ( ValueNorm(1, device=device) if self._use_valuenorm else None ) - self._value_head.to(self.device) - - if torch.cuda.is_available(): + if self.use_deepspeed: + self._value_head.to(self.device) + else: if self.use_model_parallel: self._value_model.parallelize() elif self.use_data_parallel: + if self.use_half: + self._value_model = self._value_model.half() + self._value_head = self._value_head.half() self._value_model = torch.nn.DataParallel(self._value_model) self._value_model = self._value_model.to(self.device) self._value_head = torch.nn.DataParallel(self._value_head) @@ -113,9 +118,13 @@ def forward(self, critic_obs, rnn_states, masks): rnn_states = check(rnn_states) - input_ids = critic_obs["input_encoded_pt"].int() - attention_mask = critic_obs["input_attention_mask_pt"] - + if self.use_half: + input_ids = critic_obs["input_encoded_pt"].int() + attention_mask = critic_obs["input_attention_mask_pt"].int() + else: + input_ids = critic_obs["input_encoded_pt"].long() + attention_mask = critic_obs["input_attention_mask_pt"].long() + past_model_kwargs = None if not past_model_kwargs: past_model_kwargs = { From cd7f5b075839e719fb66117259bcf98b750e60d5 
Mon Sep 17 00:00:00 2001 From: Chen001117 Date: Tue, 19 Dec 2023 13:08:48 +0800 Subject: [PATCH 69/78] meteor_init_bug --- openrl/envs/nlp/daily_dialog_env.py | 18 +++++++++++++++--- openrl/envs/vec_env/wrappers/reward_wrapper.py | 5 ++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/openrl/envs/nlp/daily_dialog_env.py b/openrl/envs/nlp/daily_dialog_env.py index 61e68946..98dd4d85 100644 --- a/openrl/envs/nlp/daily_dialog_env.py +++ b/openrl/envs/nlp/daily_dialog_env.py @@ -113,8 +113,20 @@ def __init__( self.__time_step = None self.reward_function = None - def set_reward(self, reward_fn): - self.reward_function = reward_fn + self.set_reward() + + def set_reward(self, reward_fn=None): + + from openrl.envs.nlp.rewards.meteor import Meteor + meteor_config = { + "meteor_coeff": 0.5, + "test": False, + } + self.reward_function = { + "meteor": Meteor(**meteor_config), + } + + # self.reward_function = reward_fn def step_word(self, word: str) -> Tuple[Dict[str, torch.tensor], int, bool, dict]: action = self.tokenizer.encode(word)[1] @@ -135,7 +147,7 @@ def step( done = done or self.__current_obs.context_text.endswith(DailyDialog.EOU_TOKEN) reward = 0.0 - reward_info = dict() + reward_info = dict() if done and self.reward_function: for reward_function in self.reward_function.values(): diff --git a/openrl/envs/vec_env/wrappers/reward_wrapper.py b/openrl/envs/vec_env/wrappers/reward_wrapper.py index d0a4d630..2b5ca266 100644 --- a/openrl/envs/vec_env/wrappers/reward_wrapper.py +++ b/openrl/envs/vec_env/wrappers/reward_wrapper.py @@ -24,13 +24,12 @@ from openrl.envs.vec_env.wrappers.base_wrapper import VecEnvWrapper from openrl.rewards.base_reward import BaseReward - class RewardWrapper(VecEnvWrapper): def __init__(self, env: BaseVecEnv, reward_class: BaseReward): super().__init__(env) self.reward_class = reward_class - if len(self.reward_class.inner_rew_funcs) > 0: - env.call("set_reward", **{"reward_fn": self.reward_class.inner_rew_funcs}) + # if len(self.reward_class.inner_rew_funcs) > 0: + # env.call("set_reward", **{"reward_fn": self.reward_class.inner_rew_funcs}) def step( self, action: ActType, extra_data: Optional[Dict[str, Any]] From 70163284cf9976ab6c15a5b6cd19f6486eed0911 Mon Sep 17 00:00:00 2001 From: Wen-Tse Chen Date: Tue, 19 Dec 2023 21:45:13 -0500 Subject: [PATCH 70/78] meteor_init_bug --- examples/nlp/nlp_ppo.yaml | 6 +++--- examples/nlp/nlp_ppo_ds.yaml | 6 +++--- openrl/rewards/nlp_reward.py | 18 +++++++++++------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/examples/nlp/nlp_ppo.yaml b/examples/nlp/nlp_ppo.yaml index caf97bb2..918a75b8 100644 --- a/examples/nlp/nlp_ppo.yaml +++ b/examples/nlp/nlp_ppo.yaml @@ -12,7 +12,7 @@ num_mini_batch: 20 hidden_size: 1 -model_path: /rajkumarrrk/gpt2-fine-tuned-on-daily-dialog +model_path: rajkumarrrk/gpt2-fine-tuned-on-daily-dialog env: args: { 'tokenizer_path': 'gpt2', @@ -23,7 +23,7 @@ vec_info_class: reward_class: id: "NLPReward" args: { - "ref_model": "/rajkumarrrk/gpt2-fine-tuned-on-daily-dialog", - "intent_model": "/rajkumarrrk/roberta-daily-dialog-intent-classifier", + "ref_model": "rajkumarrrk/gpt2-fine-tuned-on-daily-dialog", + "intent_model": "rajkumarrrk/roberta-daily-dialog-intent-classifier", } \ No newline at end of file diff --git a/examples/nlp/nlp_ppo_ds.yaml b/examples/nlp/nlp_ppo_ds.yaml index 1f2ad7f8..88dac18c 100644 --- a/examples/nlp/nlp_ppo_ds.yaml +++ b/examples/nlp/nlp_ppo_ds.yaml @@ -17,7 +17,7 @@ use_fp16: false use_offload: false deepspeed_config: ds_config.json -model_path: 
/rajkumarrrk/gpt2-fine-tuned-on-daily-dialog/ +model_path: rajkumarrrk/gpt2-fine-tuned-on-daily-dialog env: args: { 'tokenizer_path': 'gpt2', @@ -30,8 +30,8 @@ reward_class: args: { "use_deepspeed": true, "ref_ds_config": "eval_ds_config.json", - "ref_model": /rajkumarrrk/gpt2-fine-tuned-on-daily-dialog/, + "ref_model": "rajkumarrrk/gpt2-fine-tuned-on-daily-dialog", "intent_ds_config": "eval_ds_config.json", - "intent_model": "/rajkumarrrk/roberta-daily-dialog-intent-classifier", + "intent_model": "rajkumarrrk/roberta-daily-dialog-intent-classifier", } \ No newline at end of file diff --git a/openrl/rewards/nlp_reward.py b/openrl/rewards/nlp_reward.py index 51c76fb3..bedfcc59 100644 --- a/openrl/rewards/nlp_reward.py +++ b/openrl/rewards/nlp_reward.py @@ -22,13 +22,17 @@ def __init__( self.rew_infos = [] self.env_infos = [] - meteor_config = { - "meteor_coeff": 0.5, - "test": ref_model == "builtin_ref", - } - self.inner_rew_funcs = { - "meteor": Meteor(**meteor_config), - } + # bug unfixed + self.inner_rew_funcs = dict() + + # meteor_config = { + # "meteor_coeff": 0.5, + # "test": ref_model == "builtin_ref", + # } + # self.inner_rew_funcs = { + # "meteor": Meteor(**meteor_config), + # } + kl_config = { "action_space": env.action_space, From 3af758829a1c17ff17e5e05df29df4eb3e11e251 Mon Sep 17 00:00:00 2001 From: Wen-Tse Chen Date: Tue, 19 Dec 2023 22:08:13 -0500 Subject: [PATCH 71/78] update format --- examples/nlp/train_ppo.py | 2 +- openrl/envs/nlp/daily_dialog_env.py | 29 ++++----- openrl/envs/nlp/fake_dialog_env.py | 22 +++---- openrl/envs/nlp/rewards/intent.py | 2 +- openrl/envs/nlp/rewards/kl_penalty.py | 23 +++---- openrl/envs/nlp/utils/metrics/meteor.py | 24 +++---- .../envs/vec_env/wrappers/reward_wrapper.py | 1 + openrl/modules/networks/policy_network_gpt.py | 64 ++++++++++--------- openrl/modules/networks/value_network_gpt.py | 33 ++++++---- openrl/rewards/nlp_reward.py | 3 +- 10 files changed, 100 insertions(+), 103 deletions(-) diff --git a/examples/nlp/train_ppo.py b/examples/nlp/train_ppo.py index 18347a6b..4fefcf52 100644 --- a/examples/nlp/train_ppo.py +++ b/examples/nlp/train_ppo.py @@ -3,8 +3,8 @@ from openrl.configs.config import create_config_parser from openrl.envs.common import make from openrl.modules.common import PPONet as Net -from openrl.modules.networks.value_network_gpt import ValueNetworkGPT as ValueNetwork from openrl.modules.networks.policy_network_gpt import PolicyNetworkGPT as PolicyNetwork +from openrl.modules.networks.value_network_gpt import ValueNetworkGPT as ValueNetwork from openrl.runners.common import PPOAgent as Agent diff --git a/openrl/envs/nlp/daily_dialog_env.py b/openrl/envs/nlp/daily_dialog_env.py index 98dd4d85..332db319 100644 --- a/openrl/envs/nlp/daily_dialog_env.py +++ b/openrl/envs/nlp/daily_dialog_env.py @@ -72,18 +72,16 @@ def __init__( # set the observation and action space here self._vocab_size = self.tokenizer.vocab_size - self.observation_space = DictSpace( - { - "input_encoded_pt": spaces.Box( - low=0, - high=self._vocab_size, - shape=(self._max_text_length + self.max_steps,), - ), - "input_attention_mask_pt": spaces.Box( - low=0, high=1, shape=(self._max_text_length + self.max_steps,) - ), - } - ) + self.observation_space = DictSpace({ + "input_encoded_pt": spaces.Box( + low=0, + high=self._vocab_size, + shape=(self._max_text_length + self.max_steps,), + ), + "input_attention_mask_pt": spaces.Box( + low=0, high=1, shape=(self._max_text_length + self.max_steps,) + ), + }) self.action_space = Discrete(n=self._vocab_size) # see 
https://github.com/huggingface/transformers/issues/4875 : rounding up to nearest power of 2 for better GPU efficiency @@ -116,8 +114,9 @@ def __init__( self.set_reward() def set_reward(self, reward_fn=None): - + from openrl.envs.nlp.rewards.meteor import Meteor + meteor_config = { "meteor_coeff": 0.5, "test": False, @@ -125,7 +124,7 @@ def set_reward(self, reward_fn=None): self.reward_function = { "meteor": Meteor(**meteor_config), } - + # self.reward_function = reward_fn def step_word(self, word: str) -> Tuple[Dict[str, torch.tensor], int, bool, dict]: @@ -147,7 +146,7 @@ def step( done = done or self.__current_obs.context_text.endswith(DailyDialog.EOU_TOKEN) reward = 0.0 - reward_info = dict() + reward_info = dict() if done and self.reward_function: for reward_function in self.reward_function.values(): diff --git a/openrl/envs/nlp/fake_dialog_env.py b/openrl/envs/nlp/fake_dialog_env.py index 02247bc0..27f9d8f4 100644 --- a/openrl/envs/nlp/fake_dialog_env.py +++ b/openrl/envs/nlp/fake_dialog_env.py @@ -30,18 +30,16 @@ def __init__( # set the observation and action space here self._vocab_size = 2 - self.observation_space = DictSpace( - { - "input_encoded_pt": spaces.Box( - low=0, - high=self._vocab_size, - shape=(self._max_text_length + self.max_steps,), - ), - "input_attention_mask_pt": spaces.Box( - low=0, high=1, shape=(self._max_text_length + self.max_steps,) - ), - } - ) + self.observation_space = DictSpace({ + "input_encoded_pt": spaces.Box( + low=0, + high=self._vocab_size, + shape=(self._max_text_length + self.max_steps,), + ), + "input_attention_mask_pt": spaces.Box( + low=0, high=1, shape=(self._max_text_length + self.max_steps,) + ), + }) self.action_space = Discrete(n=self._vocab_size) n = 2 diff --git a/openrl/envs/nlp/rewards/intent.py b/openrl/envs/nlp/rewards/intent.py index f0929932..0a0c4d3e 100644 --- a/openrl/envs/nlp/rewards/intent.py +++ b/openrl/envs/nlp/rewards/intent.py @@ -37,7 +37,7 @@ def __init__( self._intent_coeff = intent_coeff self.use_deepspeed = use_deepspeed self.use_half = False - self.use_data_parallel = not use_deepspeed # default to use data parallel + self.use_data_parallel = not use_deepspeed # default to use data parallel self.use_model_parallel = False if intent_model == "builtin_intent": diff --git a/openrl/envs/nlp/rewards/kl_penalty.py b/openrl/envs/nlp/rewards/kl_penalty.py index 406c9215..9516b788 100644 --- a/openrl/envs/nlp/rewards/kl_penalty.py +++ b/openrl/envs/nlp/rewards/kl_penalty.py @@ -35,7 +35,7 @@ def __init__( ds_config: str = "default", ): super().__init__() - + self.device = "cuda" self.use_deepspeed = use_deepspeed self.use_half = False @@ -116,7 +116,7 @@ def __call__( self._ref_net, input_ids, past_model_kwargs ) - if self.use_half: + if self.use_half: for key in ["input_ids", "position_ids", "attention_mask"]: if key in model_inputs: model_inputs[key] = model_inputs[key].int() @@ -125,7 +125,6 @@ def __call__( if key in model_inputs: model_inputs[key] = model_inputs[key].long() - with torch.no_grad(): output = self._ref_net(output_hidden_states=True, **model_inputs) output["past_key_values"] = None @@ -139,15 +138,13 @@ def __call__( ref_log_prob = ref_log_prob.reshape(action_log_probs.shape) kl_div = action_log_probs.copy() - ref_log_prob.detach().cpu().numpy() - rew = -self._alpha * kl_div + rew = -self._alpha * kl_div infos = [] for kl in kl_div: - infos.append( - { - "alpha": self._alpha, - "kl_div": kl.mean(), - } - ) + infos.append({ + "alpha": self._alpha, + "kl_div": kl.mean(), + }) return rew, infos def 
_prepare_inputs_for_model( @@ -173,11 +170,7 @@ def _prepare_inputs_for_model( } elif self.use_data_parallel: model_inputs = { - key: ( - value.to(self.device) - if isinstance(value, torch.Tensor) - else value - ) + key: value.to(self.device) if isinstance(value, torch.Tensor) else value for key, value in model_inputs.items() } elif self.use_deepspeed: diff --git a/openrl/envs/nlp/utils/metrics/meteor.py b/openrl/envs/nlp/utils/metrics/meteor.py index ab15e66d..c2345fa9 100644 --- a/openrl/envs/nlp/utils/metrics/meteor.py +++ b/openrl/envs/nlp/utils/metrics/meteor.py @@ -88,20 +88,16 @@ def _info(self): citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, features=[ - datasets.Features( - { - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Sequence( - datasets.Value("string", id="sequence"), id="references" - ), - } - ), - datasets.Features( - { - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Value("string", id="sequence"), - } - ), + datasets.Features({ + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Sequence( + datasets.Value("string", id="sequence"), id="references" + ), + }), + datasets.Features({ + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence"), + }), ], codebase_urls=[ "https://github.com/nltk/nltk/blob/develop/nltk/translate/meteor_score.py" diff --git a/openrl/envs/vec_env/wrappers/reward_wrapper.py b/openrl/envs/vec_env/wrappers/reward_wrapper.py index 2b5ca266..25cdc424 100644 --- a/openrl/envs/vec_env/wrappers/reward_wrapper.py +++ b/openrl/envs/vec_env/wrappers/reward_wrapper.py @@ -24,6 +24,7 @@ from openrl.envs.vec_env.wrappers.base_wrapper import VecEnvWrapper from openrl.rewards.base_reward import BaseReward + class RewardWrapper(VecEnvWrapper): def __init__(self, env: BaseVecEnv, reward_class: BaseReward): super().__init__(env) diff --git a/openrl/modules/networks/policy_network_gpt.py b/openrl/modules/networks/policy_network_gpt.py index 0cda244e..906f1fb5 100644 --- a/openrl/modules/networks/policy_network_gpt.py +++ b/openrl/modules/networks/policy_network_gpt.py @@ -15,13 +15,15 @@ # limitations under the License. 
"""""" -from typing import Any, Optional, Dict +from typing import Any, Dict, Optional import numpy as np import torch import torch.nn as nn +from transformers.modeling_utils import unwrap_model from openrl.buffers.utils.util import get_policy_obs, get_policy_obs_space +from openrl.envs.nlp.utils.distribution import CategoricalDistribution from openrl.modules.networks.base_policy_network import BasePolicyNetwork from openrl.modules.networks.utils.act import ACTLayer from openrl.modules.networks.utils.cnn import CNNBase @@ -31,9 +33,7 @@ from openrl.modules.networks.utils.rnn import RNNLayer from openrl.modules.networks.utils.util import init from openrl.utils.util import check_v2 as check -from openrl.envs.nlp.utils.distribution import CategoricalDistribution -from transformers.modeling_utils import unwrap_model class PolicyNetworkGPT(BasePolicyNetwork): def __init__( @@ -46,25 +46,26 @@ def __init__( disable_drop_out: bool = True, extra_args=None, ) -> None: - + self.device = device self.use_fp16 = cfg.use_fp16 self.use_deepspeed = cfg.use_deepspeed self.use_half = False - self.use_data_parallel = not cfg.use_deepspeed # default to use data parallel + self.use_data_parallel = not cfg.use_deepspeed # default to use data parallel self.use_model_parallel = False assert not (self.use_deepspeed and self.use_data_parallel) assert not (self.use_deepspeed and self.use_model_parallel) assert not (self.use_data_parallel and self.use_model_parallel) - + super(PolicyNetworkGPT, self).__init__(cfg, device) - + self.disable_drop_out = disable_drop_out - + self._action_dist = CategoricalDistribution(action_space.n) - + from transformers import AutoConfig, AutoModelForCausalLM + config = AutoConfig.from_pretrained(cfg.model_path) config_dict = config.to_dict() for key in config_dict: @@ -85,7 +86,6 @@ def __init__( self._policy_model = torch.nn.DataParallel(self._policy_model) self._policy_model = self._policy_model.to(self.device) - def forward(self, forward_type, *args, **kwargs): if forward_type == "original": return self.forward_original(*args, **kwargs) @@ -93,7 +93,7 @@ def forward(self, forward_type, *args, **kwargs): return self.eval_actions(*args, **kwargs) else: raise NotImplementedError - + def _prepare_inputs_for_model( self, model: Any, @@ -121,7 +121,11 @@ def forward_original( self, raw_obs, rnn_states, masks, action_masks=None, deterministic=False ): for key in raw_obs.keys(): - raw_obs[key] = torch.from_numpy(raw_obs[key]) if type(raw_obs[key]) == np.ndarray else raw_obs[key] + raw_obs[key] = ( + torch.from_numpy(raw_obs[key]) + if type(raw_obs[key]) == np.ndarray + else raw_obs[key] + ) rnn_states = check(rnn_states) if self.use_half: @@ -138,35 +142,37 @@ def forward_original( else: input_ids = input_ids.to(self._policy_model.device) attention_mask = attention_mask.to(self._policy_model.device) - + past_model_kwargs = None - + if past_model_kwargs is None: past_model_kwargs = { "attention_mask": attention_mask, } - + model_inputs = self._prepare_inputs_for_model( self._policy_model, input_ids, past_model_kwargs ) - + # forward pass to transformers output = self._policy_model(**model_inputs) - + # compute action probs - policy head - next_token_logits = output.logits[:, -1] + next_token_logits = output.logits[:, -1] dist = self._action_dist.proba_distribution(action_logits=next_token_logits) - + actions = dist.mode() if deterministic else dist.sample() action_log_probs = dist.log_prob(actions) - + return actions.unsqueeze(-1), action_log_probs.unsqueeze(-1), rnn_states def eval_actions( 
self, obs, rnn_states, action, masks, action_masks=None, active_masks=None ): for key in obs.keys(): - obs[key] = torch.from_numpy(obs[key]) if type(obs[key]) == np.ndarray else obs[key] + obs[key] = ( + torch.from_numpy(obs[key]) if type(obs[key]) == np.ndarray else obs[key] + ) if self.use_data_parallel: obs[key] = obs[key].to(self.device) else: @@ -176,32 +182,32 @@ def eval_actions( else: action = check(action).to(self._policy_model.device).squeeze() rnn_states = check(rnn_states) - + if self.half: input_ids = obs["input_encoded_pt"].int() attention_mask = obs["input_attention_mask_pt"].int() else: input_ids = obs["input_encoded_pt"].long() attention_mask = obs["input_attention_mask_pt"].long() - + past_model_kwargs = None - + if past_model_kwargs is None: past_model_kwargs = { "attention_mask": attention_mask, } - + model_inputs = self._prepare_inputs_for_model( self._policy_model, input_ids, past_model_kwargs ) - + # forward pass to transformers output = self._policy_model(**model_inputs) - + # compute action probs - policy head next_token_logits = output.logits[:, -1] dist = self._action_dist.proba_distribution(action_logits=next_token_logits) - + action_log_probs = dist.log_prob(action) dist_entropy = dist.entropy() values = None @@ -209,4 +215,4 @@ def eval_actions( return action_log_probs.unsqueeze(-1), dist_entropy.mean(), values def get_policy_values(self, obs, rnn_states, masks): - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/openrl/modules/networks/value_network_gpt.py b/openrl/modules/networks/value_network_gpt.py index b4ed9b1c..afffffc2 100644 --- a/openrl/modules/networks/value_network_gpt.py +++ b/openrl/modules/networks/value_network_gpt.py @@ -15,11 +15,12 @@ # limitations under the License. 
"""""" -from typing import Any, Optional, Dict +from typing import Any, Dict, Optional import numpy as np import torch import torch.nn as nn +from transformers.modeling_utils import unwrap_model from openrl.buffers.utils.util import get_critic_obs_space from openrl.modules.networks.base_value_network import BaseValueNetwork @@ -32,7 +33,6 @@ from openrl.modules.utils.valuenorm import ValueNorm from openrl.utils.util import check_v2 as check -from transformers.modeling_utils import unwrap_model class ValueNetworkGPT(BaseValueNetwork): def __init__( @@ -44,7 +44,7 @@ def __init__( device=torch.device("cpu"), extra_args=None, ): - + self.device = device self.use_fp16 = cfg.use_fp16 @@ -55,21 +55,23 @@ def __init__( assert not (self.use_deepspeed and self.use_data_parallel) assert not (self.use_deepspeed and self.use_model_parallel) assert not (self.use_data_parallel and self.use_model_parallel) - + super(ValueNetworkGPT, self).__init__(cfg, device) - + from transformers import AutoModelForCausalLM - + self._value_model = AutoModelForCausalLM.from_pretrained(cfg.model_path) self._value_model.config.use_cache = False self._value_head = nn.Linear( - self._value_model.config.n_embd, 1, bias=False # gpt2 + self._value_model.config.n_embd, + 1, + bias=False, # gpt2 # self._value_model.config.word_embed_proj_dim, 1, bias=False # opt-x ) self.value_normalizer = ( ValueNorm(1, device=device) if self._use_valuenorm else None ) - + if self.use_deepspeed: self._value_head.to(self.device) else: @@ -84,7 +86,6 @@ def __init__( self._value_head = torch.nn.DataParallel(self._value_head) self._value_head = self._value_head.to(self.device) - def _prepare_inputs_for_model( self, model: Any, @@ -105,19 +106,23 @@ def _prepare_inputs_for_model( ) for key, value in model_inputs.items() } - + return model_inputs def forward(self, critic_obs, rnn_states, masks): for key in critic_obs.keys(): - critic_obs[key] = torch.from_numpy(critic_obs[key]) if type(critic_obs[key]) == np.ndarray else critic_obs[key] + critic_obs[key] = ( + torch.from_numpy(critic_obs[key]) + if type(critic_obs[key]) == np.ndarray + else critic_obs[key] + ) if self.use_data_parallel: critic_obs[key] = critic_obs[key].to(self.device) else: critic_obs[key] = critic_obs[key].to(self._value_model.device) - + rnn_states = check(rnn_states) - + if self.use_half: input_ids = critic_obs["input_encoded_pt"].int() attention_mask = critic_obs["input_attention_mask_pt"].int() @@ -130,7 +135,7 @@ def forward(self, critic_obs, rnn_states, masks): past_model_kwargs = { "attention_mask": attention_mask, } - + model_inputs = self._prepare_inputs_for_model( self._value_model, input_ids, past_model_kwargs ) diff --git a/openrl/rewards/nlp_reward.py b/openrl/rewards/nlp_reward.py index bedfcc59..38cd306a 100644 --- a/openrl/rewards/nlp_reward.py +++ b/openrl/rewards/nlp_reward.py @@ -24,7 +24,7 @@ def __init__( # bug unfixed self.inner_rew_funcs = dict() - + # meteor_config = { # "meteor_coeff": 0.5, # "test": ref_model == "builtin_ref", @@ -32,7 +32,6 @@ def __init__( # self.inner_rew_funcs = { # "meteor": Meteor(**meteor_config), # } - kl_config = { "action_space": env.action_space, From f8879b3ec171b17d16bed8a72b6fe80f4690a0cc Mon Sep 17 00:00:00 2001 From: Wen-Tse Chen Date: Wed, 20 Dec 2023 00:22:57 -0500 Subject: [PATCH 72/78] fix test w/o gpu bug --- openrl/envs/nlp/rewards/intent.py | 9 ++++++--- openrl/envs/nlp/rewards/kl_penalty.py | 9 +++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/openrl/envs/nlp/rewards/intent.py 
b/openrl/envs/nlp/rewards/intent.py index 0a0c4d3e..2c82e96f 100644 --- a/openrl/envs/nlp/rewards/intent.py +++ b/openrl/envs/nlp/rewards/intent.py @@ -41,6 +41,10 @@ def __init__( self.use_model_parallel = False if intent_model == "builtin_intent": + + self._device = "cpu" + self.use_data_parallel = False + from transformers import GPT2Config, GPT2LMHeadModel class TestTokenizer: @@ -66,6 +70,7 @@ def __init__(self, input_ids, attention_mask): self._model = GPT2LMHeadModel(config) else: + self._device = "cuda" model_path = data_abs_path(intent_model) self._tokenizer = AutoTokenizer.from_pretrained(intent_model) self._model = AutoModelForSequenceClassification.from_pretrained(model_path) @@ -81,12 +86,10 @@ def __init__(self, input_ids, attention_mask): with open(ds_config) as file: ds_config = json.load(file) - self._device = "cuda" - self._model = self._model.to("cuda") + self._model = self._model.to(self._device) self._model, *_ = deepspeed.initialize(model=self._model, config=ds_config) self.use_fp16 = ds_config["fp16"]["enabled"] else: - self._device = "cuda" if self.use_model_parallel: self._model.parallelize() elif self.use_data_parallel: diff --git a/openrl/envs/nlp/rewards/kl_penalty.py b/openrl/envs/nlp/rewards/kl_penalty.py index 9516b788..3cfafd4b 100644 --- a/openrl/envs/nlp/rewards/kl_penalty.py +++ b/openrl/envs/nlp/rewards/kl_penalty.py @@ -47,6 +47,10 @@ def __init__( # reference model if ref_model == "builtin_ref": + + self.device = "cpu" + self.use_data_parallel = False + from transformers import GPT2Config, GPT2LMHeadModel config = GPT2Config() @@ -77,8 +81,9 @@ def __init__( elif self.use_data_parallel: # else defaults to data parallel if self.use_half: self._ref_net = self._ref_net.half() - self._ref_net = torch.nn.DataParallel(self._ref_net) - self._ref_net = self._ref_net.to(self.device) + else: + self._ref_net = torch.nn.DataParallel(self._ref_net) + self._ref_net = self._ref_net.to(self.device) # alpha adjustment self._alpha = 0.2 From fc020301258336a13d9d2a20d14477e38f485879 Mon Sep 17 00:00:00 2001 From: Wen-Tse Chen Date: Wed, 20 Dec 2023 00:35:36 -0500 Subject: [PATCH 73/78] fix set reward bug --- openrl/envs/nlp/daily_dialog_env.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/openrl/envs/nlp/daily_dialog_env.py b/openrl/envs/nlp/daily_dialog_env.py index 332db319..2aa08684 100644 --- a/openrl/envs/nlp/daily_dialog_env.py +++ b/openrl/envs/nlp/daily_dialog_env.py @@ -111,21 +111,9 @@ def __init__( self.__time_step = None self.reward_function = None - self.set_reward() - def set_reward(self, reward_fn=None): - from openrl.envs.nlp.rewards.meteor import Meteor - - meteor_config = { - "meteor_coeff": 0.5, - "test": False, - } - self.reward_function = { - "meteor": Meteor(**meteor_config), - } - - # self.reward_function = reward_fn + self.reward_function = reward_fn def step_word(self, word: str) -> Tuple[Dict[str, torch.tensor], int, bool, dict]: action = self.tokenizer.encode(word)[1] From 753fba760929fb5502eef105c59d19113e88d8b1 Mon Sep 17 00:00:00 2001 From: Geo Jolly Date: Wed, 20 Dec 2023 11:47:42 +0530 Subject: [PATCH 74/78] Move envpool to examples --- examples/envpool/README.md | 20 +++ .../envpool}/envpool_wrappers.py | 0 examples/envpool/make_env.py | 128 ++++++++++++++++++ examples/envpool/train_ppo.py | 17 ++- openrl/envs/common/registration.py | 13 -- openrl/envs/envpool/__init__.py | 47 ------- setup.py | 2 - 7 files changed, 160 insertions(+), 67 deletions(-) create mode 100644 examples/envpool/README.md rename 
{openrl/envs/wrappers => examples/envpool}/envpool_wrappers.py (100%) create mode 100644 examples/envpool/make_env.py delete mode 100644 openrl/envs/envpool/__init__.py diff --git a/examples/envpool/README.md b/examples/envpool/README.md new file mode 100644 index 00000000..e9a16389 --- /dev/null +++ b/examples/envpool/README.md @@ -0,0 +1,20 @@ +## Installation + + +Install envpool with: + +``` shell +pip install envpool +``` + +Note 1: envpool only supports Linux operating system. + +## Usage + +You can use `OpenRL` to train Cartpole (envpool) via: + +``` shell +PYTHON_PATH train_ppo.py +``` + +You can also add custom wrappers in `envpool_wrapper.py`. Currently we have `VecAdapter` and `VecMonitor` wrappers. \ No newline at end of file diff --git a/openrl/envs/wrappers/envpool_wrappers.py b/examples/envpool/envpool_wrappers.py similarity index 100% rename from openrl/envs/wrappers/envpool_wrappers.py rename to examples/envpool/envpool_wrappers.py diff --git a/examples/envpool/make_env.py b/examples/envpool/make_env.py new file mode 100644 index 00000000..92c1b51a --- /dev/null +++ b/examples/envpool/make_env.py @@ -0,0 +1,128 @@ +import copy +import inspect +from typing import Callable, Iterable, List, Optional, Union + +import envpool +from gymnasium import Env + + +from openrl.envs.vec_env import (AsyncVectorEnv, RewardWrapper, + SyncVectorEnv, VecMonitorWrapper) +from openrl.envs.vec_env.vec_info import VecInfoFactory +from openrl.envs.wrappers.base_wrapper import BaseWrapper +from openrl.rewards import RewardFactory + + +def build_envs( + make, + id: str, + env_num: int = 1, + wrappers: Optional[Union[Callable[[Env], Env], List[Callable[[Env], Env]]]] = None, + need_env_id: bool = False, + **kwargs, +) -> List[Callable[[], Env]]: + cfg = kwargs.get("cfg", None) + + def create_env(env_id: int, env_num: int, need_env_id: bool) -> Callable[[], Env]: + def _make_env() -> Env: + new_kwargs = copy.deepcopy(kwargs) + if need_env_id: + new_kwargs["env_id"] = env_id + new_kwargs["env_num"] = env_num + if "envpool" in new_kwargs: + # for now envpool doesnt support any render mode + # envpool also doesnt stores the id anywhere + new_kwargs.pop("envpool") + env = make( + id, + **new_kwargs, + ) + env.unwrapped.spec.id = id + + if wrappers is not None: + if callable(wrappers): + if issubclass(wrappers, BaseWrapper): + env = wrappers(env, cfg=cfg) + else: + env = wrappers(env) + elif isinstance(wrappers, Iterable) and all( + [callable(w) for w in wrappers] + ): + for wrapper in wrappers: + if ( + issubclass(wrapper, BaseWrapper) + and "cfg" in inspect.signature(wrapper.__init__).parameters + ): + env = wrapper(env, cfg=cfg) + else: + env = wrapper(env) + else: + raise NotImplementedError + + return env + + return _make_env + + env_fns = [create_env(env_id, env_num, need_env_id) for env_id in range(env_num)] + return env_fns + + +def make_envpool_envs( + id: str, + env_num: int = 1, + **kwargs, +): + assert "env_type" in kwargs + assert kwargs.get("env_type") in ["gym", "dm", "gymnasium"] + kwargs["envpool"] = True + + if 'env_wrappers' in kwargs: + env_wrappers = kwargs.pop("env_wrappers") + else: + env_wrappers = [] + env_fns = build_envs( + make=envpool.make, + id=id, + env_num=env_num, + wrappers=env_wrappers, + **kwargs, + ) + return env_fns + + +def make( + id: str, + env_num: int = 1, + asynchronous: bool = False, + add_monitor: bool = True, + render_mode: Optional[str] = None, + auto_reset: bool = True, + **kwargs, +): + cfg = kwargs.get("cfg", None) + if id in 
envpool.registration.list_all_envs(): + env_fns = make_envpool_envs( + id=id.split(":")[-1], + env_num=env_num, + **kwargs, + ) + if asynchronous: + env = AsyncVectorEnv( + env_fns, render_mode=render_mode, auto_reset=auto_reset + ) + else: + env = SyncVectorEnv(env_fns, render_mode=render_mode, auto_reset=auto_reset) + + reward_class = cfg.reward_class if cfg else None + reward_class = RewardFactory.get_reward_class(reward_class, env) + + env = RewardWrapper(env, reward_class) + + if add_monitor: + vec_info_class = cfg.vec_info_class if cfg else None + vec_info_class = VecInfoFactory.get_vec_info_class(vec_info_class, env) + env = VecMonitorWrapper(vec_info_class, env) + + return env + else: + raise NotImplementedError(f"env {id} is not supported") diff --git a/examples/envpool/train_ppo.py b/examples/envpool/train_ppo.py index 49eb4456..a02151f7 100644 --- a/examples/envpool/train_ppo.py +++ b/examples/envpool/train_ppo.py @@ -18,8 +18,8 @@ import numpy as np from openrl.configs.config import create_config_parser -from openrl.envs.common import make -from openrl.envs.wrappers.envpool_wrappers import VecAdapter, VecMonitor +from make_env import make +from examples.envpool.envpool_wrappers import VecAdapter, VecMonitor from openrl.modules.common import PPONet as Net from openrl.modules.common.ppo_net import PPONet as Net from openrl.runners.common import PPOAgent as Agent @@ -32,7 +32,7 @@ def train(): # create environment, set environment parallelism to 9 env = make( - "envpool:CartPole-v1", + "CartPole-v1", render_mode=None, env_num=9, asynchronous=False, @@ -45,7 +45,7 @@ def train(): cfg=cfg, ) # initialize the trainer - agent = Agent(net, use_wandb=False, project_name="envpool:CartPole-v1") + agent = Agent(net, use_wandb=False, project_name="CartPole-v1") # start training, set total number of training steps to 20000 agent.train(total_time_steps=20000) @@ -58,7 +58,14 @@ def evaluation(agent): # Create an environment for testing and set the number of environments to interact with to 9. Set rendering mode to group_human. render_mode = "group_human" render_mode = None - env = make("CartPole-v1", render_mode=render_mode, env_num=9, asynchronous=True) + env = make( + "CartPole-v1", + env_wrappers=[VecAdapter, VecMonitor], + render_mode=render_mode, + env_num=9, + asynchronous=True, + env_type="gym", + ) # The trained agent sets up the interactive environment it needs. agent.set_env(env) # Initialize the environment and get initial observations and environmental information. 
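The relocated `examples/envpool/make_env.py` above ultimately defers to `envpool.make`, building one single-environment factory per `env_id` and adapting the result with `VecAdapter`/`VecMonitor`. For readers new to envpool, here is a minimal standalone sketch of the underlying call, independent of OpenRL; it assumes a Linux host with `pip install envpool`, and `num_envs=4` is an arbitrary illustrative choice:

```python
# Standalone envpool usage of the kind make_env.py wraps in per-env closures.
# Assumes Linux (envpool ships Linux-only wheels); values are illustrative.
import envpool
import numpy as np

# env_type mirrors the "gym"/"dm"/"gymnasium" switch asserted in make_envpool_envs.
env = envpool.make("CartPole-v1", env_type="gymnasium", num_envs=4)

obs, info = env.reset()           # gymnasium-style reset: batched obs, shape (4, 4)
actions = np.zeros(4, dtype=int)  # one discrete action per sub-environment
obs, rew, terminated, truncated, info = env.step(actions)
print(obs.shape, rew.shape)       # (4, 4) (4,)
```

`build_envs` in the new module wraps exactly this call inside per-environment closures, so OpenRL's `SyncVectorEnv`/`AsyncVectorEnv` can consume the resulting `env_fns` list like any other environment factory list.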
diff --git a/openrl/envs/common/registration.py b/openrl/envs/common/registration.py index 053dd104..bb6c4462 100644 --- a/openrl/envs/common/registration.py +++ b/openrl/envs/common/registration.py @@ -17,7 +17,6 @@ """""" from typing import Callable, Optional -import envpool import gymnasium as gym import openrl @@ -155,18 +154,6 @@ def make( env_fns = make_PettingZoo_envs( id=id, env_num=env_num, render_mode=convert_render_mode, **kwargs ) - elif ( - "envpool:" in id - and id.split(":")[-1] in envpool.registration.list_all_envs() - ): - from openrl.envs.envpool import make_envpool_envs - - env_fns = make_envpool_envs( - id=id.split(":")[-1], - env_num=env_num, - render_mode=convert_render_mode, - **kwargs, - ) else: raise NotImplementedError(f"env {id} is not supported.") diff --git a/openrl/envs/envpool/__init__.py b/openrl/envs/envpool/__init__.py deleted file mode 100644 index 48fbd1f5..00000000 --- a/openrl/envs/envpool/__init__.py +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Copyright 2023 The OpenRL Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""""" -from typing import List, Optional, Union - -import envpool - -from openrl.envs.common import build_envs - - -def make_envpool_envs( - id: str, - env_num: int = 1, - render_mode: Optional[Union[str, List[str]]] = None, - **kwargs, -): - assert "env_type" in kwargs - assert kwargs.get("env_type") in ["gym", "dm", "gymnasium"] - # Since render_mode is not supported, we set envpool to True - # so that we can remove render_mode keyword argument from build_envs - assert render_mode is None, "envpool does not support render_mode yet" - kwargs["envpool"] = True - - env_wrappers = kwargs.pop("env_wrappers") - env_fns = build_envs( - make=envpool.make, - id=id, - env_num=env_num, - render_mode=render_mode, - wrappers=env_wrappers, - **kwargs, - ) - return env_fns diff --git a/setup.py b/setup.py index faffbe84..28cffd3c 100644 --- a/setup.py +++ b/setup.py @@ -76,7 +76,6 @@ def get_extra_requires() -> dict: "async_timeout", "pettingzoo[classic]", "trueskill", - "envpool", ], "selfplay_test": [ "ray[default]>=2.7", @@ -85,7 +84,6 @@ def get_extra_requires() -> dict: "fastapi", "pettingzoo[mpe]", "pettingzoo[butterfly]", - "envpool", ], "retro": ["gym-retro"], "super_mario": ["gym-super-mario-bros"], From 693d2e1c4d46ef636d0a8d3f4962378e9fa95da0 Mon Sep 17 00:00:00 2001 From: Geo Jolly Date: Wed, 20 Dec 2023 11:51:07 +0530 Subject: [PATCH 75/78] Revert files in openrl folder --- openrl/envs/common/build_envs.py | 25 ++++++++----------------- openrl/envs/common/registration.py | 3 ++- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/openrl/envs/common/build_envs.py b/openrl/envs/common/build_envs.py index 37c17c01..a0c59c6f 100644 --- a/openrl/envs/common/build_envs.py +++ b/openrl/envs/common/build_envs.py @@ -36,22 +36,13 @@ def _make_env() -> Env: new_kwargs["env_num"] = env_num if id.startswith("ALE/") or id in gym.envs.registry.keys(): new_kwargs.pop("cfg", None) - if "envpool" 
in new_kwargs: - # for now envpool doesnt support any render mode - # envpool also doesnt stores the id anywhere - new_kwargs.pop("envpool") - env = make( - id, - **new_kwargs, - ) - env.unwrapped.spec.id = id - else: - env = make( - id, - render_mode=env_render_mode, - disable_env_checker=_disable_env_checker, - **new_kwargs, - ) + + env = make( + id, + render_mode=env_render_mode, + disable_env_checker=_disable_env_checker, + **new_kwargs, + ) if wrappers is not None: if callable(wrappers): @@ -78,4 +69,4 @@ def _make_env() -> Env: return _make_env env_fns = [create_env(env_id, env_num, need_env_id) for env_id in range(env_num)] - return env_fns + return env_fns \ No newline at end of file diff --git a/openrl/envs/common/registration.py b/openrl/envs/common/registration.py index bb6c4462..1ee9b532 100644 --- a/openrl/envs/common/registration.py +++ b/openrl/envs/common/registration.py @@ -72,6 +72,7 @@ def make( env_fns = make_single_agent_drone_envs( id=id, env_num=env_num, render_mode=convert_render_mode, **kwargs ) + elif id.startswith("snakes_"): from openrl.envs.snake import make_snake_envs @@ -172,4 +173,4 @@ def make( vec_info_class = VecInfoFactory.get_vec_info_class(vec_info_class, env) env = VecMonitorWrapper(vec_info_class, env) - return env + return env \ No newline at end of file From 5346eee2a74a3f7adc7d1881991eb9bc08e131a9 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Wed, 20 Dec 2023 15:01:16 +0800 Subject: [PATCH 76/78] init v0.2.0 --- README.md | 2 +- openrl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2b175f86..b58c3ea6 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ [![Embark](https://img.shields.io/badge/discord-OpenRL-%237289da.svg?logo=discord)](https://discord.gg/qMbVT2qBhr) [![slack badge](https://img.shields.io/badge/Slack-join-blueviolet?logo=slack&)](https://join.slack.com/t/openrlhq/shared_invite/zt-1tqwpvthd-Eeh0IxQ~DIaGqYXoW2IUQg) -OpenRL-v0.1.10 is updated on Oct 27, 2023 +OpenRL-v0.2.0 is updated on Dec 20, 2023 The main branch is the latest version of OpenRL, which is under active development. If you just want to have a try with OpenRL, you can switch to the stable branch. 
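Since the release line in the README hunk above can drift from the package metadata, the installed version is easiest to confirm directly from the attributes that `openrl/__init__.py` exports (bumped in the next hunk). A minimal check, assuming OpenRL from this branch is installed:

```python
# Read back the packaged metadata so the README's "OpenRL-vX.Y.Z is updated on ..."
# line can be cross-checked against what is actually installed.
import openrl

print(openrl.__TITLE__)    # "openrl"
print(openrl.__VERSION__)  # expected to print "v0.2.0" once this patch is applied
```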
diff --git a/openrl/__init__.py b/openrl/__init__.py index 53ded95e..2ea67943 100644 --- a/openrl/__init__.py +++ b/openrl/__init__.py @@ -1,5 +1,5 @@ __TITLE__ = "openrl" -__VERSION__ = "v0.1.10" +__VERSION__ = "v0.2.0" __DESCRIPTION__ = "Distributed Deep RL Framework" __AUTHOR__ = "OpenRL Contributors" __EMAIL__ = "huangshiyu@4paradigm.com" From 2b798c08e473db58836b1c74c75f59f074c2fd50 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Wed, 20 Dec 2023 15:01:36 +0800 Subject: [PATCH 77/78] init v0.2.0 --- examples/envpool/envpool_wrappers.py | 9 ++++--- examples/envpool/make_env.py | 11 +++++---- examples/envpool/train_ppo.py | 4 ++-- openrl/envs/common/build_envs.py | 2 +- openrl/envs/common/registration.py | 2 +- openrl/envs/nlp/daily_dialog_env.py | 23 +++++++++--------- openrl/envs/nlp/fake_dialog_env.py | 22 +++++++++-------- openrl/envs/nlp/rewards/intent.py | 5 ++-- openrl/envs/nlp/rewards/kl_penalty.py | 15 ++++++------ openrl/envs/nlp/utils/metrics/meteor.py | 24 +++++++++++-------- openrl/modules/networks/policy_network_gpt.py | 1 - openrl/modules/networks/value_network_gpt.py | 1 - openrl/modules/utils/valuenorm.py | 12 +++++++--- 13 files changed, 72 insertions(+), 59 deletions(-) diff --git a/examples/envpool/envpool_wrappers.py b/examples/envpool/envpool_wrappers.py index d0da090a..bf975166 100644 --- a/examples/envpool/envpool_wrappers.py +++ b/examples/envpool/envpool_wrappers.py @@ -9,8 +9,7 @@ from packaging import version from stable_baselines3.common.vec_env import VecEnvWrapper as BaseWrapper from stable_baselines3.common.vec_env import VecMonitor -from stable_baselines3.common.vec_env.base_vec_env import (VecEnvObs, - VecEnvStepReturn) +from stable_baselines3.common.vec_env.base_vec_env import VecEnvObs, VecEnvStepReturn is_legacy_gym = version.parse(gym.__version__) < version.parse("0.26.0") @@ -114,9 +113,9 @@ def __init__( if is_wrapped_with_monitor: warnings.warn( - "The environment is already wrapped with a `Monitor` wrapper" - "but you are wrapping it with a `VecMonitor` wrapper, the `Monitor` statistics will be" - "overwritten by the `VecMonitor` ones.", + "The environment is already wrapped with a `Monitor` wrapperbut you are" + " wrapping it with a `VecMonitor` wrapper, the `Monitor` statistics" + " will beoverwritten by the `VecMonitor` ones.", UserWarning, ) diff --git a/examples/envpool/make_env.py b/examples/envpool/make_env.py index 92c1b51a..669ca67a 100644 --- a/examples/envpool/make_env.py +++ b/examples/envpool/make_env.py @@ -5,9 +5,12 @@ import envpool from gymnasium import Env - -from openrl.envs.vec_env import (AsyncVectorEnv, RewardWrapper, - SyncVectorEnv, VecMonitorWrapper) +from openrl.envs.vec_env import ( + AsyncVectorEnv, + RewardWrapper, + SyncVectorEnv, + VecMonitorWrapper, +) from openrl.envs.vec_env.vec_info import VecInfoFactory from openrl.envs.wrappers.base_wrapper import BaseWrapper from openrl.rewards import RewardFactory @@ -76,7 +79,7 @@ def make_envpool_envs( assert kwargs.get("env_type") in ["gym", "dm", "gymnasium"] kwargs["envpool"] = True - if 'env_wrappers' in kwargs: + if "env_wrappers" in kwargs: env_wrappers = kwargs.pop("env_wrappers") else: env_wrappers = [] diff --git a/examples/envpool/train_ppo.py b/examples/envpool/train_ppo.py index a02151f7..b6550b96 100644 --- a/examples/envpool/train_ppo.py +++ b/examples/envpool/train_ppo.py @@ -16,10 +16,10 @@ """""" import numpy as np - -from openrl.configs.config import create_config_parser from make_env import make + from examples.envpool.envpool_wrappers import 
VecAdapter, VecMonitor +from openrl.configs.config import create_config_parser from openrl.modules.common import PPONet as Net from openrl.modules.common.ppo_net import PPONet as Net from openrl.runners.common import PPOAgent as Agent diff --git a/openrl/envs/common/build_envs.py b/openrl/envs/common/build_envs.py index a0c59c6f..76f4b35b 100644 --- a/openrl/envs/common/build_envs.py +++ b/openrl/envs/common/build_envs.py @@ -69,4 +69,4 @@ def _make_env() -> Env: return _make_env env_fns = [create_env(env_id, env_num, need_env_id) for env_id in range(env_num)] - return env_fns \ No newline at end of file + return env_fns diff --git a/openrl/envs/common/registration.py b/openrl/envs/common/registration.py index 1ee9b532..5d1ed645 100644 --- a/openrl/envs/common/registration.py +++ b/openrl/envs/common/registration.py @@ -173,4 +173,4 @@ def make( vec_info_class = VecInfoFactory.get_vec_info_class(vec_info_class, env) env = VecMonitorWrapper(vec_info_class, env) - return env \ No newline at end of file + return env diff --git a/openrl/envs/nlp/daily_dialog_env.py b/openrl/envs/nlp/daily_dialog_env.py index 2aa08684..d197a232 100644 --- a/openrl/envs/nlp/daily_dialog_env.py +++ b/openrl/envs/nlp/daily_dialog_env.py @@ -72,16 +72,18 @@ def __init__( # set the observation and action space here self._vocab_size = self.tokenizer.vocab_size - self.observation_space = DictSpace({ - "input_encoded_pt": spaces.Box( - low=0, - high=self._vocab_size, - shape=(self._max_text_length + self.max_steps,), - ), - "input_attention_mask_pt": spaces.Box( - low=0, high=1, shape=(self._max_text_length + self.max_steps,) - ), - }) + self.observation_space = DictSpace( + { + "input_encoded_pt": spaces.Box( + low=0, + high=self._vocab_size, + shape=(self._max_text_length + self.max_steps,), + ), + "input_attention_mask_pt": spaces.Box( + low=0, high=1, shape=(self._max_text_length + self.max_steps,) + ), + } + ) self.action_space = Discrete(n=self._vocab_size) # see https://github.com/huggingface/transformers/issues/4875 : rounding up to nearest power of 2 for better GPU efficiency @@ -112,7 +114,6 @@ def __init__( self.reward_function = None def set_reward(self, reward_fn=None): - self.reward_function = reward_fn def step_word(self, word: str) -> Tuple[Dict[str, torch.tensor], int, bool, dict]: diff --git a/openrl/envs/nlp/fake_dialog_env.py b/openrl/envs/nlp/fake_dialog_env.py index 27f9d8f4..02247bc0 100644 --- a/openrl/envs/nlp/fake_dialog_env.py +++ b/openrl/envs/nlp/fake_dialog_env.py @@ -30,16 +30,18 @@ def __init__( # set the observation and action space here self._vocab_size = 2 - self.observation_space = DictSpace({ - "input_encoded_pt": spaces.Box( - low=0, - high=self._vocab_size, - shape=(self._max_text_length + self.max_steps,), - ), - "input_attention_mask_pt": spaces.Box( - low=0, high=1, shape=(self._max_text_length + self.max_steps,) - ), - }) + self.observation_space = DictSpace( + { + "input_encoded_pt": spaces.Box( + low=0, + high=self._vocab_size, + shape=(self._max_text_length + self.max_steps,), + ), + "input_attention_mask_pt": spaces.Box( + low=0, high=1, shape=(self._max_text_length + self.max_steps,) + ), + } + ) self.action_space = Discrete(n=self._vocab_size) n = 2 diff --git a/openrl/envs/nlp/rewards/intent.py b/openrl/envs/nlp/rewards/intent.py index 2c82e96f..bc4da36c 100644 --- a/openrl/envs/nlp/rewards/intent.py +++ b/openrl/envs/nlp/rewards/intent.py @@ -41,10 +41,9 @@ def __init__( self.use_model_parallel = False if intent_model == "builtin_intent": - self._device = "cpu" - 
self.use_data_parallel = False - + self.use_data_parallel = False + from transformers import GPT2Config, GPT2LMHeadModel class TestTokenizer: diff --git a/openrl/envs/nlp/rewards/kl_penalty.py b/openrl/envs/nlp/rewards/kl_penalty.py index 3cfafd4b..c98c6bfb 100644 --- a/openrl/envs/nlp/rewards/kl_penalty.py +++ b/openrl/envs/nlp/rewards/kl_penalty.py @@ -47,10 +47,9 @@ def __init__( # reference model if ref_model == "builtin_ref": - self.device = "cpu" - self.use_data_parallel = False - + self.use_data_parallel = False + from transformers import GPT2Config, GPT2LMHeadModel config = GPT2Config() @@ -146,10 +145,12 @@ def __call__( rew = -self._alpha * kl_div infos = [] for kl in kl_div: - infos.append({ - "alpha": self._alpha, - "kl_div": kl.mean(), - }) + infos.append( + { + "alpha": self._alpha, + "kl_div": kl.mean(), + } + ) return rew, infos def _prepare_inputs_for_model( diff --git a/openrl/envs/nlp/utils/metrics/meteor.py b/openrl/envs/nlp/utils/metrics/meteor.py index c2345fa9..ab15e66d 100644 --- a/openrl/envs/nlp/utils/metrics/meteor.py +++ b/openrl/envs/nlp/utils/metrics/meteor.py @@ -88,16 +88,20 @@ def _info(self): citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, features=[ - datasets.Features({ - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Sequence( - datasets.Value("string", id="sequence"), id="references" - ), - }), - datasets.Features({ - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Value("string", id="sequence"), - }), + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Sequence( + datasets.Value("string", id="sequence"), id="references" + ), + } + ), + datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence"), + } + ), ], codebase_urls=[ "https://github.com/nltk/nltk/blob/develop/nltk/translate/meteor_score.py" diff --git a/openrl/modules/networks/policy_network_gpt.py b/openrl/modules/networks/policy_network_gpt.py index 906f1fb5..193094a7 100644 --- a/openrl/modules/networks/policy_network_gpt.py +++ b/openrl/modules/networks/policy_network_gpt.py @@ -46,7 +46,6 @@ def __init__( disable_drop_out: bool = True, extra_args=None, ) -> None: - self.device = device self.use_fp16 = cfg.use_fp16 self.use_deepspeed = cfg.use_deepspeed diff --git a/openrl/modules/networks/value_network_gpt.py b/openrl/modules/networks/value_network_gpt.py index afffffc2..0c5b1154 100644 --- a/openrl/modules/networks/value_network_gpt.py +++ b/openrl/modules/networks/value_network_gpt.py @@ -44,7 +44,6 @@ def __init__( device=torch.device("cpu"), extra_args=None, ): - self.device = device self.use_fp16 = cfg.use_fp16 diff --git a/openrl/modules/utils/valuenorm.py b/openrl/modules/utils/valuenorm.py index 0367084a..43aaad9c 100644 --- a/openrl/modules/utils/valuenorm.py +++ b/openrl/modules/utils/valuenorm.py @@ -24,9 +24,15 @@ def __init__( self.per_element_update = per_element_update self.tpdv = dict(dtype=torch.float32, device=device) - self.running_mean = nn.Parameter(torch.zeros(input_shape), requires_grad=False).to(**self.tpdv) - self.running_mean_sq = nn.Parameter(torch.zeros(input_shape), requires_grad=False).to(**self.tpdv) - self.debiasing_term = nn.Parameter(torch.tensor(0.0), requires_grad=False).to(**self.tpdv) + self.running_mean = nn.Parameter( + torch.zeros(input_shape), requires_grad=False + ).to(**self.tpdv) + self.running_mean_sq = nn.Parameter( + 
torch.zeros(input_shape), requires_grad=False + ).to(**self.tpdv) + self.debiasing_term = nn.Parameter(torch.tensor(0.0), requires_grad=False).to( + **self.tpdv + ) # self.running_mean = nn.Parameter(torch.zeros(input_shape), requires_grad=False) # self.running_mean_sq = nn.Parameter( From 5b4dae2cc072923ccb5e2677da58c294068dd1d4 Mon Sep 17 00:00:00 2001 From: huangshiyu Date: Wed, 20 Dec 2023 21:03:51 +0800 Subject: [PATCH 78/78] update readme --- README.md | 28 +++++++++++++++------------- README_zh.md | 11 ++++++----- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index b58c3ea6..c76e3691 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,8 @@ Currently, the features supported by OpenRL include: - Reinforcement learning training support for natural language tasks (such as dialogue) +- Support [DeepSpeed](https://github.com/microsoft/DeepSpeed) + - Support [Arena](https://openrl-docs.readthedocs.io/en/latest/arena/index.html) , which allows convenient evaluation of various agents (even submissions for [JiDi](https://openrl-docs.readthedocs.io/en/latest/arena/index.html#performing-local-evaluation-of-agents-submitted-to-the-jidi-platform-using-openrl)) in a competitive environment. @@ -160,19 +162,19 @@ Here we provide a table for the comparison of OpenRL and existing popular RL lib OpenRL employs a modular design and high-level abstraction, allowing users to accomplish training for various tasks through a unified and user-friendly interface. -| Library | NLP/RLHF | Multi-agent | Self-Play Training | Offline RL | Bilingual Document | -|:------------------------------------------------------------------:|:------------------:|:--------------------:|:--------------------:|:------------------:|:------------------:| -| **[OpenRL](https://github.com/OpenRL-Lab/openrl)** | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | -| [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3) | :x: | :x: | :x: | :x: | :x: | -| [Ray/RLlib](https://github.com/ray-project/ray/tree/master/rllib/) | :x: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: | -| [DI-engine](https://github.com/opendilab/DI-engine/) | :x: | :heavy_check_mark: | not fullly supported | :heavy_check_mark: | :heavy_check_mark: | -| [Tianshou](https://github.com/thu-ml/tianshou) | :x: | not fullly supported | not fullly supported | :heavy_check_mark: | :heavy_check_mark: | -| [MARLlib](https://github.com/Replicable-MARL/MARLlib) | :x: | :heavy_check_mark: | not fullly supported | :x: | :x: | -| [MAPPO Benchmark](https://github.com/marlbenchmark/on-policy) | :x: | :heavy_check_mark: | :x: | :x: | :x: | -| [RL4LMs](https://github.com/allenai/RL4LMs) | :heavy_check_mark: | :x: | :x: | :x: | :x: | -| [trlx](https://github.com/CarperAI/trlx) | :heavy_check_mark: | :x: | :x: | :x: | :x: | -| [trl](https://github.com/huggingface/trl) | :heavy_check_mark: | :x: | :x: | :x: | :x: | -| [TimeChamber](https://github.com/inspirai/TimeChamber) | :x: | :x: | :heavy_check_mark: | :x: | :x: | +| Library | NLP/RLHF | Multi-agent | Self-Play Training | Offline RL | [DeepSpeed](https://github.com/microsoft/DeepSpeed) | +|:------------------------------------------------------------------:|:------------------:|:--------------------:|:--------------------:|:------------------:|:--------------------:| +| **[OpenRL](https://github.com/OpenRL-Lab/openrl)** | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | 
:heavy_check_mark: |
+| [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3) | :x: | :x: | :x: | :x: | :x: |
+| [Ray/RLlib](https://github.com/ray-project/ray/tree/master/rllib/) | :x: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: |
+| [DI-engine](https://github.com/opendilab/DI-engine/) | :x: | :heavy_check_mark: | not fully supported | :heavy_check_mark: | :x: |
+| [Tianshou](https://github.com/thu-ml/tianshou) | :x: | not fully supported | not fully supported | :heavy_check_mark: | :x: |
+| [MARLlib](https://github.com/Replicable-MARL/MARLlib) | :x: | :heavy_check_mark: | not fully supported | :x: | :x: |
+| [MAPPO Benchmark](https://github.com/marlbenchmark/on-policy) | :x: | :heavy_check_mark: | :x: | :x: | :x: |
+| [RL4LMs](https://github.com/allenai/RL4LMs) | :heavy_check_mark: | :x: | :x: | :x: | :x: |
+| [trlx](https://github.com/CarperAI/trlx) | :heavy_check_mark: | :x: | :x: | :x: | :heavy_check_mark: |
+| [trl](https://github.com/huggingface/trl) | :heavy_check_mark: | :x: | :x: | :x: | :heavy_check_mark: |
+| [TimeChamber](https://github.com/inspirai/TimeChamber) | :x: | :x: | :heavy_check_mark: | :x: | :x: |
 
 ## Installation
 
diff --git a/README_zh.md b/README_zh.md
index 91cd7642..e75c76af 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -51,6 +51,7 @@ OpenRL基于PyTorch进行开发，目标是为强化学习研究社区提供一
 - 支持通过专家数据进行离线强化学习训练
 - 支持自博弈训练
 - 支持自然语言任务（如对话任务）的强化学习训练
+- 支持[DeepSpeed](https://github.com/microsoft/DeepSpeed)
 - 支持[竞技场](https://openrl-docs.readthedocs.io/zh/latest/arena/index.html)功能，可以在多智能体对抗性环境中方便地对各种智能体（甚至是[及第平台](https://openrl-docs.readthedocs.io/zh/latest/arena/index.html#openrl)上提交的智能体）进行评测。
 - 支持从[Hugging Face](https://huggingface.co/)上导入模型和数据。支持加载Hugging Face上[Stable-baselines3的模型](https://openrl-docs.readthedocs.io/zh/latest/sb3/index.html)来进行测试和训练。
 - 提供用户自有环境接入OpenRL的[详细教程](https://openrl-docs.readthedocs.io/zh/latest/custom_env/index.html).
@@ -128,18 +129,18 @@ OpenRL-Lab将持续维护和更新OpenRL，欢迎大家加入我们的[开源社
 这里我们提供了一个表格，比较了OpenRL和其他常用的强化学习库。
 OpenRL采用模块化设计和高层次的抽象，使得用户可以通过统一的简单易用的接口完成各种任务的训练。
 
-| 强化学习库 | 自然语言任务/RLHF | 多智能体训练 | 自博弈训练 | 离线强化学习 | 双语文档 |
+| 强化学习库 | 自然语言任务/RLHF | 多智能体训练 | 自博弈训练 | 离线强化学习 | [DeepSpeed](https://github.com/microsoft/DeepSpeed) |
 |:------------------------------------------------------------------:|:------------------:|:--------------------:|:--------------------:|:------------------:|:------------------:|
 | **[OpenRL](https://github.com/OpenRL-Lab/openrl)** | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
 | [Stable Baselines3](https://github.com/DLR-RM/stable-baselines3) | :x: | :x: | :x: | :x: | :x: |
 | [Ray/RLlib](https://github.com/ray-project/ray/tree/master/rllib/) | :x: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :x: |
-| [DI-engine](https://github.com/opendilab/DI-engine/) | :x: | :heavy_check_mark: | not fullly supported | :heavy_check_mark: | :heavy_check_mark: |
-| [Tianshou](https://github.com/thu-ml/tianshou) | :x: | not fullly supported | not fullly supported | :heavy_check_mark: | :heavy_check_mark: |
+| [DI-engine](https://github.com/opendilab/DI-engine/) | :x: | :heavy_check_mark: | not fully supported | :heavy_check_mark: | :x: |
+| [Tianshou](https://github.com/thu-ml/tianshou) | :x: | not fully supported | not fully supported | :heavy_check_mark: | :x: |
 | [MARLlib](https://github.com/Replicable-MARL/MARLlib) | :x: | :heavy_check_mark: | not fullly supported | :x: | :x: |
 | [MAPPO Benchmark](https://github.com/marlbenchmark/on-policy) | :x: | :heavy_check_mark: | :x: | :x: | :x: |
 | [RL4LMs](https://github.com/allenai/RL4LMs) | :heavy_check_mark: | :x: | :x: | :x: | :x: |
-| [trlx](https://github.com/CarperAI/trlx) | :heavy_check_mark: | :x: | :x: | :x: | :x: |
-| [trl](https://github.com/huggingface/trl) | :heavy_check_mark: | :x: | :x: | :x: | :x: |
+| [trlx](https://github.com/CarperAI/trlx) | :heavy_check_mark: | :x: | :x: | :x: | :heavy_check_mark: |
+| [trl](https://github.com/huggingface/trl) | :heavy_check_mark: | :x: | :x: | :x: | :heavy_check_mark: |
 | [TimeChamber](https://github.com/inspirai/TimeChamber) | :x: | :x: | :heavy_check_mark: | :x: | :x: |
 
 ## 安装
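A note for readers of the KL-penalty hunks above: the shaping rule reformatted in `openrl/envs/nlp/rewards/kl_penalty.py` computes `rew = -self._alpha * kl_div` and records `alpha` and the mean KL in each step's info dict. Below is a minimal, self-contained sketch of that rule; the function name `kl_penalty_reward`, the toy logit shapes, and the default `alpha` value are assumptions made for illustration, not OpenRL's actual `KLPenalty` API.

```python
import torch
import torch.nn.functional as F


def kl_penalty_reward(policy_logits: torch.Tensor,
                      ref_logits: torch.Tensor,
                      alpha: float = 0.2):
    # Token-level KL(policy || reference) over the vocabulary dimension.
    logp_policy = F.log_softmax(policy_logits, dim=-1)
    logp_ref = F.log_softmax(ref_logits, dim=-1)
    kl_per_token = (logp_policy.exp() * (logp_policy - logp_ref)).sum(dim=-1)

    # One KL value per sample; the penalty mirrors rew = -self._alpha * kl_div.
    kl_div = kl_per_token.sum(dim=-1)  # shape: (batch,)
    rew = -alpha * kl_div

    # One info dict per sample, mirroring the "alpha" / "kl_div" keys in the hunk.
    infos = [{"alpha": alpha, "kl_div": kl.mean()} for kl in kl_div]
    return rew, infos


if __name__ == "__main__":
    # Toy batch: 2 sequences, 5 tokens each, vocabulary of 11 (arbitrary sizes).
    policy_logits = torch.randn(2, 5, 11)
    ref_logits = torch.randn(2, 5, 11)
    rew, infos = kl_penalty_reward(policy_logits, ref_logits)
    print(rew.shape, infos[0]["kl_div"])
```

Keeping `alpha` small lets the task reward dominate while still discouraging the fine-tuned policy from drifting too far from the built-in GPT-2 reference model instantiated in the same file.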