diff --git a/cleanrl/cleanrl_explo/lil_maze.py b/cleanrl/cleanrl_explo/lil_maze.py
new file mode 100644
index 00000000..40042e60
--- /dev/null
+++ b/cleanrl/cleanrl_explo/lil_maze.py
@@ -0,0 +1,104 @@
+import numpy as np
+import pygame
+import math
+import os, imageio
+import matplotlib.pyplot as plt
+import torch
+import gymnasium as gym
+from gymnasium import spaces
+
+
+class LilMaze(gym.Env):
+    metadata = {"render_modes": ["rgb_array"]}
+    def __init__(self, render_mode = None):
+        super(LilMaze, self).__init__()
+
+        # Define the action and observation spaces
+        self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)
+        self.observation_space = spaces.Box(low=0, high=1, shape=(2,), dtype=np.float32)
+
+        # The size of a step
+        self.step_size = 0.01
+
+        # The maximum number of steps
+        self.max_steps = 200
+
+        # Define the initial position of the agent
+        self.initial_agent_position = np.array([0.25, 0.25])
+
+        # Define the goal position
+        self.goal_position = np.array([0.25, 0.75])
+
+        # Wall positions
+        self.wall_positions = [
+            [(0.0,0.5),(0.5, 0.5)],
+        ]
+
+        self.world = np.zeros((1000, 1000, 3), dtype=np.uint8)
+        self.world.fill(255)
+
+        self.draw(self.world, self.goal_position, (0, 255, 0))
+        self.world_copy = self.world.copy()
+
+        assert render_mode is None or render_mode in self.metadata["render_modes"]
+        self.render_mode = render_mode
+
+    def draw(self, world, position, color):
+        pos = (int(position[0] * 1000), int(position[1] * 1000))
+        world[pos[1]-2:pos[1]+2, pos[0]-2:pos[0]+2] = color
+
+
+    def reset(self, seed=None, options=None):
+        super().reset(seed=seed)
+        self.num_steps = 0
+        self.world = self.world_copy.copy()
+
+        # Reset the agent's position to the initial position
+        self.agent_position = self.initial_agent_position
+        self.draw(self.world, self.initial_agent_position, (255, 0, 0))
+        self.draw(self.world_copy, self.initial_agent_position, (0, 0, 255))
+
+        infos = self._get_info()
+        return self.agent_position, infos
+
+    def _get_info(self):
+        return {}
+
+    def step(self, action):
+        self.num_steps += 1
+
+        action = np.clip(action, -1, 1)
+        new_position = self.agent_position + action * self.step_size
+
+        # only made for 1 wall
+        cond1 = new_position[1] >= 0.5 and self.agent_position[1] < 0.5
+        cond2 = new_position[1] < 0.5 and self.agent_position[1] >= 0.5
+
+        if cond1 and self.agent_position[0] + (new_position[0] - self.agent_position[0])/(new_position[1] - self.agent_position[1]) * (0.5 - self.agent_position[1]) < 0.5 :
+            new_position = [self.agent_position[0] + (new_position[0] - self.agent_position[0])/(new_position[1] - self.agent_position[1]) * (0.5 - self.agent_position[1]), 0.5 - 0.001]
+        if cond2 and self.agent_position[0] + (new_position[0] - self.agent_position[0])/(new_position[1] - self.agent_position[1]) * (0.5 - self.agent_position[1]) < 0.5 :
+            new_position = [self.agent_position[0] + (new_position[0] - self.agent_position[0])/(new_position[1] - self.agent_position[1]) * (0.5 - self.agent_position[1]), 0.5 + 0.001]
+
+
+        self.agent_position = np.clip(np.array(new_position), 0,1)
+
+        self.draw(self.world, self.agent_position, (255, 0, 0))
+        self.draw(self.world_copy, self.agent_position, (0, 0, 255))
+
+
+        # Compute the reward
+        reward = - np.linalg.norm(self.agent_position - self.goal_position)
+
+        done = self.num_steps >= self.max_steps
+
+        infos = self._get_info()
+
+        return self.agent_position, reward, done, None, infos
+
+    def render(self):
+        if self.render_mode == 'rgb_array':
+            return self.world.copy()
+        else:
+            raise 
NotImplementedError() + \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/sac_apt.py b/cleanrl/cleanrl_explo/sac_apt.py new file mode 100644 index 00000000..ac84cece --- /dev/null +++ b/cleanrl/cleanrl_explo/sac_apt.py @@ -0,0 +1,474 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import SummaryWriter + + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 12 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = True + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "SAC - exploration with APT" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = True + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Hopper-v4" + """the environment id of the task""" + total_timesteps: int = 200000 + """total timesteps of the experiments""" + num_envs: int = 4 + """the number of parallel game environments to run""" + buffer_size: int = int(1e6) + """the replay memory buffer size""" + gamma: float = 0.99 + """the discount factor gamma""" + tau: float = 0.005 + """target smoothing coefficient (default: 0.005)""" + batch_size: int = 256 + """the batch size of sample from the reply memory""" + learning_starts: int = 5e3 + """timestep to start learning""" + policy_lr: float = 3e-4 + """the learning rate of the policy network optimizer""" + q_lr: float = 1e-3 + """the learning rate of the Q network network optimizer""" + policy_frequency: int = 2 + """the frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. 
+ """the frequency of updates for the target nerworks""" + alpha: float = 0.2 + """Entropy regularization coefficient.""" + autotune: bool = True + """automatic tuning of the entropy coefficient""" + + + + # encoder specific arguments + encoder_lr: float = 0.00001611 + """the learning rate of the encoder""" + encoder_epochs: int = 4 + """the number of epochs for the encoder""" + encoder_frequency: int = 300 + """the frequency of training encoder""" + latent_dim: int = 8 + """the dimension of the latent space""" + sigma: float = 0.048 + """the sigma for the data augmentation""" + k_nearest: int = 4 + """the number of nearest neighbors""" + + # intrinsic reward specific arguments + normalize_reward: bool = True + """if toggled, the intrinsic reward will be normalized""" + reward_update_rate : float = 0.001 + """the update rate of the runnign estimators of the reward""" + + + + keep_extrinsic_reward: bool = False + """if toggled, the extrinsic reward will be kept""" + coef_intrinsic : float = 0.1256 + """the coefficient of the intrinsic reward""" + coef_extrinsic : float = 0.5422 + """the coefficient of the extrinsic reward""" + + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + + +class Encoder(nn.Module): + def __init__(self, envs, 
latent_dim, sigma, k_nearest): + super(Encoder, self).__init__() + state_dim = np.prod(envs.single_observation_space.shape) + action_dim = np.prod(envs.single_action_space.shape) + + # encoder network + self.f1 = nn.Linear(state_dim, 256) + self.f2 = nn.Linear(256, 64) + self.f3 = nn.Linear(64, latent_dim) + + + self.latent_dim = latent_dim + self.sigma = sigma + self.k_nearest = k_nearest + + + def forward(self, x): + x = F.relu(self.f1(x)) + x = F.relu(self.f2(x)) + x = self.f3(x) + return x + + def data_augmentation(self, x): + noise = torch.randn_like(x) * self.sigma + return x + noise + + def normalize(self, x): + return nn.functional.normalize(x, p=2, dim=1) + + def constrastive_loss(self, x, x_augmented): + x_norm = self.normalize(x) + x_augmented_norm = self.normalize(x_augmented) + numerator = torch.exp(torch.einsum('ij,ij->i', x_norm, x_augmented_norm)) + denominator = torch.sum(torch.exp(torch.einsum('ik,jk->ij', x_norm, x_norm)), dim=1) + return -torch.log(numerator/denominator).mean() + + def get_knn_sum(self, x, x_augmented): + x_encoded = self(x) + x_augmented_encoded = self(x_augmented) + + distances = torch.cdist(x_encoded, x_augmented_encoded) + knn_sum = torch.topk(distances, self.k_nearest, largest=False, sorted=False, dim=1).values.sum(dim=1) + return knn_sum + + + +def main(seed=None, sweep=False): + + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + if seed is not None: + args.seed = seed + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + + + # For hyperparameter optimization, see trainer.py file + if sweep: + episodic_returns_list = [] + corresponding_steps = [] + + import wandb + wandb.init() + + config = wandb.config + + for key, value in vars(args).items(): + if key in config: + setattr(args, key, config[key]) + + + else : + + if args.track: + import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv( + [make_env(args.env_id, args.seed, i, args.capture_video, run_name) for i in range(args.num_envs)] + ) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + encoder = Encoder(envs, args.latent_dim, args.sigma, 
args.k_nearest).to(device) + encoder_optimizer = optim.Adam(encoder.parameters(), lr=args.encoder_lr) + + # Automatic entropy tuning + if args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + + # The replay buffer parameters have been updated to handle multiple envs + rb = ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + n_envs=args.num_envs + ) + + intrinsic_reward_running_mean = torch.zeros(1, device=device, dtype=torch.float32) + intrinsic_reward_running_std = torch.ones(1, device=device, dtype=torch.float32) + + + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. + next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + if info is not None: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + if sweep: + episodic_returns_list.append(info["episode"]["r"]) + corresponding_steps.append(global_step) + else: + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, trunc in enumerate(truncations): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + rb.add(obs, real_next_obs, actions, rewards, terminations, infos) + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. 
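+        # Editor's note (descriptive comment, no behavioural change): the block below is
+        # the APT-style update. Every `encoder_frequency` steps the contrastive encoder is
+        # refit for `encoder_epochs` minibatches, pulling each observation towards its
+        # Gaussian-noise augmentation (std `sigma`) with an InfoNCE-like loss. The SAC
+        # update then uses, as intrinsic reward, the summed distance from each encoded
+        # observation to its `k_nearest` neighbours among the encoded next observations
+        # of the sampled batch (divided by `latent_dim`), i.e. roughly a particle-based
+        # entropy estimate: states far from their neighbours in latent space look novel.
+        # When `normalize_reward` is set, the bonus is standardised with the running
+        # mean/std estimators initialised above.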
+ if global_step > args.learning_starts: + + if global_step % args.encoder_frequency == 0: + mean_encoder_loss = 0.0 + for _ in range(args.encoder_epochs): + data = rb.sample(args.batch_size) + obs_augmented = encoder.data_augmentation(data.observations) + obs_encoded = encoder(data.observations) + obs_augmented_encoded = encoder(obs_augmented) + + encoder_loss = encoder.constrastive_loss(obs_encoded, obs_augmented_encoded) + + encoder_optimizer.zero_grad() + encoder_loss.backward() + encoder_optimizer.step() + + mean_encoder_loss += encoder_loss.item() + + mean_encoder_loss /= args.encoder_epochs + if not sweep: + writer.add_scalar("losses/encoder_loss", mean_encoder_loss, global_step) + + + + data = rb.sample(args.batch_size) + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) + qf1_next_target = qf1_target(data.next_observations, next_state_actions) + qf2_next_target = qf2_target(data.next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + intrinsic_reward = encoder.get_knn_sum(data.observations, data.next_observations)/args.latent_dim + extrinsic_reward = data.rewards.flatten() + + if args.normalize_reward: + intrinsic_reward = (intrinsic_reward - intrinsic_reward_running_mean) / (intrinsic_reward_running_std + 1e-6) + intrinsic_reward_running_mean = intrinsic_reward_running_mean + args.reward_update_rate * (intrinsic_reward.mean() - intrinsic_reward_running_mean) + intrinsic_reward_running_std = intrinsic_reward_running_std + args.reward_update_rate * (intrinsic_reward.std() - intrinsic_reward_running_std) + + if args.keep_extrinsic_reward: + rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic + else: + rewards = intrinsic_reward.flatten() *args.coef_intrinsic + next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + qf1_a_values = qf1(data.observations, data.actions).view(-1) + qf2_a_values = qf2(data.observations, data.actions).view(-1) + + + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data.observations) + qf1_pi = qf1(data.observations, pi) + qf2_pi = qf2(data.observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data.observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target networks + if global_step % args.target_network_frequency == 0: + for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0 and not 
sweep: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + writer.add_scalar("specific/intrinsic_reward_mean", intrinsic_reward.mean().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_max", intrinsic_reward.max().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_min", intrinsic_reward.min().item(), global_step) + + envs.close() + if sweep: + return episodic_returns_list, corresponding_steps + writer.close() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/sac_aux.py b/cleanrl/cleanrl_explo/sac_aux.py new file mode 100644 index 00000000..e6487f58 --- /dev/null +++ b/cleanrl/cleanrl_explo/sac_aux.py @@ -0,0 +1,452 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import SummaryWriter + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 12 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = True + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "SAC - exploration with auxiliary VAE" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = True + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Hopper-v4" + """the environment id of the task""" + total_timesteps: int = 200000 + """total timesteps of the experiments""" + num_envs: int = 4 + """the number of parallel game environments to run""" + buffer_size: int = int(1e6) + """the replay memory buffer size""" + gamma: float = 0.99 + """the discount factor gamma""" + tau: float = 0.005 + """target smoothing coefficient (default: 0.005)""" + batch_size: int = 256 + """the batch size of sample from the reply memory""" + learning_starts: int = 5e3 + """timestep to start learning""" + policy_lr: float = 3e-4 + """the learning rate of the policy network optimizer""" + q_lr: float = 1e-3 + """the learning rate of the Q network network optimizer""" + policy_frequency: int = 2 + """the frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. 
+ """the frequency of updates for the target nerworks""" + alpha: float = 0.2 + """Entropy regularization coefficient.""" + autotune: bool = True + """automatic tuning of the entropy coefficient""" + + + + # VAE specific arguments + vae_lr: float = 0.001139 + """the learning rate of the VAE""" + vae_epochs: int = 4 + """the number of epochs for the VAE""" + vae_frequency: int = 800 + """the frequency of training VAE""" + vae_latent_dim: int = 32 + """the latent dimension of the VAE""" + clip_vae: float = 120.0 + """the clipping of the VAE""" + vae_batch_size: int = 128 + """the batch size of the VAE""" + + + keep_extrinsic_reward: bool = False + """if toggled, the extrinsic reward will be kept""" + coef_intrinsic : float = 0.3472 + """the coefficient of the intrinsic reward""" + coef_extrinsic : float = 0.5422 + """the coefficient of the extrinsic reward""" + + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + +class VAE(nn.Module): + def __init__(self, envs, latent_dim, clip_vae=120.0, scale_l = 1000.0): + super().__init__() + input_dim = np.prod(envs.single_observation_space.shape) + self.clip_vae = clip_vae + self.scale_l = scale_l + self.encoder = nn.Sequential( + nn.Linear(input_dim, 256), + nn.ReLU(), + nn.Linear(256, 256), + 
nn.ReLU(), + ) + self.mean_layer = nn.Linear(256, latent_dim) + self.logstd_layer = nn.Linear(256, latent_dim) + self.decoder = nn.Sequential( + nn.Linear(latent_dim, 256), + nn.ReLU(), + nn.Linear(256, 256), + nn.ReLU(), + nn.Linear(256, input_dim), + ) + def encode(self, x): + x = self.encoder(x) + mean = self.mean_layer(x) + logstd = self.logstd_layer(x) + return mean, logstd + + def decode(self, z): + return self.decoder(z) + + def forward(self, x): + mean, logstd = self.encode(x/self.scale_l) + z = mean + torch.randn_like(mean) * torch.exp(logstd) + x_recon = torch.clamp(self.decode(z), -self.clip_vae, self.clip_vae) + return x_recon, mean, logstd + + def loss(self, x, reduce=True): + x_recon, mean, logstd = self(x) + x = x/self.scale_l + recon_loss = F.mse_loss(x_recon, x, reduction='none').sum(1) + kl_loss = -0.5 * (1 + 2 * logstd - mean ** 2 - torch.exp(2 * logstd)).sum(1) + loss = recon_loss + kl_loss + if reduce: + return loss.mean() + return loss + +def main(seed=None, sweep=False): + + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + if seed is not None: + args.seed = seed + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + + + # For hyperparameter optimization, see trainer.py file + if sweep: + episodic_returns_list = [] + corresponding_steps = [] + + import wandb + wandb.init() + + config = wandb.config + + for key, value in vars(args).items(): + if key in config: + setattr(args, key, config[key]) + + + else : + + if args.track: + import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv( + [make_env(args.env_id, args.seed, i, args.capture_video, run_name) for i in range(args.num_envs)] + ) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + + vae = VAE(envs, + latent_dim=args.vae_latent_dim, + clip_vae=args.clip_vae).to(device) + vae_optimizer = optim.Adam(vae.parameters(), lr=args.vae_lr) + + # Automatic entropy tuning + if args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + 
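+        # Editor's note: standard SAC entropy auto-tuning. `alpha` is exp(log_alpha),
+        # and log_alpha is trained (optimizer created just below) so that the policy
+        # entropy stays near target_entropy = -|action_dim|; the corresponding
+        # alpha_loss is computed inside the training loop.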
a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + + # The replay buffer parameters have been updated to handle multiple envs + rb = ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + n_envs=args.num_envs + ) + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. + next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + if info is not None: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + if sweep: + episodic_returns_list.append(info["episode"]["r"]) + corresponding_steps.append(global_step) + else: + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, trunc in enumerate(truncations): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + rb.add(obs, real_next_obs, actions, rewards, terminations, infos) + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. 
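+        # Editor's note (descriptive comment): the auxiliary-VAE exploration bonus works
+        # as follows. Every `vae_frequency` steps the VAE is refit for `vae_epochs`
+        # minibatches on replayed observations (inputs scaled by `scale_l`, reconstructions
+        # clamped to +/- `clip_vae`). During the SAC update the per-sample, un-reduced VAE
+        # loss, reconstruction MSE + KL(q(z|s) || N(0, I)), is reused as the intrinsic
+        # reward: poorly modelled (rarely visited) states receive a larger bonus, scaled
+        # by `coef_intrinsic` and optionally mixed with `coef_extrinsic` * extrinsic reward.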
+ if global_step > args.learning_starts: + + if global_step % args.vae_frequency == 0: + mean_vae_loss = 0.0 + for _ in range(args.vae_epochs): + data = rb.sample(args.batch_size) + + vae_loss = vae.loss(data.observations, reduce=True) + vae_optimizer.zero_grad() + vae_loss.backward() + vae_optimizer.step() + mean_vae_loss += vae_loss.item() + + mean_vae_loss /= args.vae_epochs + if not sweep: + writer.add_scalar("losses/vae_loss", mean_vae_loss, global_step) + + + + data = rb.sample(args.batch_size) + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) + qf1_next_target = qf1_target(data.next_observations, next_state_actions) + qf2_next_target = qf2_target(data.next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + intrinsic_reward = vae.loss(data.observations, reduce = False) + extrinsic_reward = data.rewards.flatten() + if args.keep_extrinsic_reward: + rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic + else: + rewards = intrinsic_reward.flatten() *args.coef_intrinsic + next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + + qf1_a_values = qf1(data.observations, data.actions).view(-1) + qf2_a_values = qf2(data.observations, data.actions).view(-1) + + + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data.observations) + qf1_pi = qf1(data.observations, pi) + qf2_pi = qf2(data.observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data.observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target networks + if global_step % args.target_network_frequency == 0: + for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0 and not sweep: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + 
writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + writer.add_scalar("specific/intrinsic_reward_mean", intrinsic_reward.mean().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_max", intrinsic_reward.max().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_min", intrinsic_reward.min().item(), global_step) + + envs.close() + if sweep: + return episodic_returns_list, corresponding_steps + writer.close() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/sac_continuous_action.py b/cleanrl/cleanrl_explo/sac_continuous_action.py new file mode 100644 index 00000000..d28cb8e2 --- /dev/null +++ b/cleanrl/cleanrl_explo/sac_continuous_action.py @@ -0,0 +1,310 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import SummaryWriter + + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 1 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = False + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "cleanRL" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = False + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Hopper-v4" + """the environment id of the task""" + total_timesteps: int = 1000000 + """total timesteps of the experiments""" + buffer_size: int = int(1e6) + """the replay memory buffer size""" + gamma: float = 0.99 + """the discount factor gamma""" + tau: float = 0.005 + """target smoothing coefficient (default: 0.005)""" + batch_size: int = 256 + """the batch size of sample from the reply memory""" + learning_starts: int = 5e3 + """timestep to start learning""" + policy_lr: float = 3e-4 + """the learning rate of the policy network optimizer""" + q_lr: float = 1e-3 + """the learning rate of the Q network network optimizer""" + policy_frequency: int = 2 + """the frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. 
+ """the frequency of updates for the target nerworks""" + alpha: float = 0.2 + """Entropy regularization coefficient.""" + autotune: bool = True + """automatic tuning of the entropy coefficient""" + + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.action_space.high - env.action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.action_space.high + env.action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + + +if __name__ == "__main__": + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + if args.track: + import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda 
else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)]) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + + # Automatic entropy tuning + if args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + rb = ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + ) + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. + next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, trunc in enumerate(truncations): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + rb.add(obs, real_next_obs, actions, rewards, terminations, infos) + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. 
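+        # Editor's note: this file is the plain SAC baseline that the exploration variants
+        # in this folder build on. The critic target below is
+        #     y = r + gamma * (1 - d) * ( min(Q1', Q2')(s', a') - alpha * log pi(a'|s') ),  a' ~ pi(.|s'),
+        # both Q networks regress onto y, the actor minimises alpha * log pi - min(Q1, Q2)
+        # every `policy_frequency` steps (with `policy_frequency` gradient steps to
+        # compensate for the delay), and the target networks are Polyak-averaged with
+        # coefficient `tau`.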
+ if global_step > args.learning_starts: + data = rb.sample(args.batch_size) + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) + qf1_next_target = qf1_target(data.next_observations, next_state_actions) + qf2_next_target = qf2_target(data.next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + qf1_a_values = qf1(data.observations, data.actions).view(-1) + qf2_a_values = qf2(data.observations, data.actions).view(-1) + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data.observations) + qf1_pi = qf1(data.observations, pi) + qf2_pi = qf2(data.observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data.observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target networks + if global_step % args.target_network_frequency == 0: + for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + + envs.close() + writer.close() diff --git a/cleanrl/cleanrl_explo/sac_icm.py b/cleanrl/cleanrl_explo/sac_icm.py new file mode 100644 index 00000000..8909ece9 --- /dev/null +++ b/cleanrl/cleanrl_explo/sac_icm.py @@ -0,0 +1,454 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import 
SummaryWriter + + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 12 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = True + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "SAC - exploration with ICM" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = True + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Hopper-v4" + """the environment id of the task""" + total_timesteps: int = 200000 + """total timesteps of the experiments""" + num_envs: int = 4 + """the number of parallel game environments to run""" + buffer_size: int = int(1e6) + """the replay memory buffer size""" + gamma: float = 0.99 + """the discount factor gamma""" + tau: float = 0.005 + """target smoothing coefficient (default: 0.005)""" + batch_size: int = 256 + """the batch size of sample from the reply memory""" + learning_starts: int = 5e3 + """timestep to start learning""" + policy_lr: float = 3e-4 + """the learning rate of the policy network optimizer""" + q_lr: float = 1e-3 + """the learning rate of the Q network network optimizer""" + policy_frequency: int = 2 + """the frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. + """the frequency of updates for the target nerworks""" + alpha: float = 0.2 + """Entropy regularization coefficient.""" + autotune: bool = True + """automatic tuning of the entropy coefficient""" + + + + # icm specific arguments + icm_lr: float = 0.00082 + """the learning rate of the icm""" + icm_epochs: int = 4 + """the number of epochs for the icm""" + icm_frequency: int = 1000 + """the frequency of training icm""" + beta: float = 0.1083 + """the beta of the icm""" + clip_intrinsic_reward: float = 10.0 + """the clipping of the intrinsic reward""" + feature_dim: int = 64 + + + keep_extrinsic_reward: bool = False + """if toggled, the extrinsic reward will be kept""" + coef_intrinsic : float = 84.185 + """the coefficient of the intrinsic reward""" + coef_extrinsic : float = 1.96 + """the coefficient of the extrinsic reward""" + + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = 
nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + + +class ICM(nn.Module): + def __init__(self, envs, feature_dim = 64, beta = 0.2): + super(ICM, self).__init__() + state_dim = np.prod(envs.single_observation_space.shape) + action_dim = np.prod(envs.single_action_space.shape) + # feature network + self.f1 = nn.Linear(state_dim, 256) + self.f2 = nn.Linear(256, 64) + self.f3 = nn.Linear(64, feature_dim) + # inverse model + self.i1 = nn.Linear(2*feature_dim, 64) + self.i2 = nn.Linear(64, action_dim) + # forward model + self.fo1 = nn.Linear(feature_dim + action_dim, 64) + self.fo2 = nn.Linear(64, feature_dim) + # beta + self.beta = beta + + def feature(self, x): + x = F.relu(self.f1(x)) + x = F.relu(self.f2(x)) + x = self.f3(x) + return x + def inverse(self, f1, f2): + x = torch.cat([f1, f2], dim = 1) + x = F.relu(self.i1(x)) + x = self.i2(x) + return x + def forward_t(self, f1, a): + x = torch.cat([f1, a], dim = 1) + x = F.relu(self.fo1(x)) + x = self.fo2(x) + return x + + def loss(self, obs, next_obs, action, reduce = True): + # feature + f = self.feature(obs) + f_next = self.feature(next_obs) + # inverse + a_pred = self.inverse(f, f_next) + # forward + f_next_pred = self.forward_t(f, action) + # loss + loss_inverse = F.mse_loss(a_pred, action, reduction = 'none').sum(1) if not reduce else F.mse_loss(a_pred, action) + loss_forward = F.mse_loss(f_next_pred, f_next, reduction = 'none').sum(1) if not reduce else F.mse_loss(f_next_pred, f_next) + return self.beta * loss_forward + (1 - self.beta) * loss_inverse + + +def main(seed=None, sweep=False): + + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + if seed is not None: + args.seed = seed + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + + + # For hyperparameter optimization, see trainer.py file + if sweep: + episodic_returns_list = [] + corresponding_steps = [] + + import wandb + wandb.init() + + config = wandb.config + + for key, value in vars(args).items(): + if key in config: + setattr(args, key, config[key]) + + + else : + + if args.track: + import wandb + + wandb.init( + 
project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv( + [make_env(args.env_id, args.seed, i, args.capture_video, run_name) for i in range(args.num_envs)] + ) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + icm = ICM(envs, + feature_dim=64, + beta=args.beta).to(device) + icm_optimizer = optim.Adam(icm.parameters(), lr=args.icm_lr) + + # Automatic entropy tuning + if args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + + # The replay buffer parameters have been updated to handle multiple envs + rb = ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + n_envs=args.num_envs + ) + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. 
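+        # Editor's note: transitions collected below go into the replay buffer and are
+        # reused both for the SAC update and, every `icm_frequency` steps, for refitting
+        # the ICM. The per-sample ICM loss
+        #     beta * ||f(phi(s), a) - phi(s')||^2 + (1 - beta) * ||g(phi(s), phi(s')) - a||^2
+        # then serves directly as the intrinsic reward (the original ICM formulation uses
+        # only the forward-model term as the bonus; this implementation keeps both terms).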
+ next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + if info is not None: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + if sweep: + episodic_returns_list.append(info["episode"]["r"]) + corresponding_steps.append(global_step) + else: + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, trunc in enumerate(truncations): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + rb.add(obs, real_next_obs, actions, rewards, terminations, infos) + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. + if global_step > args.learning_starts: + + if global_step % args.icm_frequency == 0: + mean_icm_loss = 0.0 + for _ in range(args.icm_epochs): + data = rb.sample(args.batch_size) + + icm_loss = icm.loss(data.observations, data.next_observations, data.actions, reduce = True) + icm_optimizer.zero_grad() + icm_loss.backward() + icm_optimizer.step() + mean_icm_loss += icm_loss.item() + + mean_icm_loss /= args.icm_epochs + if not sweep: + writer.add_scalar("losses/icm_loss", mean_icm_loss, global_step) + + + + data = rb.sample(args.batch_size) + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) + qf1_next_target = qf1_target(data.next_observations, next_state_actions) + qf2_next_target = qf2_target(data.next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + intrinsic_reward = icm.loss(data.observations, data.next_observations, data.actions, reduce = False) + extrinsic_reward = data.rewards.flatten() + if args.keep_extrinsic_reward: + rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic + else: + rewards = intrinsic_reward.flatten() *args.coef_intrinsic + next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + qf1_a_values = qf1(data.observations, data.actions).view(-1) + qf2_a_values = qf2(data.observations, data.actions).view(-1) + + + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data.observations) + qf1_pi = qf1(data.observations, pi) + qf2_pi = qf2(data.observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data.observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target networks + if global_step % args.target_network_frequency == 0: + for param, 
target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0 and not sweep: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + writer.add_scalar("specific/intrinsic_reward_mean", intrinsic_reward.mean().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_max", intrinsic_reward.max().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_min", intrinsic_reward.min().item(), global_step) + + envs.close() + if sweep: + return episodic_returns_list, corresponding_steps + writer.close() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/sac_ngu.py b/cleanrl/cleanrl_explo/sac_ngu.py new file mode 100644 index 00000000..1200bf07 --- /dev/null +++ b/cleanrl/cleanrl_explo/sac_ngu.py @@ -0,0 +1,572 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import SummaryWriter + + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 12 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = True + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "SAC - exploration with NGU" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = True + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Hopper-v4" + """the environment id of the task""" + total_timesteps: int = 200000 + """total timesteps of the experiments""" + num_envs: int = 4 + """the number of parallel game environments to run""" + buffer_size: int = int(1e6) + """the replay memory buffer size""" + gamma: float = 0.99 + """the discount factor gamma""" + tau: float = 0.005 + """target smoothing coefficient (default: 0.005)""" + batch_size: int = 256 + """the batch size of sample from the reply memory""" + learning_starts: int = 5e3 + """timestep to start learning""" + policy_lr: 
float = 3e-4 + """the learning rate of the policy network optimizer""" + q_lr: float = 1e-3 + """the learning rate of the Q network network optimizer""" + policy_frequency: int = 2 + """the frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. + """the frequency of updates for the target nerworks""" + alpha: float = 0.2 + """Entropy regularization coefficient.""" + autotune: bool = True + """automatic tuning of the entropy coefficient""" + + + + # NGU specific arguments + ngu_lr: float = 0.00004501 + """the learning rate of the NGU""" + ngu_epochs: int = 4 + """the number of epochs for the NGU""" + ngu_frequency: int = 900 + """the frequency of training NGU""" + ngu_feature_dim: int = 64 + """the feature dimension of the NGU""" + k_nearest: int = 6 + """the number of nearest neighbors for the NGU""" + clip_reward: float = 0.3656 + """the clipping value of the reward""" + c: float = 0.001 + """the constant used not to divide by zero""" + L: float = 5.0 + """the maximum value for the multiplier in the intrinsic reward of NGU""" + epsilon_kernel: float = 1e-3 + """the epsilon value for the kernel of the NGU""" + + + keep_extrinsic_reward: bool = False + """if toggled, the extrinsic reward will be kept""" + coef_intrinsic : float = 48.311 + """the coefficient of the intrinsic reward""" + coef_extrinsic : float = 7.099 + """the coefficient of the extrinsic reward""" + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + +class NGU_ReplayBuffer(): + def __init__(self, buffer_size, observation_space, action_space, device, handle_timeout_termination=False, n_envs=1): + self.buffer_size = buffer_size + self.device = device + self.handle_timeout_termination = handle_timeout_termination + self.n_envs = n_envs + + self.observations = np.zeros((buffer_size, n_envs) + observation_space.shape, dtype=np.float32) + self.next_observations = np.zeros((buffer_size, n_envs) + observation_space.shape, dtype=np.float32) + self.actions = np.zeros((buffer_size, n_envs) + action_space.shape, dtype=np.float32) + self.rewards = np.zeros((buffer_size, n_envs), dtype=np.float32) + self.rewards_ngu = np.zeros((buffer_size, n_envs), dtype=np.float32) + self.dones = np.zeros((buffer_size, n_envs), dtype=np.float32) + self.ptr, self.size, self.max_size = 0, 0, buffer_size + + def add(self, obs, next_obs, action, reward, reward_ngu, done, info): + self.observations[self.ptr] = obs + self.next_observations[self.ptr] = next_obs + self.actions[self.ptr] = action + self.rewards[self.ptr] = reward + self.rewards_ngu[self.ptr] = reward_ngu + self.dones[self.ptr] = done + self.ptr = (self.ptr + 1) % self.max_size + self.size = min(self.size + 1, self.max_size) + + def sample(self, batch_size): + idxs = np.random.randint(0, self.size, size=batch_size) + idxs_2 = np.random.randint(0, self.n_envs, size=batch_size) + return ( + torch.as_tensor(self.observations[idxs,idxs_2,:], device=self.device), + torch.as_tensor(self.next_observations[idxs,idxs_2,:], device=self.device), + torch.as_tensor(self.actions[idxs,idxs_2,:], device=self.device), + torch.as_tensor(self.rewards[idxs,idxs_2], device=self.device), + torch.as_tensor(self.rewards_ngu[idxs,idxs_2], 
device=self.device), + torch.as_tensor(self.dones[idxs,idxs_2], device=self.device), + ) + + def __len__(self): + return self.size + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + +class NGU(nn.Module): + def __init__(self, envs, feature_dim, k_nearest, clip_reward, c, L, epsilon_kernel): + super().__init__() + state_dim = np.prod(envs.single_observation_space.shape) + action_dim = np.prod(envs.single_action_space.shape) + self.feature_dim = feature_dim + self.k_nearest = k_nearest + self.clip_reward = clip_reward + self.c = c + self.L = L + self.epsilon_kernel = epsilon_kernel + + # RND + # trained network + self.f1 = nn.Linear(state_dim, 128) + self.f2 = nn.Linear(128, 64) + self.f3 = nn.Linear(64, 1) + # target network + self.f1_t = nn.Linear(state_dim, 128) + self.f2_t = nn.Linear(128, 64) + self.f3_t = nn.Linear(64, 1) + # embedding network + self.f1_z = nn.Linear(state_dim, 128) + self.f2_z = nn.Linear(128, 64) + self.f3_z = nn.Linear(64, feature_dim) + # action network + self.f1_a = nn.Linear(feature_dim*2 , 128) + self.f2_a = nn.Linear(128, 64) + self.f3_a = nn.Linear(64, action_dim) + # running average of the squared Euclidean distance of the k-th nearest neighbors + self.dm2 = 0.0 + + def forward(self, x): + x = F.relu(self.f1(x)) + x = F.relu(self.f2(x)) + x = self.f3(x) + return x + + def forward_t(self, x): + with torch.no_grad(): + x = F.relu(self.f1_t(x)) + x = F.relu(self.f2_t(x)) + x = self.f3_t(x) + return x + + def rnd_loss(self, x, reduce = True): + return F.mse_loss(self.forward(x), self.forward_t(x)) if reduce else F.mse_loss(self.forward(x), self.forward_t(x), 
reduction = 'none') + + def embedding(self, s): + x = F.relu(self.f1_z(s)) + x = F.relu(self.f2_z(x)) + x = self.f3_z(x) + return x + + def action_pred(self, s0, s1): + x = torch.cat([s0, s1], 1) + x = F.relu(self.f1_a(x)) + x = F.relu(self.f2_a(x)) + x = self.f3_a(x) + return x + + def reward_episode(self, s, episode): + z_s = self.embedding(s) + z_episode = self.embedding(episode) + + dist = torch.norm(z_s - z_episode, dim=1) + kernel = self.epsilon_kernel/(dist/self.dm2 + self.epsilon_kernel) + top_k_kernel = torch.topk(kernel, self.k_nearest, largest = True) + top_k = torch.topk(dist, self.k_nearest, largest = False) + self.dm2 = 0.99 * self.dm2 + 0.01 * top_k.values.mean().item() + reward_episodic = (1/(torch.sqrt(top_k_kernel.values.mean()) + self.c)).item() + + return reward_episodic + + + + def loss(self,s,s_next,a,d): + rnd_loss = self.rnd_loss(s) + + s0 = self.embedding(s) + s1 = self.embedding(s_next) + h_loss = torch.norm(self.action_pred(s0, s1) - a, dim=1) * (1-d) + + return rnd_loss + h_loss.mean() + +def main(seed=None, sweep=False): + + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + if seed is not None: + args.seed = seed + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + + + # For hyperparameter optimization, see trainer.py file + if sweep: + episodic_returns_list = [] + corresponding_steps = [] + + import wandb + wandb.init() + + config = wandb.config + + for key, value in vars(args).items(): + if key in config: + setattr(args, key, config[key]) + + + else : + + if args.track: + import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv( + [make_env(args.env_id, args.seed, i, args.capture_video, run_name) for i in range(args.num_envs)] + ) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + + ngu = NGU(envs, + feature_dim = args.ngu_feature_dim, + k_nearest = args.k_nearest, + clip_reward = args.clip_reward, + c = args.c, + L = args.L, + epsilon_kernel = args.epsilon_kernel + ).to(device) + ngu_optimizer = optim.Adam(ngu.parameters(), lr=args.ngu_lr) + episodes = [ [] for _ in range(args.num_envs)] + + # Automatic entropy tuning + if 
args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + + + # This replay buffer is hand designed for NGU + # The replay buffer parameters have been updated to handle multiple envs + rb = NGU_ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + n_envs=args.num_envs + ) + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. + next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + # COMPUTE REWARD + reward_ngu = torch.zeros(args.num_envs) + for idx in range(args.num_envs): + with torch.no_grad(): + reward_ngu[idx] = ngu.reward_episode(torch.tensor(obs[idx]).unsqueeze(0).float().to(device), torch.tensor(np.array(episodes[idx])).float().to(device)) if len(episodes[idx]) > args.k_nearest else 0.0 + + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + if info is not None: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + if sweep: + episodic_returns_list.append(info["episode"]["r"]) + corresponding_steps.append(global_step) + else: + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, (done_, trunc) in enumerate(zip(terminations,truncations)): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + if done_ or trunc: + episodes[idx] = [] + rb.add(obs, real_next_obs, actions, rewards, reward_ngu, terminations, infos) + + for idx, ob in enumerate(obs): + episodes[idx].append(ob) + + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. 
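Before the training branch marked above (it resumes immediately after this note), it may help to see the episodic bonus that reward_ngu carries: NGU.reward_episode embeds the current state and the states already visited in the episode, then turns the k nearest embedding distances into an inverse kernel count. Below is a self-contained, hedged sketch with toy tensors; no name or value in it comes from the patch.

import torch

k, c, eps = 6, 0.001, 1e-3
dm2 = 1.0                             # running average of k-NN distances, updated on every call
z_s = torch.randn(16)                 # embedding of the current state
z_episode = torch.randn(50, 16)       # embeddings of the states seen so far this episode

dist = torch.norm(z_s - z_episode, dim=1)
kernel = eps / (dist / dm2 + eps)     # close to 1 for near-duplicate states, near 0 for distant ones
top_k_kernel = torch.topk(kernel, k, largest=True).values
bonus = 1.0 / (torch.sqrt(top_k_kernel.mean()) + c)   # large when nothing similar was visited
dm2 = 0.99 * dm2 + 0.01 * torch.topk(dist, k, largest=False).values.mean().item()
print(bonus.item(), dm2)

In the Q-target computation below, this bonus is then multiplied by the RND error clipped to the range [1, L], and the product is clipped to plus or minus clip_reward before being scaled by coef_intrinsic.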
+ if global_step > args.learning_starts: + + if global_step % args.ngu_frequency == 0: + mean_ngu_loss = 0.0 + for _ in range(args.ngu_epochs): + data = rb.sample(args.batch_size) + data_observations = data[0] + data_next_observations = data[1] + data_actions = data[2] + data_rewards = data[3] + data_rewards_ngu = data[4] + data_dones = data[5] + + ngu_loss = ngu.loss(data_observations, data_next_observations, data_actions, data_dones) + ngu_optimizer.zero_grad() + ngu_loss.backward() + ngu_optimizer.step() + mean_ngu_loss += ngu_loss.item() + + mean_ngu_loss /= args.ngu_epochs + if not sweep: + writer.add_scalar("losses/ngu_loss", mean_ngu_loss, global_step) + + + + data = rb.sample(args.batch_size) + data_observations = data[0] + data_next_observations = data[1] + data_actions = data[2] + data_rewards = data[3] + data_rewards_ngu = data[4] + data_dones = data[5] + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data_next_observations) + qf1_next_target = qf1_target(data_next_observations, next_state_actions) + qf2_next_target = qf2_target(data_next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + rnd_loss = ngu.rnd_loss(data_observations, reduce = False) + intrinsic_reward = data_rewards_ngu * torch.min(torch.max(rnd_loss.flatten(), torch.tensor(1).to(device)), torch.tensor(args.L).to(device)) + intrinsic_reward = torch.clip(intrinsic_reward, -args.clip_reward, args.clip_reward) + extrinsic_reward = data_rewards.flatten() + if args.keep_extrinsic_reward: + rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic + else: + rewards = intrinsic_reward *args.coef_intrinsic + next_q_value = rewards + (1 - data_dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + + qf1_a_values = qf1(data_observations, data_actions).view(-1) + qf2_a_values = qf2(data_observations, data_actions).view(-1) + + + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data_observations) + qf1_pi = qf1(data_observations, pi) + qf2_pi = qf2(data_observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data_observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target networks + if global_step % args.target_network_frequency == 0: + for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0 and not sweep: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + 
writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + writer.add_scalar("specific/intrinsic_reward_mean", intrinsic_reward.mean().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_max", intrinsic_reward.max().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_min", intrinsic_reward.min().item(), global_step) + + envs.close() + if sweep: + return episodic_returns_list, corresponding_steps + writer.close() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/sac_rnd.py b/cleanrl/cleanrl_explo/sac_rnd.py new file mode 100644 index 00000000..98462e98 --- /dev/null +++ b/cleanrl/cleanrl_explo/sac_rnd.py @@ -0,0 +1,426 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import SummaryWriter + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 12 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = True + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "SAC - exploration with RND" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = True + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Hopper-v4" + """the environment id of the task""" + total_timesteps: int = 200000 + """total timesteps of the experiments""" + num_envs: int = 4 + """the number of parallel game environments to run""" + buffer_size: int = int(1e6) + """the replay memory buffer size""" + gamma: float = 0.99 + """the discount factor gamma""" + tau: float = 0.005 + """target smoothing coefficient (default: 0.005)""" + batch_size: int = 256 + """the batch size of sample from the reply memory""" + learning_starts: int = 5e3 + """timestep to start learning""" + policy_lr: float = 3e-4 + """the learning rate of the policy network optimizer""" + q_lr: float = 1e-3 + """the learning rate of the Q network network optimizer""" + policy_frequency: int = 2 + """the frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. 
+ """the frequency of updates for the target nerworks""" + alpha: float = 0.2 + """Entropy regularization coefficient.""" + autotune: bool = True + """automatic tuning of the entropy coefficient""" + + + + # RND specific arguments + rnd_lr: float = 0.004866 + """the learning rate of the RND""" + rnd_epochs: int = 4 + """the number of epochs for the RND""" + rnd_frequency: int = 900 + """the frequency of training RND""" + + + keep_extrinsic_reward: bool = False + """if toggled, the extrinsic reward will be kept""" + coef_intrinsic : float = 47.016 + """the coefficient of the intrinsic reward""" + coef_extrinsic : float = 1.631 + """the coefficient of the extrinsic reward""" + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + + +class RND(nn.Module): + def __init__(self, env): + super(RND, self).__init__() + # trained network + self.f1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.f2 = nn.Linear(256, 256) + self.f3 = nn.Linear(256, 1) + # target network + self.f1_t = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.f2_t = nn.Linear(256, 256) + self.f3_t = nn.Linear(256, 1) + + def forward(self, x): + x = F.relu(self.f1(x)) + x = F.relu(self.f2(x)) + x = self.f3(x) + return x + + def 
forward_t(self, x): + with torch.no_grad(): + x = F.relu(self.f1_t(x)) + x = F.relu(self.f2_t(x)) + x = self.f3_t(x) + return x + + def loss(self, x, reduce = True): + return F.mse_loss(self.forward(x), self.forward_t(x)) if reduce else F.mse_loss(self.forward(x), self.forward_t(x), reduction = 'none') + + +def main(seed=None, sweep=False): + + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + if seed is not None: + args.seed = seed + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + + + # For hyperparameter optimization, see trainer.py file + if sweep: + episodic_returns_list = [] + corresponding_steps = [] + + import wandb + wandb.init() + + config = wandb.config + + for key, value in vars(args).items(): + if key in config: + setattr(args, key, config[key]) + + + else : + + if args.track: + import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv( + [make_env(args.env_id, args.seed, i, args.capture_video, run_name) for i in range(args.num_envs)] + ) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + + rnd = RND(envs).to(device) + rnd_optimizer = optim.Adam(list(rnd.parameters()), lr=args.rnd_lr) + + # Automatic entropy tuning + if args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + + # The replay buffer parameters have been updated to handle multiple envs + rb = ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + n_envs=args.num_envs + ) + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, 
_ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. + next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + if info is not None: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + if sweep: + episodic_returns_list.append(info["episode"]["r"]) + corresponding_steps.append(global_step) + else: + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, trunc in enumerate(truncations): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + rb.add(obs, real_next_obs, actions, rewards, terminations, infos) + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. + if global_step > args.learning_starts: + + if global_step % args.rnd_frequency == 0: + mean_rnd_loss = 0.0 + for _ in range(args.rnd_epochs): + data = rb.sample(args.batch_size) + + rnd_loss = rnd.loss(data.observations).mean() + rnd_optimizer.zero_grad() + rnd_loss.backward() + rnd_optimizer.step() + mean_rnd_loss += rnd_loss.item() + + mean_rnd_loss /= args.rnd_epochs + if not sweep: + writer.add_scalar("losses/vae_loss", mean_rnd_loss, global_step) + + + data = rb.sample(args.batch_size) + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) + qf1_next_target = qf1_target(data.next_observations, next_state_actions) + qf2_next_target = qf2_target(data.next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + intrinsic_reward = rnd.loss(data.observations, reduce = False) + extrinsic_reward = data.rewards.flatten() + if args.keep_extrinsic_reward: + rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward.flatten()*args.coef_intrinsic + else: + rewards = intrinsic_reward.flatten() *args.coef_intrinsic + next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + qf1_a_values = qf1(data.observations, data.actions).view(-1) + qf2_a_values = qf2(data.observations, data.actions).view(-1) + + + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data.observations) + qf1_pi = qf1(data.observations, pi) + qf2_pi = qf2(data.observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data.observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target 
networks + if global_step % args.target_network_frequency == 0: + for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0 and not sweep: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + writer.add_scalar("specific/intrinsic_reward_mean", intrinsic_reward.mean().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_max", intrinsic_reward.max().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_min", intrinsic_reward.min().item(), global_step) + + envs.close() + if sweep: + return episodic_returns_list, corresponding_steps + writer.close() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/trainer.py b/cleanrl/cleanrl_explo/trainer.py new file mode 100644 index 00000000..d453aeba --- /dev/null +++ b/cleanrl/cleanrl_explo/trainer.py @@ -0,0 +1,172 @@ +import wandb +import importlib +import multiprocessing +import time +from dataclasses import dataclass + +import numpy as np + + +@dataclass +class Sweep_Args(): + + + ############################### IMPORTANT ################################ + """ + This code produces a sweep for training a SAC-exploration agent with different hyperparameters. + It will print an id. This id can be used to run the same sweep in parellel on different machines. + Thus, you can run the same sweep on different machines and the results will be aggregated in the same wandb project, + therefore speeding up the hyperparameter search. + + To do so you must run same script than this one on the other machines, but with the same sweep id. + So you must copy the sweep id from the output of this script and paste it in the other scripts. 
+ """ + + ########################################################################### + + available_methods = ["aux", "icm", "ngu", "rnd", "apt", "our_method"] + "All the methods available for training" + + + method = "our_method" + "The method to use for training" + environment = "LilMaze" + "The environment to use for training" + nb_of_attempts: int = 1 + "Every hyperparameter combination will be tried this many times, the average will be used" + nb_of_parallel_jobs: int = 1 + "The number of parallel agents to run (remember that several environments will already be run for every single agent)" + count: int = 10 + "The number of hyperparameter combinations to try per agent" + + + fichier = f"sac_{method}" + "The file to run for training" + project: str = f"{method} sweep {environment}" + "The project name to use in wandb" + + + + """ + In order to run the sweep, you must create a sweep configuration dictionnary. + The documentation for the sweep configuration can be found here: https://docs.wandb.ai/guides/sweeps/configuration + """ + + sweep_config = { + "method": "bayes", + "metric": {"goal": "maximize", "name": "episodic_return"}, + "parameters": { + "classifier_lr": { + "distribution": "log_uniform_values", + "max": 1e-2, + "min": 1e-5, + }, + "coef_intrinsic": { + "distribution": "log_uniform_values", + "max": 100.0, + "min": 0.1, + }, + "coef_extrinsic": { + "distribution": "log_uniform_values", + "max": 100.0, + "min": 0.1, + }, + "total_timesteps": { + 'value': 200000, + }, + 'capture_video': { + 'value': False + }, + 'keep_extrinsic_reward': { + 'value': False + }, + 'env_id': { + 'value': f"{environment}" + }, + }, + } + + + assert method in available_methods, f"method must be in {available_methods}" + + sweep_id = wandb.sweep(sweep_config, project=project) + "The sweep id to use for the sweep" + +def train(args: Sweep_Args): + + try: + module = importlib.import_module(args.fichier) + values = [] + steps = [] + for i in range(args.nb_of_attempts): + v, t = module.main(seed=i, sweep=True) + + values += v + steps += t + + + + values = np.array(values) + steps = np.array(steps).reshape(-1, 1) + + + # We use the quantile regression to get the median and the 95% confidence interval + + + from sklearn.ensemble import GradientBoostingRegressor + + gbm_median = GradientBoostingRegressor(loss="quantile", alpha=0.5, n_estimators=100) + gbm_median.fit(steps, values) + + gbm_upper = GradientBoostingRegressor(loss="quantile", alpha=0.975, n_estimators=100) + gbm_upper.fit(steps, values) + + gbm_lower = GradientBoostingRegressor(loss="quantile", alpha=0.025, n_estimators=100) + gbm_lower.fit(steps, values) + + + plot_steps = np.linspace(steps.min(), steps.max(), 200)[:, np.newaxis] + + y_pred_median = gbm_median.predict(plot_steps).ravel() + y_pred_upper = gbm_upper.predict(plot_steps).ravel() + y_pred_lower = gbm_lower.predict(plot_steps).ravel() + + + for t, min, median, max in list(zip(steps, y_pred_lower, y_pred_median, y_pred_upper)): + wandb.log({ + "episodic_return": median, + "episodic_return_upper": max, + "episodic_return_lower": min + }, step=t[0]) + + + except ModuleNotFoundError: + print(f"Erreur: le module '{args.fichier}' n'a pas été trouvé.") + except AttributeError: + print(f"Erreur: le module '{args.fichier}' n'a pas de fonction 'main'.") + except Exception as e: + print(f"Erreur: {e}") + + +def agent(index: int, args: Sweep_Args): + print(f"Agent {index} started.") + + wandb.agent(args.sweep_id, function=lambda: train(args), project=args.project, count=args.count) + + 
print(f"Agent {index} finished.") + + +if __name__ == "__main__": + + args = Sweep_Args() + processes = [] + for i in range(args.nb_of_parallel_jobs): + p = multiprocessing.Process(target=agent, args=(i, args)) + p.start() + processes.append(p) + + + for p in processes: + p.join() + + print("All processes have finished.") \ No newline at end of file