diff --git a/cleanrl/cleanrl_explo/PA_version/apt_sac.py b/cleanrl/cleanrl_explo/PA_version/apt_sac.py deleted file mode 100644 index 44d55e5f..00000000 --- a/cleanrl/cleanrl_explo/PA_version/apt_sac.py +++ /dev/null @@ -1,533 +0,0 @@ -# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy -import os -import random -import time -from dataclasses import dataclass - -import gymnasium as gym -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import tyro -from stable_baselines3.common.buffers import ReplayBuffer -from envs.wenv import Wenv -from envs.config_env import config -from src.utils.wandb_utils import send_matrix - - -@dataclass -class Args: - exp_name: str = os.path.basename(__file__)[: -len(".py")] - """the name of this experiment""" - seed: int = 1 - """seed of the experiment""" - torch_deterministic: bool = True - """if toggled, `torch.backends.cudnn.deterministic=False`""" - cuda: bool = True - """if toggled, cuda will be enabled by default""" - track: bool = True - """if toggled, this experiment will be tracked with Weights and Biases""" - wandb_project_name: str = "contrastive_test_2" - """the wandb's project name""" - wandb_entity: str = None - """the entity (team) of wandb's project""" - capture_video: bool = False - """whether to capture videos of the agent performances (check out `videos` folder)""" - use_hp_file : bool = False - """if toggled, will load the hyperparameters from file""" - hp_file: str = "hyper_parameters_sac.json" - """the path to the hyperparameters json file""" - sweep_mode: bool = False - """if toggled, will log the sweep id to wandb""" - - # GIF - make_gif: bool = True - """if toggled, will make gif """ - plotly: bool = False - """if toggled, will use plotly instead of matplotlib""" - fig_frequency: int = 1000 - """the frequency of logging the figures""" - metric_freq: int = 1000 - """the frequency of ploting metric""" - - - - # Algorithm specific arguments - env_id: str = "Maze-Ur-v0" - """the environment id of the task""" - total_timesteps: int = 1000000 - """total timesteps of the experiments""" - buffer_size: int = int(1e7) - """the replay memory buffer size""" - gamma: float = 0.99 - """the discount factor gamma""" - tau: float = 0.005 - """target smoothing coefficient (default: 0.005)""" - batch_size: int = 256 - """the batch size of sample from the reply memory""" - learning_starts: int = 5e3 - """timestep to start learning""" - policy_lr: float = 3e-4 - """the learning rate of the policy network optimizer""" - q_lr: float = 1e-3 - """the learning rate of the Q network network optimizer""" - policy_frequency: int = 4 - """the frequency of training policy (delayed)""" - learning_frequency: int = 2 - """the frequency of training the Q network""" - target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. 
- """the frequency of updates for the target nerworks""" - alpha: float = 0.1 - """Entropy regularization coefficient.""" - autotune: bool = False - """automatic tuning of the entropy coefficient""" - num_envs: int = 2 - """ num of parallel envs """ - - # APS SPECIFIC - """ the coefficient of the extrinsic reward""" - encoder_lr: float = 1e-3 - """the learning rate of the encoder""" - encoder_epochs: int = 1 - """the number of epochs for the encoder""" - nb_epoch_before_training: int = 4 - """ nb epoch between each training """ - encoder_batch_size: int = 256 - """the batch size of the encoder""" - latent_dim: int = 8 - """the dimension of the latent space""" - sigma: float = 0.05 - """the sigma for the data augmentation""" - knn: int = 8 - """the number of nearest neighbors""" - normalize_rwd: bool = False - """if toggled, the reward will be normalized""" - tau_update: float = 0.001 - """the update rate of the statistics""" - keep_extrinsic_reward: bool = False - """if toggled, the extrinsic reward will be kept""" - coef_intrinsic : float = 1.0 - """the coefficient of the intrinsic reward""" - coef_extrinsic : float = 1.0 - - -def make_env(env_id, idx, capture_video, run_name): - def thunk(): - env = Wenv(env_id=env_id, xp_id=run_name, **config[env_id]) - env = gym.wrappers.FlattenObservation(env) # deal with dm_control's Dict observation space - env = gym.wrappers.RecordEpisodeStatistics(env) - if capture_video: - if idx == 0: - env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") - env = gym.wrappers.ClipAction(env) - return env - - return thunk - - -# ALGO LOGIC: initialize agent here: -class SoftQNetwork(nn.Module): - def __init__(self, env): - super().__init__() - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) - self.fc2 = nn.Linear(256, 256) - self.fc3 = nn.Linear(256, 1) - - def forward(self, x, a): - x = torch.cat([x, a], 1) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - -LOG_STD_MAX = 2 -LOG_STD_MIN = -5 - - -class Actor(nn.Module): - def __init__(self, env): - super().__init__() - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) - self.fc2 = nn.Linear(256, 256) - self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) - self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) - # action rescaling - self.register_buffer( - "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) - ) - self.register_buffer( - "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) - ) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - mean = self.fc_mean(x) - log_std = self.fc_logstd(x) - log_std = torch.tanh(log_std) - log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats - - return mean, log_std - - def get_action(self, x): - mean, log_std = self(x) - std = log_std.exp() - normal = torch.distributions.Normal(mean, std) - x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) - y_t = torch.tanh(x_t) - action = y_t * self.action_scale + self.action_bias - log_prob = normal.log_prob(x_t) - # Enforcing Action Bound - log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) - log_prob = log_prob.sum(1, keepdim=True) - mean = torch.tanh(mean) * self.action_scale + self.action_bias - return action, 
log_prob, mean - - - -class Encoder(torch.nn.Module): - def __init__(self, - observation_space, - device, - env_id, - latent_dim = 32, - feature_extractor = False, - sigma = 0.01): - super(Encoder, self).__init__() - self.relu = torch.nn.ReLU() - self.env_id = env_id - self.feature_extractor = feature_extractor - self.sigma = sigma - if feature_extractor: - self.fc1 = torch.nn.Linear(config[env_id]['coverage_idx'].shape[0], 128,device=device) - self.fc2 = torch.nn.Linear(128, 64, device=device) - self.fc3 = torch.nn.Linear(64, latent_dim, device=device) - else: - self.fc1 = torch.nn.Linear(observation_space.shape[0], 128,device=device) - self.fc2 = torch.nn.Linear(128, 64, device=device) - self.fc3 = torch.nn.Linear(64, latent_dim, device=device) - - def forward(self, x): - x = self.feature(x) if self.feature_extractor else x - x = self.relu(self.fc1(x)) - x = self.relu(self.fc2(x)) - x = self.fc3(x) - return x - - def data_augmentation(self, x): - return x + torch.randn_like(x) * self.sigma - - def normalize(self, x): - return nn.functional.normalize(x, p=2, dim=1) - - def feature(self, x): - x= x[:, :, config[self.env_id]['coverage_idx']] if x.dim() == 3 else x[:, config[self.env_id]['coverage_idx']] - return x - - def contrastive_loss(self,rep_sk, rep_sv): - # Normalize - rep_sk_norm = self.normalize(rep_sk) - rep_sv_norm = self.normalize(rep_sv) - positive_scores = torch.exp(torch.einsum('ij,ij->i', rep_sk_norm, rep_sv_norm)) - all_scores = torch.exp(torch.einsum('ik,jk->ij', rep_sk_norm, rep_sv_norm)) - all_scores_sum = all_scores.sum(dim=1) - loss = -torch.log(positive_scores / all_scores_sum).mean() - return loss - - def get_knn_sum(self, s_batch, states_batch, k): - """ - Compute the sum of distances for the k nearest neighbors of each 's' in the batch 's_batch' relative to a batch 'states_batch'. 
- :param s_batch: The batch of reference states (a tensor of size [B, input_size]) - :param states_batch: The batch of states (a tensor of size [N, input_size]) - :param k: The number of nearest neighbors to consider - :return: A tensor of size [B] where each element is the sum of distances of the k nearest neighbors for the corresponding element in 's_batch' - """ - # Encode the batches of states - s_encoded = self.forward(s_batch) # Shape: [B, encoded_size] - states_encoded = self.forward(states_batch) # Shape: [N, encoded_size] - - # Calculate Euclidean distances from each s in the batch to all states in the batch - distances = torch.cdist(s_encoded, states_encoded) # Shape: [B, N] - - # Get the k smallest distances for each element in the batch - k_smallest_distances, _ = torch.topk(distances, k, dim=1, largest=False) - - # Compute the sum of these k smallest distances for each element in the batch - distance_sum = k_smallest_distances.sum(dim=1) # Shape: [B] - - return distance_sum - - -if __name__ == "__main__": - import stable_baselines3 as sb3 - - if sb3.__version__ < "2.0": - raise ValueError( - """Ongoing migration: run the following command to install the new dependencies: -poetry run pip install "stable_baselines3==2.0.0a1" -""" - ) - - args = tyro.cli(Args) - if args.use_hp_file: - import json - with open(args.hp_file, "r") as f: - type_id = config[args.env_id]['type_id'] - hp = json.load(f)['hyperparameters'][type_id][args.exp_name] - for k, v in hp.items(): - setattr(args, k, v) - - - run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" - if args.track: - import wandb - if args.sweep_mode: - wandb.init() - # set config from sweep - wandb.config.update(args) - else : - wandb.init( - project=args.wandb_project_name, - entity=args.wandb_entity, - sync_tensorboard=False, - config=vars(args), - name=run_name, - monitor_gym=True, - save_code=True, - ) - - # PLOTTING - if args.make_gif: - env_plot = Wenv(env_id=args.env_id, - render_bool_matplot=True, - xp_id=run_name, - **config[args.env_id]) - if args.plotly: - env_plot = Wenv(env_id=args.env_id, - render_bool_plotly=True, - xp_id=run_name, - **config[args.env_id]) - # coverage check env - env_check = Wenv(env_id=args.env_id, - render_bool_matplot=False, - xp_id=run_name, - **config[args.env_id]) - - # TRY NOT TO MODIFY: seeding - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = args.torch_deterministic - - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") - - # env setup - # env setup - envs = gym.vector.SyncVectorEnv( - [make_env(args.env_id, i, args.capture_video, run_name) for i in range(args.num_envs)] - ) - assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" - max_step = config[args.env_id]['kwargs']['max_episode_steps'] - max_action = float(envs.single_action_space.high[0]) - - actor = Actor(envs).to(device) - qf1 = SoftQNetwork(envs).to(device) - qf2 = SoftQNetwork(envs).to(device) - qf1_target = SoftQNetwork(envs).to(device) - qf2_target = SoftQNetwork(envs).to(device) - qf1_target.load_state_dict(qf1.state_dict()) - qf2_target.load_state_dict(qf2.state_dict()) - q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) - actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) - encoder = Encoder(observation_space = envs.single_observation_space, - device = device, - env_id = args.env_id, - 
latent_dim = args.latent_dim, - sigma = args.sigma) - optimizer_encoder = optim.Adam(encoder.parameters(), lr=args.encoder_lr, eps=1e-5) - - # Automatic entropy tuning - if args.autotune: - target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() - log_alpha = torch.zeros(1, requires_grad=True, device=device) - alpha = log_alpha.exp().item() - a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) - else: - alpha = args.alpha - - envs.single_observation_space.dtype = np.float32 - rb = ReplayBuffer( - args.buffer_size, - envs.single_observation_space, - envs.single_action_space, - device, - handle_timeout_termination=False, - n_envs= args.num_envs - ) - rwd_mean = torch.zeros(1, dtype=torch.float32).to(device) - rwd_std = torch.ones(1, dtype=torch.float32).to(device) - rwd_intrinsic_mean = torch.zeros(1, dtype=torch.float32).to(device) - rwd_intrinsic_std = torch.ones(1, dtype=torch.float32).to(device) - start_time = time.time() - - # TRY NOT TO MODIFY: start the game - obs, _ = envs.reset(seed=args.seed) - for global_step in range(args.total_timesteps): - # coverage assessment - env_check.update_coverage(obs) - # ALGO LOGIC: put action logic here - if global_step < args.learning_starts: - actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) - else: - actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) - actions = actions.detach().cpu().numpy() - - # TRY NOT TO MODIFY: execute the game and log data. - next_obs, rewards, terminations, truncations, infos = envs.step(actions) - - # TRY NOT TO MODIFY: record rewards for plotting purposes - if "final_info" in infos: - for info in infos["final_info"]: - if info is not None: - print(f"global_step={global_step}, episodic_return={info['episode']['r']}, episodic_length={info['episode']['l']}") - wandb.log({ - "charts/episodic_return" : info["episode"]["r"], - "charts/episodic_length" : info["episode"]["l"], - }, step = global_step) if args.track else None - - - # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` - real_next_obs = next_obs.copy() - for idx, trunc in enumerate(truncations): - if trunc: - real_next_obs[idx] = infos["final_observation"][idx] - rb.add(obs, real_next_obs, actions, rewards, terminations, infos) - - # TRY NOT TO MODIFY: CRUCIAL step easy to overlook - obs = next_obs - - - - # ENCODER TRAINING - if global_step % args.nb_epoch_before_training*max_step == 0 and global_step > args.learning_starts: - mean_encoder_loss = 0 - for _ in range(args.encoder_epochs): - for _ in range(args.nb_epoch_before_training*max_step // args.encoder_batch_size): - data = rb.sample(args.encoder_batch_size) - b_obs = torch.Tensor(data.observations).to(device) - b_obs_aug = encoder.data_augmentation(b_obs) - b_z = encoder(b_obs) - b_z_aug = encoder(b_obs_aug) - encoder_loss = encoder.contrastive_loss(b_z, b_z_aug) - optimizer_encoder.zero_grad() - encoder_loss.backward() - optimizer_encoder.step() - mean_encoder_loss += encoder_loss.item() - wandb.log({ - "losses/encoder_loss" : encoder_loss.item(), - }, step = global_step) if args.track else None - # ALGO LOGIC: training. 
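The intrinsic reward in the deleted apt_sac.py is the summed distance to the k nearest neighbours of each sampled state in the contrastively trained latent space (the Encoder.get_knn_sum method above), i.e. a particle-based entropy estimate in the style of APT, which is then consumed by the SAC update below. A minimal standalone sketch of that reward term, assuming a generic encoder with the same forward signature; the helper name knn_intrinsic_reward is hypothetical:

import torch

def knn_intrinsic_reward(encoder, obs, reference_obs, k=8, latent_dim=8):
    # Particle-entropy style bonus: sum of Euclidean distances to the k
    # nearest latent neighbours of each observation within a reference batch.
    with torch.no_grad():
        z = encoder(obs)                 # [B, latent_dim]
        z_ref = encoder(reference_obs)   # [N, latent_dim]
        dists = torch.cdist(z, z_ref)    # [B, N] pairwise distances
        knn_dists, _ = torch.topk(dists, k, dim=1, largest=False)
        # apt_sac.py divides by the latent dimension before using it as a reward
        return knn_dists.sum(dim=1) / latent_dim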
- if global_step > args.learning_starts and global_step % args.learning_frequency == 0: - data = rb.sample(args.batch_size) - data_intrinsic = rb.sample(args.batch_size) - with torch.no_grad(): - next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) - qf1_next_target = qf1_target(data.next_observations, next_state_actions) - qf2_next_target = qf2_target(data.next_observations, next_state_actions) - min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi - intrinsic_reward = encoder.get_knn_sum(data.observations, data_intrinsic.observations, args.knn) / args.latent_dim - extrinsic_reward = data.rewards.flatten() - if args.keep_extrinsic_reward: - # update statistics - rwd_mean = rwd_mean * (1 - args.tau_update) + args.tau_update * extrinsic_reward.mean() - rwd_std = rwd_std * (1 - args.tau_update) + args.tau_update * extrinsic_reward.std() - rwd_intrinsic_mean = rwd_intrinsic_mean * (1 - args.tau_update) + args.tau_update * intrinsic_reward.mean() - rwd_intrinsic_std = rwd_intrinsic_std * (1 - args.tau_update) + args.tau_update * intrinsic_reward.std() - # normalize - extrinsic_reward = (extrinsic_reward - rwd_mean) / (rwd_std + 1e-8) if args.normalize_rwd else extrinsic_reward - intrinsic_reward = (intrinsic_reward - rwd_intrinsic_mean) / (rwd_intrinsic_std + 1e-8) if args.normalize_rwd else intrinsic_reward - # coef decay - coef_intrinsic = max(0, args.coef_intrinsic - global_step / args.total_timesteps) - rewards = extrinsic_reward.flatten()*args.coef_extrinsic + intrinsic_reward.flatten()*coef_intrinsic - else: - rewards = intrinsic_reward*args.coef_intrinsic - next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) - - qf1_a_values = qf1(data.observations, data.actions).view(-1) - qf2_a_values = qf2(data.observations, data.actions).view(-1) - qf1_loss = F.mse_loss(qf1_a_values, next_q_value) - qf2_loss = F.mse_loss(qf2_a_values, next_q_value) - qf_loss = qf1_loss + qf2_loss - - # optimize the model - q_optimizer.zero_grad() - qf_loss.backward() - q_optimizer.step() - - if global_step % args.policy_frequency == 0: # TD 3 Delayed update support - for _ in range( - args.policy_frequency - ): # compensate for the delay by doing 'actor_update_interval' instead of 1 - pi, log_pi, _ = actor.get_action(data.observations) - qf1_pi = qf1(data.observations, pi) - qf2_pi = qf2(data.observations, pi) - min_qf_pi = torch.min(qf1_pi, qf2_pi) - actor_loss = ((alpha * log_pi) - min_qf_pi).mean() - - actor_optimizer.zero_grad() - actor_loss.backward() - actor_optimizer.step() - - if args.autotune: - with torch.no_grad(): - _, log_pi, _ = actor.get_action(data.observations) - alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() - - a_optimizer.zero_grad() - alpha_loss.backward() - a_optimizer.step() - alpha = log_alpha.exp().item() - - # update the target networks - if global_step % args.target_network_frequency == 0: - for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): - target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) - for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): - target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) - - - if global_step % 100 == 0: - wandb.log({ - "losses/qf1_values" : qf1_a_values.mean().item(), - "losses/qf2_values" : qf2_a_values.mean().item(), - "losses/qf1_loss" : qf1_loss.item(), - "losses/qf2_loss" : qf2_loss.item(), - "losses/qf_loss" : 
qf_loss.item() / 2.0, - "losses/actor_loss" : actor_loss.item(), - "losses/alpha" : alpha, - "charts/SPS" : int(global_step / (time.time() - start_time)), - "losses/alpha_loss" : alpha_loss.item() if args.autotune else 0.0, - "specific/intrinsic_reward_mean" : intrinsic_reward.mean(), - "specific/intrinsic_reward_max" : intrinsic_reward.max(), - "specific/intrinsic_reward_min" : intrinsic_reward.min(), - }, step = global_step) if args.track else None - - if global_step % args.metric_freq == 0 : - wandb.log({ - "charts/coverage" : env_check.get_coverage(), - "charts/shannon_entropy": env_check.shannon_entropy(), - }, step = global_step) if args.track else None - - if global_step % args.fig_frequency == 0 and global_step*args.num_envs > args.learning_starts: - if args.make_gif : - # print('size rho', size_rho) - # print('max x rho', rb.observations[max(rb.pos if not rb.full else rb.buffer_size-size_rho, 0):rb.pos if not rb.full else rb.buffer_size][0][:,0].max()) - image = env_plot.gif(obs_un = rb.observations[np.random.randint(0, rb.pos if not rb.full else rb.buffer_size, 100_000)], - classifier = None, - device= device) - send_matrix(wandb, image, "gif", global_step) - - envs.close() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/PA_version/aux_sac.py b/cleanrl/cleanrl_explo/PA_version/aux_sac.py deleted file mode 100644 index 022c3713..00000000 --- a/cleanrl/cleanrl_explo/PA_version/aux_sac.py +++ /dev/null @@ -1,483 +0,0 @@ -# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy -import os -import random -import time -from dataclasses import dataclass - -import gymnasium as gym -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import tyro -from stable_baselines3.common.buffers import ReplayBuffer -from envs.wenv import Wenv -from envs.config_env import config -from src.utils.wandb_utils import send_matrix -from src.utils.image_utils import resize_image - - -@dataclass -class Args: - exp_name: str = os.path.basename(__file__)[: -len(".py")] - """the name of this experiment""" - seed: int = 1 - """seed of the experiment""" - torch_deterministic: bool = True - """if toggled, `torch.backends.cudnn.deterministic=False`""" - cuda: bool = True - """if toggled, cuda will be enabled by default""" - track: bool = True - """if toggled, this experiment will be tracked with Weights and Biases""" - wandb_project_name: str = "contrastive_test_2" - """the wandb's project name""" - wandb_entity: str = None - """the entity (team) of wandb's project""" - capture_video: bool = False - """whether to capture videos of the agent performances (check out `videos` folder)""" - use_hp_file : bool = False - """if toggled, will load the hyperparameters from file""" - hp_file: str = "hyper_parameters_sac.json" - """the path to the hyperparameters json file""" - sweep_mode: bool = False - """if toggled, will log the sweep id to wandb""" - - # GIF - make_gif: bool = True - """if toggled, will make gif """ - plotly: bool = False - """if toggled, will use plotly instead of matplotlib""" - fig_frequency: int = 1000 - """the frequency of logging the figures""" - metric_freq: int = 1000 - """the frequency of ploting metric""" - - - - # Algorithm specific arguments - env_id: str = "HalfCheetah-v3" - """the environment id of the task""" - total_timesteps: int = 1000000 - """total timesteps of the experiments""" - buffer_size: int = int(1e7) - """the replay memory buffer size""" - 
gamma: float = 0.99 - """the discount factor gamma""" - tau: float = 0.005 - """target smoothing coefficient (default: 0.005)""" - batch_size: int = 256 - """the batch size of sample from the reply memory""" - learning_starts: int = 5e3 - """timestep to start learning""" - policy_lr: float = 3e-4 - """the learning rate of the policy network optimizer""" - q_lr: float = 1e-3 - """the learning rate of the Q network network optimizer""" - policy_frequency: int = 2 - """the frequency of training policy (delayed)""" - target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. - """the frequency of updates for the target nerworks""" - alpha: float = 0.1 - """Entropy regularization coefficient.""" - autotune: bool = False - """automatic tuning of the entropy coefficient""" - num_envs: int = 4 - """ num of parallel envs """ - - # VAE SPECIFIC - vae_lr: float = 1e-4 - """the learning rate of the VAE""" - vae_epochs: int = 1 - """the number of epochs for the VAE""" - nb_epoch_before_training: int = 8 - """ nb epoch between each training """ - vae_latent_dim: int = 32 - """the latent dimension of the VAE""" - clip_vae: float = 120.0 - """the clipping of the VAE""" - vae_batch_size: int = 128 - """the batch size of the VAE""" - - keep_extrinsic_reward: bool = False - """if toggled, the extrinsic reward will be kept""" - coef_intrinsic : float = 100.0 - """the coefficient of the intrinsic reward""" - coef_extrinsic : float = 1.0 - - -def make_env(env_id, idx, capture_video, run_name): - def thunk(): - env = Wenv(env_id=env_id, xp_id=run_name, **config[env_id]) - env = gym.wrappers.FlattenObservation(env) # deal with dm_control's Dict observation space - env = gym.wrappers.RecordEpisodeStatistics(env) - if capture_video: - if idx == 0: - env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") - env = gym.wrappers.ClipAction(env) - return env - - return thunk - - -# ALGO LOGIC: initialize agent here: -class SoftQNetwork(nn.Module): - def __init__(self, env): - super().__init__() - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) - self.fc2 = nn.Linear(256, 256) - self.fc3 = nn.Linear(256, 1) - - def forward(self, x, a): - x = torch.cat([x, a], 1) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - -LOG_STD_MAX = 2 -LOG_STD_MIN = -5 - - -class Actor(nn.Module): - def __init__(self, env): - super().__init__() - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) - self.fc2 = nn.Linear(256, 256) - self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) - self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) - # action rescaling - self.register_buffer( - "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) - ) - self.register_buffer( - "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) - ) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - mean = self.fc_mean(x) - log_std = self.fc_logstd(x) - log_std = torch.tanh(log_std) - log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats - - return mean, log_std - - def get_action(self, x): - mean, log_std = self(x) - std = log_std.exp() - normal = torch.distributions.Normal(mean, std) - x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) - y_t = 
torch.tanh(x_t) - action = y_t * self.action_scale + self.action_bias - log_prob = normal.log_prob(x_t) - # Enforcing Action Bound - log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) - log_prob = log_prob.sum(1, keepdim=True) - mean = torch.tanh(mean) * self.action_scale + self.action_bias - return action, log_prob, mean - -class VAE(nn.Module): - def __init__(self, input_dim, latent_dim, feature_extractor=False, env_id='Maze-U', clip_vae=120.0, scale_l = 1000.0): - super().__init__() - self.feature_extractor = feature_extractor - self.clip_vae = clip_vae - self.env_id = env_id - self.scale_l = scale_l - self.encoder = nn.Sequential( - nn.Linear(input_dim, 256) if not feature_extractor else nn.Linear(config[env_id]['coverage_idx'].shape[0], 256), - nn.ReLU(), - nn.Linear(256, 256), - nn.ReLU(), - ) - self.mean_layer = nn.Linear(256, latent_dim) - self.logstd_layer = nn.Linear(256, latent_dim) - self.decoder = nn.Sequential( - nn.Linear(latent_dim, 256), - nn.ReLU(), - nn.Linear(256, 256), - nn.ReLU(), - nn.Linear(256, input_dim) if not feature_extractor else nn.Linear(256, config[env_id]['coverage_idx'].shape[0]), - ) - def encode(self, x): - x = self.feature(x) if self.feature_extractor else x - x = self.encoder(x) - mean = self.mean_layer(x) - logstd = self.logstd_layer(x) - return mean, logstd - - def decode(self, z): - return self.decoder(z) - - def forward(self, x): - mean, logstd = self.encode(x/self.scale_l) - z = mean + torch.randn_like(mean) * torch.exp(logstd) - x_recon = torch.clamp(self.decode(z), -self.clip_vae, self.clip_vae) - return x_recon, mean, logstd - def loss(self, x, reduce=True): - x_recon, mean, logstd = self(x) - x = self.feature(x) if self.feature_extractor else x/self.scale_l - recon_loss = F.mse_loss(x_recon, x, reduction='none').sum(1) - kl_loss = -0.5 * (1 + 2 * logstd - mean ** 2 - torch.exp(2 * logstd)).sum(1) - loss = recon_loss + kl_loss - if reduce: - return loss.mean() - return loss - def feature(self, x): - x= x[:, :, config[self.env_id]['coverage_idx']] if x.dim() == 3 else x[:, config[self.env_id]['coverage_idx']] - return x - - -if __name__ == "__main__": - import stable_baselines3 as sb3 - - if sb3.__version__ < "2.0": - raise ValueError( - """Ongoing migration: run the following command to install the new dependencies: -poetry run pip install "stable_baselines3==2.0.0a1" -""" - ) - - args = tyro.cli(Args) - if args.use_hp_file: - import json - with open(args.hp_file, "r") as f: - type_id = config[args.env_id]['type_id'] - hp = json.load(f)['hyperparameters'][type_id][args.exp_name] - for k, v in hp.items(): - setattr(args, k, v) - - - run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" - if args.track: - import wandb - if args.sweep_mode: - wandb.init() - # set config from sweep - wandb.config.update(args) - else : - wandb.init( - project=args.wandb_project_name, - entity=args.wandb_entity, - sync_tensorboard=False, - config=vars(args), - name=run_name, - monitor_gym=True, - save_code=True, - ) - - # PLOTTING - if args.make_gif: - env_plot = Wenv(env_id=args.env_id, - render_bool_matplot=True, - xp_id=run_name, - **config[args.env_id]) - if args.plotly: - env_plot = Wenv(env_id=args.env_id, - render_bool_plotly=True, - xp_id=run_name, - **config[args.env_id]) - # coverage check env - env_check = Wenv(env_id=args.env_id, - render_bool_matplot=False, - xp_id=run_name, - **config[args.env_id]) - - # TRY NOT TO MODIFY: seeding - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) 
- torch.backends.cudnn.deterministic = args.torch_deterministic - - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") - - # env setup - # env setup - envs = gym.vector.SyncVectorEnv( - [make_env(args.env_id, i, args.capture_video, run_name) for i in range(args.num_envs)] - ) - assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" - - max_step = config[args.env_id]['kwargs']['max_episode_steps'] - max_action = float(envs.single_action_space.high[0]) - - actor = Actor(envs).to(device) - qf1 = SoftQNetwork(envs).to(device) - qf2 = SoftQNetwork(envs).to(device) - qf1_target = SoftQNetwork(envs).to(device) - qf2_target = SoftQNetwork(envs).to(device) - qf1_target.load_state_dict(qf1.state_dict()) - qf2_target.load_state_dict(qf2.state_dict()) - q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) - actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) - vae = VAE( input_dim=np.prod(envs.single_observation_space.shape), - latent_dim=args.vae_latent_dim, - feature_extractor=False, - env_id=args.env_id, - clip_vae=args.clip_vae).to(device) - optimizer_vae = optim.Adam(vae.parameters(), lr=args.vae_lr, eps=1e-5) - - # Automatic entropy tuning - if args.autotune: - target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() - log_alpha = torch.zeros(1, requires_grad=True, device=device) - alpha = log_alpha.exp().item() - a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) - else: - alpha = args.alpha - - envs.single_observation_space.dtype = np.float32 - rb = ReplayBuffer( - args.buffer_size, - envs.single_observation_space, - envs.single_action_space, - device, - handle_timeout_termination=False, - n_envs= args.num_envs - ) - start_time = time.time() - - # TRY NOT TO MODIFY: start the game - obs, _ = envs.reset(seed=args.seed) - for global_step in range(args.total_timesteps): - # coverage assessment - env_check.update_coverage(obs) - # ALGO LOGIC: put action logic here - if global_step < args.learning_starts: - actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) - else: - actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) - actions = actions.detach().cpu().numpy() - - # TRY NOT TO MODIFY: execute the game and log data. 
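In aux_sac.py the exploration bonus is the per-sample VAE loss (reconstruction error plus KL term), so states the VAE reconstructs poorly receive a larger reward. A condensed sketch of that bonus under the encode/decode interface defined above; the helper name vae_intrinsic_reward and the batch shapes are assumptions, and the clamp applied to the reconstruction in the original forward pass is omitted:

import torch
import torch.nn.functional as F

def vae_intrinsic_reward(vae, obs, scale=1000.0):
    # Per-sample negative ELBO used as the exploration bonus (reduce=False path).
    with torch.no_grad():
        x = obs / scale                      # aux_sac.py rescales inputs by scale_l
        mean, logstd = vae.encode(x)
        z = mean + torch.randn_like(mean) * torch.exp(logstd)
        x_recon = vae.decode(z)              # the original also clamps this to +/- clip_vae
        recon = F.mse_loss(x_recon, x, reduction="none").sum(1)
        kl = -0.5 * (1 + 2 * logstd - mean ** 2 - torch.exp(2 * logstd)).sum(1)
        return recon + kl                    # shape [B], one bonus per state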
- next_obs, rewards, terminations, truncations, infos = envs.step(actions) - - # TRY NOT TO MODIFY: record rewards for plotting purposes - if "final_info" in infos: - for info in infos["final_info"]: - if info is not None: - print(f"global_step={global_step}, episodic_return={info['episode']['r']}, episodic_length={info['episode']['l']}") - wandb.log({ - "charts/episodic_return" : info["episode"]["r"], - "charts/episodic_length" : info["episode"]["l"], - }, step = global_step) if args.track else None - - - # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` - real_next_obs = next_obs.copy() - for idx, trunc in enumerate(truncations): - if trunc: - real_next_obs[idx] = infos["final_observation"][idx] - rb.add(obs, real_next_obs, actions, rewards, terminations, infos) - - # TRY NOT TO MODIFY: CRUCIAL step easy to overlook - obs = next_obs - - # VAE TRAINING - if global_step % args.nb_epoch_before_training*max_step == 0 and global_step > args.learning_starts: - mean_vae_loss = 0 - for _ in range(args.vae_epochs): - # for _ in range(int(args.nb_epoch_before_training*max_step/args.vae_batch_size)): - data = rb.sample(args.vae_batch_size) - optimizer_vae.zero_grad() - loss = vae.loss(data.observations) - loss.backward() - optimizer_vae.step() - mean_vae_loss += loss.item() - wandb.log({ - "losses/vae_loss" : mean_vae_loss / int(args.nb_epoch_before_training*max_step/args.vae_batch_size) / args.vae_epochs - }, step = global_step) if args.track else None - - # ALGO LOGIC: training. - if global_step > args.learning_starts: - data = rb.sample(args.batch_size) - with torch.no_grad(): - next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) - qf1_next_target = qf1_target(data.next_observations, next_state_actions) - qf2_next_target = qf2_target(data.next_observations, next_state_actions) - min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi - intrinsic_reward = vae.loss(data.observations, reduce=False) - extrinsic_reward = data.rewards.flatten() - if args.keep_extrinsic_reward: - rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic - else: - rewards = intrinsic_reward*args.coef_intrinsic - next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) - - qf1_a_values = qf1(data.observations, data.actions).view(-1) - qf2_a_values = qf2(data.observations, data.actions).view(-1) - qf1_loss = F.mse_loss(qf1_a_values, next_q_value) - qf2_loss = F.mse_loss(qf2_a_values, next_q_value) - qf_loss = qf1_loss + qf2_loss - - # optimize the model - q_optimizer.zero_grad() - qf_loss.backward() - q_optimizer.step() - - if global_step % args.policy_frequency == 0: # TD 3 Delayed update support - for _ in range( - args.policy_frequency - ): # compensate for the delay by doing 'actor_update_interval' instead of 1 - pi, log_pi, _ = actor.get_action(data.observations) - qf1_pi = qf1(data.observations, pi) - qf2_pi = qf2(data.observations, pi) - min_qf_pi = torch.min(qf1_pi, qf2_pi) - actor_loss = ((alpha * log_pi) - min_qf_pi).mean() - - actor_optimizer.zero_grad() - actor_loss.backward() - actor_optimizer.step() - - if args.autotune: - with torch.no_grad(): - _, log_pi, _ = actor.get_action(data.observations) - alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() - - a_optimizer.zero_grad() - alpha_loss.backward() - a_optimizer.step() - alpha = log_alpha.exp().item() - - # update the target networks - if global_step % args.target_network_frequency 
== 0: - for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): - target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) - for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): - target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) - - - if global_step % 100 == 0: - wandb.log({ - "losses/qf1_values" : qf1_a_values.mean().item(), - "losses/qf2_values" : qf2_a_values.mean().item(), - "losses/qf1_loss" : qf1_loss.item(), - "losses/qf2_loss" : qf2_loss.item(), - "losses/qf_loss" : qf_loss.item() / 2.0, - "losses/actor_loss" : actor_loss.item(), - "losses/alpha" : alpha, - "charts/SPS" : int(global_step / (time.time() - start_time)), - "losses/alpha_loss" : alpha_loss.item() if args.autotune else 0.0, - "specific/intrinsic_reward_mean" : intrinsic_reward.mean().item(), - "specific/intrinsic_reward_max" : intrinsic_reward.max().item(), - "specific/intrinsic_reward_min" : intrinsic_reward.min().item(), - }, step = global_step) if args.track else None - - if global_step % args.metric_freq == 0 : - wandb.log({ - "charts/coverage" : env_check.get_coverage(), - "charts/shannon_entropy": env_check.shannon_entropy(), - }, step = global_step) if args.track else None - - if global_step % args.fig_frequency == 0 and global_step > args.learning_starts: - if args.make_gif : - # print('size rho', size_rho) - # print('max x rho', rb.observations[max(rb.pos if not rb.full else rb.buffer_size-size_rho, 0):rb.pos if not rb.full else rb.buffer_size][0][:,0].max()) - image = env_plot.gif(obs_un = rb.observations[np.random.randint(0, rb.pos if not rb.full else rb.buffer_size, 100_000)], - classifier = None, - device= device) - send_matrix(wandb, resize_image(image, 128,128), "gif", global_step) - - envs.close() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/PA_version/icm_sac.py b/cleanrl/cleanrl_explo/PA_version/icm_sac.py deleted file mode 100644 index 00d5a140..00000000 --- a/cleanrl/cleanrl_explo/PA_version/icm_sac.py +++ /dev/null @@ -1,476 +0,0 @@ -# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy -import os -import random -import time -from dataclasses import dataclass - -import gymnasium as gym -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import tyro -from stable_baselines3.common.buffers import ReplayBuffer -from envs.wenv import Wenv -from envs.config_env import config -from src.utils.wandb_utils import send_matrix - - -@dataclass -class Args: - exp_name: str = os.path.basename(__file__)[: -len(".py")] - """the name of this experiment""" - seed: int = 1 - """seed of the experiment""" - torch_deterministic: bool = True - """if toggled, `torch.backends.cudnn.deterministic=False`""" - cuda: bool = True - """if toggled, cuda will be enabled by default""" - track: bool = True - """if toggled, this experiment will be tracked with Weights and Biases""" - wandb_project_name: str = "contrastive_test_2" - """the wandb's project name""" - wandb_entity: str = None - """the entity (team) of wandb's project""" - capture_video: bool = False - """whether to capture videos of the agent performances (check out `videos` folder)""" - use_hp_file : bool = False - """if toggled, will load the hyperparameters from file""" - hp_file: str = "hyper_parameters_sac.json" - """the path to the hyperparameters json file""" - sweep_mode: bool = False - """if toggled, will log the sweep 
id to wandb""" - - # GIF - make_gif: bool = True - """if toggled, will make gif """ - plotly: bool = False - """if toggled, will use plotly instead of matplotlib""" - fig_frequency: int = 1000 - """the frequency of logging the figures""" - metric_freq: int = 1000 - """the frequency of ploting metric""" - - - - # Algorithm specific arguments - env_id: str = "Maze-Ur-v0" - """the environment id of the task""" - total_timesteps: int = 1000000 - """total timesteps of the experiments""" - buffer_size: int = int(1e7) - """the replay memory buffer size""" - gamma: float = 0.99 - """the discount factor gamma""" - tau: float = 0.005 - """target smoothing coefficient (default: 0.005)""" - batch_size: int = 256 - """the batch size of sample from the reply memory""" - learning_starts: int = 5e3 - """timestep to start learning""" - policy_lr: float = 3e-4 - """the learning rate of the policy network optimizer""" - q_lr: float = 1e-3 - """the learning rate of the Q network network optimizer""" - policy_frequency: int = 2 - """the frequency of training policy (delayed)""" - target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. - """the frequency of updates for the target nerworks""" - alpha: float = 0.1 - """Entropy regularization coefficient.""" - autotune: bool = False - """automatic tuning of the entropy coefficient""" - num_envs: int = 4 - """ num of parallel envs """ - - # ICM SPECIFIC - nb_epoch_before_training: int = 8 - """ nb epoch between each training """ - icm_lr: float = 5e-4 - """the learning rate of the icm""" - icm_epochs: int = 4 - """the number of epochs for the icm""" - beta: float = 0.2 - """the beta of the icm""" - clip_intrinsic_reward: float = 10.0 - """the clipping of the intrinsic reward""" - - keep_extrinsic_reward: bool = False - """if toggled, the extrinsic reward will be kept""" - coef_intrinsic : float = 1.0 - """the coefficient of the intrinsic reward""" - coef_extrinsic : float = 1.0 - """the coefficient of the extrinsic reward""" - -def make_env(env_id, idx, capture_video, run_name): - def thunk(): - env = Wenv(env_id=env_id, xp_id=run_name, **config[env_id]) - env = gym.wrappers.FlattenObservation(env) # deal with dm_control's Dict observation space - env = gym.wrappers.RecordEpisodeStatistics(env) - if capture_video: - if idx == 0: - env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") - env = gym.wrappers.ClipAction(env) - return env - - return thunk - - -# ALGO LOGIC: initialize agent here: -class SoftQNetwork(nn.Module): - def __init__(self, env): - super().__init__() - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) - self.fc2 = nn.Linear(256, 256) - self.fc3 = nn.Linear(256, 1) - - def forward(self, x, a): - x = torch.cat([x, a], 1) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - -LOG_STD_MAX = 2 -LOG_STD_MIN = -5 - - -class Actor(nn.Module): - def __init__(self, env): - super().__init__() - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) - self.fc2 = nn.Linear(256, 256) - self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) - self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) - # action rescaling - self.register_buffer( - "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) - ) - self.register_buffer( - "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 
2.0, dtype=torch.float32) - ) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - mean = self.fc_mean(x) - log_std = self.fc_logstd(x) - log_std = torch.tanh(log_std) - log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats - - return mean, log_std - - def get_action(self, x): - mean, log_std = self(x) - std = log_std.exp() - normal = torch.distributions.Normal(mean, std) - x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) - y_t = torch.tanh(x_t) - action = y_t * self.action_scale + self.action_bias - log_prob = normal.log_prob(x_t) - # Enforcing Action Bound - log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) - log_prob = log_prob.sum(1, keepdim=True) - mean = torch.tanh(mean) * self.action_scale + self.action_bias - return action, log_prob, mean - -class ICM(nn.Module): - def __init__(self, state_dim, action_dim, feature_dim = 64, beta = 0.2, device = 'cpu'): - super(ICM, self).__init__() - # feature network - self.f1 = nn.Linear(state_dim, 256, device=device) - self.f2 = nn.Linear(256, 64, device=device) - self.f3 = nn.Linear(64, feature_dim, device=device) - # inverse model - self.i1 = nn.Linear(2*feature_dim, 64, device=device) - self.i2 = nn.Linear(64, action_dim, device=device) - # forward model - self.fo1 = nn.Linear(feature_dim + action_dim, 64, device=device) - self.fo2 = nn.Linear(64, feature_dim, device=device) - # beta - self.beta = beta - self.device = device - - def feature(self, x): - x = F.relu(self.f1(x)) - x = F.relu(self.f2(x)) - x = self.f3(x) - return x - def inverse(self, f1, f2): - x = torch.cat([f1, f2], dim = 1) - x = F.relu(self.i1(x)) - x = self.i2(x) - return x - def forward_t(self, f1, a): - x = torch.cat([f1, a], dim = 1) - x = F.relu(self.fo1(x)) - x = self.fo2(x) - return x - - def loss(self, obs, next_obs, action, dones, reduce = True): - # feature - f = self.feature(obs) - f_next = self.feature(next_obs) - # inverse - a_pred = self.inverse(f, f_next) - # forward - f_next_pred = self.forward_t(f, action) - # loss - loss_inverse = F.mse_loss(a_pred, action, reduction = 'none').sum(1) if not reduce else F.mse_loss(a_pred, action) - loss_forward = F.mse_loss(f_next_pred, f_next, reduction = 'none').sum(1) if not reduce else F.mse_loss(f_next_pred, f_next) - return self.beta * loss_forward + (1 - self.beta) * loss_inverse - - -if __name__ == "__main__": - import stable_baselines3 as sb3 - - if sb3.__version__ < "2.0": - raise ValueError( - """Ongoing migration: run the following command to install the new dependencies: -poetry run pip install "stable_baselines3==2.0.0a1" -""" - ) - - args = tyro.cli(Args) - if args.use_hp_file: - import json - with open(args.hp_file, "r") as f: - type_id = config[args.env_id]['type_id'] - hp = json.load(f)['hyperparameters'][type_id][args.exp_name] - for k, v in hp.items(): - setattr(args, k, v) - - - run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" - if args.track: - import wandb - if args.sweep_mode: - wandb.init() - # set config from sweep - wandb.config.update(args) - else : - wandb.init( - project=args.wandb_project_name, - entity=args.wandb_entity, - sync_tensorboard=False, - config=vars(args), - name=run_name, - monitor_gym=True, - save_code=True, - ) - - # PLOTTING - if args.make_gif: - env_plot = Wenv(env_id=args.env_id, - render_bool_matplot=True, - xp_id=run_name, - **config[args.env_id]) - if args.plotly: - env_plot = Wenv(env_id=args.env_id, - render_bool_plotly=True, 
- xp_id=run_name, - **config[args.env_id]) - # coverage check env - env_check = Wenv(env_id=args.env_id, - render_bool_matplot=False, - xp_id=run_name, - **config[args.env_id]) - - # TRY NOT TO MODIFY: seeding - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = args.torch_deterministic - - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") - - # env setup - # env setup - envs = gym.vector.SyncVectorEnv( - [make_env(args.env_id, i, args.capture_video, run_name) for i in range(args.num_envs)] - ) - assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" - - max_step = config[args.env_id]['kwargs']['max_episode_steps'] - max_action = float(envs.single_action_space.high[0]) - - actor = Actor(envs).to(device) - qf1 = SoftQNetwork(envs).to(device) - qf2 = SoftQNetwork(envs).to(device) - qf1_target = SoftQNetwork(envs).to(device) - qf2_target = SoftQNetwork(envs).to(device) - qf1_target.load_state_dict(qf1.state_dict()) - qf2_target.load_state_dict(qf2.state_dict()) - q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) - actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) - icm = ICM(state_dim=np.array(envs.single_observation_space.shape).prod(), - action_dim=np.array(envs.single_action_space.shape).prod(), - feature_dim=64, - beta=args.beta, - device=device).to(device) - optimizer_icm = optim.Adam(icm.parameters(), lr=args.icm_lr, eps=1e-5) - - # Automatic entropy tuning - if args.autotune: - target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() - log_alpha = torch.zeros(1, requires_grad=True, device=device) - alpha = log_alpha.exp().item() - a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) - else: - alpha = args.alpha - - envs.single_observation_space.dtype = np.float32 - rb = ReplayBuffer( - args.buffer_size, - envs.single_observation_space, - envs.single_action_space, - device, - handle_timeout_termination=False, - n_envs= args.num_envs - ) - start_time = time.time() - - # TRY NOT TO MODIFY: start the game - obs, _ = envs.reset(seed=args.seed) - for global_step in range(args.total_timesteps): - # coverage assessment - env_check.update_coverage(obs) - # ALGO LOGIC: put action logic here - if global_step < args.learning_starts: - actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) - else: - actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) - actions = actions.detach().cpu().numpy() - - # TRY NOT TO MODIFY: execute the game and log data. 
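icm_sac.py builds its bonus from the ICM losses: an inverse model predicts the action from consecutive state features, a forward model predicts the next feature from (feature, action), and the per-sample weighted sum beta * forward_error + (1 - beta) * inverse_error is used directly as the intrinsic reward in the update loop below (canonical ICM rewards only the forward error). A compact sketch assuming an icm module with the feature, inverse, and forward_t methods defined above; the helper name icm_intrinsic_reward is hypothetical:

import torch
import torch.nn.functional as F

def icm_intrinsic_reward(icm, obs, next_obs, actions, beta=0.2):
    # Per-sample ICM error used as the exploration bonus (reduce=False path).
    with torch.no_grad():
        f = icm.feature(obs)
        f_next = icm.feature(next_obs)
        a_pred = icm.inverse(f, f_next)            # inverse model: predict the action
        f_next_pred = icm.forward_t(f, actions)    # forward model: predict the next feature
        inverse_err = F.mse_loss(a_pred, actions, reduction="none").sum(1)
        forward_err = F.mse_loss(f_next_pred, f_next, reduction="none").sum(1)
        return beta * forward_err + (1 - beta) * inverse_err   # shape [B]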
- next_obs, rewards, terminations, truncations, infos = envs.step(actions) - - # TRY NOT TO MODIFY: record rewards for plotting purposes - if "final_info" in infos: - for info in infos["final_info"]: - if info is not None: - print(f"global_step={global_step}, episodic_return={info['episode']['r']}, episodic_length={info['episode']['l']}") - wandb.log({ - "charts/episodic_return" : info["episode"]["r"], - "charts/episodic_length" : info["episode"]["l"], - }, step = global_step) if args.track else None - - - # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` - real_next_obs = next_obs.copy() - for idx, trunc in enumerate(truncations): - if trunc: - real_next_obs[idx] = infos["final_observation"][idx] - rb.add(obs, real_next_obs, actions, rewards, terminations, infos) - - # TRY NOT TO MODIFY: CRUCIAL step easy to overlook - obs = next_obs - - # ICM TRAINING - if global_step % args.nb_epoch_before_training*max_step == 0 and global_step > args.learning_starts: - mean_icm_loss = 0.0 - for _ in range(args.icm_epochs): - # for _ in range(int(args.nb_epoch_before_training*max_step/args.vae_batch_size)): - data = rb.sample(args.batch_size) - icm_loss = icm.loss(data.observations, data.next_observations, data.actions, data.dones, reduce = True) - optimizer_icm.zero_grad() - icm_loss.mean().backward() - optimizer_icm.step() - mean_icm_loss += icm_loss.mean().item() - wandb.log({ - "losses/icm_loss" : mean_icm_loss / args.icm_epochs, - }, step = global_step) if args.track else None - - # ALGO LOGIC: training. - if global_step > args.learning_starts: - data = rb.sample(args.batch_size) - with torch.no_grad(): - next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) - qf1_next_target = qf1_target(data.next_observations, next_state_actions) - qf2_next_target = qf2_target(data.next_observations, next_state_actions) - min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi - intrinsic_reward = icm.loss(data.observations, data.next_observations, data.actions, data.dones, reduce = False) - extrinsic_reward = data.rewards.flatten() - if args.keep_extrinsic_reward: - rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic - else: - rewards = intrinsic_reward*args.coef_intrinsic - next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) - - qf1_a_values = qf1(data.observations, data.actions).view(-1) - qf2_a_values = qf2(data.observations, data.actions).view(-1) - qf1_loss = F.mse_loss(qf1_a_values, next_q_value) - qf2_loss = F.mse_loss(qf2_a_values, next_q_value) - qf_loss = qf1_loss + qf2_loss - - # optimize the model - q_optimizer.zero_grad() - qf_loss.backward() - q_optimizer.step() - - if global_step % args.policy_frequency == 0: # TD 3 Delayed update support - for _ in range( - args.policy_frequency - ): # compensate for the delay by doing 'actor_update_interval' instead of 1 - pi, log_pi, _ = actor.get_action(data.observations) - qf1_pi = qf1(data.observations, pi) - qf2_pi = qf2(data.observations, pi) - min_qf_pi = torch.min(qf1_pi, qf2_pi) - actor_loss = ((alpha * log_pi) - min_qf_pi).mean() - - actor_optimizer.zero_grad() - actor_loss.backward() - actor_optimizer.step() - - if args.autotune: - with torch.no_grad(): - _, log_pi, _ = actor.get_action(data.observations) - alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() - - a_optimizer.zero_grad() - alpha_loss.backward() - a_optimizer.step() - alpha = log_alpha.exp().item() - - # 
update the target networks - if global_step % args.target_network_frequency == 0: - for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): - target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) - for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): - target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) - - - if global_step % 100 == 0: - wandb.log({ - "losses/qf1_values" : qf1_a_values.mean().item(), - "losses/qf2_values" : qf2_a_values.mean().item(), - "losses/qf1_loss" : qf1_loss.item(), - "losses/qf2_loss" : qf2_loss.item(), - "losses/qf_loss" : qf_loss.item() / 2.0, - "losses/actor_loss" : actor_loss.item(), - "losses/alpha" : alpha, - "charts/SPS" : int(global_step / (time.time() - start_time)), - "losses/alpha_loss" : alpha_loss.item() if args.autotune else 0.0, - "specific/intrinsic_reward_mean" : intrinsic_reward.mean().item(), - "specific/intrinsic_reward_max" : intrinsic_reward.max().item(), - "specific/intrinsic_reward_min" : intrinsic_reward.min().item(), - }, step = global_step) if args.track else None - - if global_step % args.metric_freq == 0 : - wandb.log({ - "charts/coverage" : env_check.get_coverage(), - "charts/shannon_entropy": env_check.shannon_entropy(), - }, step = global_step) if args.track else None - - if global_step % args.fig_frequency == 0 and global_step > args.learning_starts: - if args.make_gif : - # print('size rho', size_rho) - # print('max x rho', rb.observations[max(rb.pos if not rb.full else rb.buffer_size-size_rho, 0):rb.pos if not rb.full else rb.buffer_size][0][:,0].max()) - image = env_plot.gif(obs_un = rb.observations[np.random.randint(0, rb.pos if not rb.full else rb.buffer_size, 100_000)], - classifier = None, - device= device) - send_matrix(wandb, image, "gif", global_step) - - envs.close() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/PA_version/ngu_sac.py b/cleanrl/cleanrl_explo/PA_version/ngu_sac.py deleted file mode 100644 index 1f658098..00000000 --- a/cleanrl/cleanrl_explo/PA_version/ngu_sac.py +++ /dev/null @@ -1,549 +0,0 @@ -# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy -import os -import random -import time -from dataclasses import dataclass - -import gymnasium as gym -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import tyro -from stable_baselines3.common.buffers import ReplayBuffer -from envs.wenv import Wenv -from envs.config_env import config -from src.utils.wandb_utils import send_matrix - - -@dataclass -class Args: - exp_name: str = os.path.basename(__file__)[: -len(".py")] - """the name of this experiment""" - seed: int = 1 - """seed of the experiment""" - torch_deterministic: bool = True - """if toggled, `torch.backends.cudnn.deterministic=False`""" - cuda: bool = True - """if toggled, cuda will be enabled by default""" - track: bool = True - """if toggled, this experiment will be tracked with Weights and Biases""" - wandb_project_name: str = "contrastive_test_2" - """the wandb's project name""" - wandb_entity: str = None - """the entity (team) of wandb's project""" - capture_video: bool = False - """whether to capture videos of the agent performances (check out `videos` folder)""" - use_hp_file : bool = False - """if toggled, will load the hyperparameters from file""" - hp_file: str = "hyper_parameters_sac.json" - """the path to the hyperparameters json file""" - 
sweep_mode: bool = False - """if toggled, will log the sweep id to wandb""" - - # GIF - make_gif: bool = True - """if toggled, will make gif """ - plotly: bool = False - """if toggled, will use plotly instead of matplotlib""" - fig_frequency: int = 1000 - """the frequency of logging the figures""" - metric_freq: int = 1000 - """the frequency of ploting metric""" - - - - # Algorithm specific arguments - env_id: str = "Maze-Ur-v0" - """the environment id of the task""" - total_timesteps: int = 1000000 - """total timesteps of the experiments""" - buffer_size: int = int(1e7) - """the replay memory buffer size""" - gamma: float = 0.99 - """the discount factor gamma""" - tau: float = 0.005 - """target smoothing coefficient (default: 0.005)""" - batch_size: int = 256 - """the batch size of sample from the reply memory""" - learning_starts: int = 5e3 - """timestep to start learning""" - policy_lr: float = 3e-4 - """the learning rate of the policy network optimizer""" - q_lr: float = 1e-3 - """the learning rate of the Q network network optimizer""" - policy_frequency: int = 4 - """the frequency of training policy (delayed)""" - learning_frequency: int = 2 - """the frequency of training the Q network""" - target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. - """the frequency of updates for the target nerworks""" - alpha: float = 0.1 - """Entropy regularization coefficient.""" - autotune: bool = False - """automatic tuning of the entropy coefficient""" - num_envs: int = 4 - """ num of parallel envs """ - - - # NGU SPECIFIC - ngu_lr: float = 1e-4 - """the learning rate of the ngu""" - ngu_epochs: int = 1 - """the number of epochs for the ngu""" - nb_epoch_before_training: int = 8 - """ nb epoch between each training """ - ngu_feature_dim: int = 64 - """the feature dimension of the ngu""" - k_nearest: int = 8 - """the number of nearest neighbors""" - clip_reward:float = 5.0 - """the clip reward""" - - - keep_extrinsic_reward: bool = False - """if toggled, the extrinsic reward will be kept""" - coef_intrinsic : float = 1.0 - """the coefficient of the intrinsic reward""" - coef_extrinsic : float = 1.0 - """the coefficient of the extrinsic reward""" - - -def make_env(env_id, idx, capture_video, run_name): - def thunk(): - env = Wenv(env_id=env_id, xp_id=run_name, **config[env_id]) - env = gym.wrappers.FlattenObservation(env) # deal with dm_control's Dict observation space - env = gym.wrappers.RecordEpisodeStatistics(env) - if capture_video: - if idx == 0: - env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") - env = gym.wrappers.ClipAction(env) - return env - - return thunk - - -# ALGO LOGIC: initialize agent here: -class SoftQNetwork(nn.Module): - def __init__(self, env): - super().__init__() - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) - self.fc2 = nn.Linear(256, 256) - self.fc3 = nn.Linear(256, 1) - - def forward(self, x, a): - x = torch.cat([x, a], 1) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - -LOG_STD_MAX = 2 -LOG_STD_MIN = -5 - - -class Actor(nn.Module): - def __init__(self, env): - super().__init__() - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) - self.fc2 = nn.Linear(256, 256) - self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) - self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) - # action rescaling - self.register_buffer( - "action_scale", 
torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) - ) - self.register_buffer( - "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) - ) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - mean = self.fc_mean(x) - log_std = self.fc_logstd(x) - log_std = torch.tanh(log_std) - log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats - - return mean, log_std - - def get_action(self, x): - mean, log_std = self(x) - std = log_std.exp() - normal = torch.distributions.Normal(mean, std) - x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) - y_t = torch.tanh(x_t) - action = y_t * self.action_scale + self.action_bias - log_prob = normal.log_prob(x_t) - # Enforcing Action Bound - log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) - log_prob = log_prob.sum(1, keepdim=True) - mean = torch.tanh(mean) * self.action_scale + self.action_bias - return action, log_prob, mean - -class NGU(nn.Module): - def __init__(self, state_dim, action_dim, feature_dim, device, k = 10, c = 0.001, L=5, eps = 1e-3, clip_reward = 5.0): - super(NGU, self).__init__() - # RND - # trained network - self.f1 = nn.Linear(state_dim, 128, device=device) - self.f2 = nn.Linear(128, 64, device=device) - self.f3 = nn.Linear(64, 1, device=device) - # target network - self.f1_t = nn.Linear(state_dim, 128, device=device) - self.f2_t = nn.Linear(128, 64, device=device) - self.f3_t = nn.Linear(64, 1, device=device) - # embedding network - self.f1_z = nn.Linear(state_dim, 128, device=device) - self.f2_z = nn.Linear(128, 64, device=device) - self.f3_z = nn.Linear(64, feature_dim, device=device) - # action network - self.f1_a = nn.Linear(feature_dim*2 , 128, device=device) - self.f2_a = nn.Linear(128, 64, device=device) - self.f3_a = nn.Linear(64, action_dim, device=device) - # running mean and std of rnd error - self.running_rnd_mean = 0.0 - self.running_rnd_std = 1.0 - self.dm2 = 0.0 - # HP NGU - self.k = k - self.c = c - self.L = L - self.epsilon = eps - self.clip_reward = clip_reward - - def forward(self, x): - x = F.relu(self.f1(x)) - x = F.relu(self.f2(x)) - x = self.f3(x) - return x - - def forward_t(self, x): - with torch.no_grad(): - x = F.relu(self.f1_t(x)) - x = F.relu(self.f2_t(x)) - x = self.f3_t(x) - return x - - def rnd_loss(self, x, reduce = True): - return F.mse_loss(self.forward(x), self.forward_t(x)) if reduce else F.mse_loss(self.forward(x), self.forward_t(x), reduction = 'none') - - def embedding(self, s): - x = F.relu(self.f1_z(s)) - x = F.relu(self.f2_z(x)) - x = self.f3_z(x) - return x - - def action_pred(self, s0, s1): - x = torch.cat([s0, s1], 1) - x = F.relu(self.f1_a(x)) - x = F.relu(self.f2_a(x)) - x = self.f3_a(x) - return x - - def reward_episode(self, s, episode): - z_s = self.embedding(s) - z_episode = self.embedding(episode) - dist = torch.norm(z_s - z_episode, dim=1) - kernel = self.epsilon/(dist/self.dm2 + self.epsilon) - top_k_kernel = torch.topk(kernel, self.k, largest = True) - top_k = torch.topk(dist, self.k, largest = False) - # update running mean and std - self.dm2 = 0.99 * self.dm2 + 0.01 * top_k.values.mean().item() - # rnd loss - rnd_loss = self.rnd_loss(s, reduce = False).item() - # episodic reward - reward_episodic = (1/(torch.sqrt(top_k_kernel.values.mean()) + self.c)).item() - # ngu reward - ngu_reward = reward_episodic * min(max(rnd_loss, 1), self.L) - # clip reward - 
ngu_reward = np.clip(ngu_reward, -self.clip_reward, self.clip_reward) - return ngu_reward - - - - def loss(self,s,s_next,a,d): - rnd_loss = self.rnd_loss(s) - # update running mean and std - self.running_rnd_mean = 0.99 * self.running_rnd_mean + 0.01 * rnd_loss.mean().item() - self.running_rnd_std = 0.99 * self.running_rnd_std + 0.01 * rnd_loss.std().item() - # NGU loss - s0 = self.embedding(s) - s1 = self.embedding(s_next) - h_loss = (self.action_pred(s0, s1) - a)**2 * (1-d) - return rnd_loss + h_loss.mean() - - - - - - - -if __name__ == "__main__": - import stable_baselines3 as sb3 - - if sb3.__version__ < "2.0": - raise ValueError( - """Ongoing migration: run the following command to install the new dependencies: -poetry run pip install "stable_baselines3==2.0.0a1" -""" - ) - - args = tyro.cli(Args) - if args.use_hp_file: - import json - with open(args.hp_file, "r") as f: - type_id = config[args.env_id]['type_id'] - hp = json.load(f)['hyperparameters'][type_id][args.exp_name] - for k, v in hp.items(): - setattr(args, k, v) - - - run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" - if args.track: - import wandb - if args.sweep_mode: - wandb.init() - # set config from sweep - wandb.config.update(args) - else : - wandb.init( - project=args.wandb_project_name, - entity=args.wandb_entity, - sync_tensorboard=False, - config=vars(args), - name=run_name, - monitor_gym=True, - save_code=True, - ) - - # PLOTTING - if args.make_gif: - env_plot = Wenv(env_id=args.env_id, - render_bool_matplot=True, - xp_id=run_name, - **config[args.env_id]) - if args.plotly: - env_plot = Wenv(env_id=args.env_id, - render_bool_plotly=True, - xp_id=run_name, - **config[args.env_id]) - # coverage check env - env_check = Wenv(env_id=args.env_id, - render_bool_matplot=False, - xp_id=run_name, - **config[args.env_id]) - - # TRY NOT TO MODIFY: seeding - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = args.torch_deterministic - - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") - - # env setup - # env setup - envs = gym.vector.SyncVectorEnv( - [make_env(args.env_id, i, args.capture_video, run_name) for i in range(args.num_envs)] - ) - assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" - - max_step = config[args.env_id]['kwargs']['max_episode_steps'] - max_action = float(envs.single_action_space.high[0]) - - actor = Actor(envs).to(device) - qf1 = SoftQNetwork(envs).to(device) - qf2 = SoftQNetwork(envs).to(device) - qf1_target = SoftQNetwork(envs).to(device) - qf2_target = SoftQNetwork(envs).to(device) - qf1_target.load_state_dict(qf1.state_dict()) - qf2_target.load_state_dict(qf2.state_dict()) - q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) - actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) - ngu = NGU(state_dim=envs.single_observation_space.shape[0], - action_dim=envs.single_action_space.shape[0], - feature_dim=args.ngu_feature_dim, - device=device, - k = args.k_nearest, - clip_reward=args.clip_reward).to(device) - optimizer_ngu = optim.Adam(ngu.parameters(), lr=args.ngu_lr, eps=1e-5) - episodes = [ [] for _ in range(args.num_envs)] - - # Automatic entropy tuning - if args.autotune: - target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() - log_alpha = torch.zeros(1, requires_grad=True, device=device) - alpha = log_alpha.exp().item() 
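Note on NGU.reward_episode above: the episodic part of the bonus is an inverse-kernel pseudo-count over the k nearest embedded neighbours within the current episode, and the lifelong part is the RND error clipped into [1, L] and used as a multiplier. A stripped-down sketch of the episodic term only, assuming the embeddings are already computed and omitting the running distance normaliser dm2 (k, c and eps mirror the defaults above):

import torch

def episodic_bonus(z_s: torch.Tensor, z_episode: torch.Tensor,
                   k: int = 8, c: float = 0.001, eps: float = 1e-3) -> float:
    # z_s: (feature_dim,) embedding of the current state
    # z_episode: (n, feature_dim) embeddings of the states seen so far this episode
    dist = torch.norm(z_s.unsqueeze(0) - z_episode, dim=1)              # (n,) distances
    kernel = eps / (dist + eps)                                         # large when close to a visited state
    top_k = torch.topk(kernel, min(k, kernel.numel()), largest=True).values
    return (1.0 / (torch.sqrt(top_k.mean()) + c)).item()                # small bonus near familiar states

# Illustrative usage with random 8-dimensional embeddings:
bonus = episodic_bonus(torch.randn(8), torch.randn(32, 8))

This is also why the rollout loop further down only adds the bonus once the per-env episode buffer holds more than k_nearest states.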
- a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) - else: - alpha = args.alpha - - envs.single_observation_space.dtype = np.float32 - rb = ReplayBuffer( - args.buffer_size, - envs.single_observation_space, - envs.single_action_space, - device, - handle_timeout_termination=False, - n_envs= args.num_envs - ) - start_time = time.time() - - # TRY NOT TO MODIFY: start the game - obs, _ = envs.reset(seed=args.seed) - for global_step in range(args.total_timesteps): - # coverage assessment - env_check.update_coverage(obs) - # ALGO LOGIC: put action logic here - if global_step < args.learning_starts: - actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) - else: - actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) - actions = actions.detach().cpu().numpy() - - # TRY NOT TO MODIFY: execute the game and log data. - next_obs, rewards, terminations, truncations, infos = envs.step(actions) - - # COMPUTE REWARD - intrinsic_reward = torch.zeros(args.num_envs) - for idx in range(args.num_envs): - with torch.no_grad(): - # rewards NGU - intrinsic_reward[idx] = ngu.reward_episode(torch.tensor(obs[idx]).unsqueeze(0).to(device), torch.tensor(np.array(episodes[idx])).to(device)) if len(episodes[idx]) > args.k_nearest else 0.0 - extrinsic_reward = rewards - if args.keep_extrinsic_reward: - rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic - else: - rewards = intrinsic_reward*args.coef_intrinsic - - - # TRY NOT TO MODIFY: record rewards for plotting purposes - if "final_info" in infos: - for info in infos["final_info"]: - if info is not None: - print(f"global_step={global_step}, episodic_return={info['episode']['r']}, episodic_length={info['episode']['l']}") - wandb.log({ - "charts/episodic_return" : info["episode"]["r"], - "charts/episodic_length" : info["episode"]["l"], - }, step = global_step) if args.track else None - - - # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` - real_next_obs = next_obs.copy() - for idx, trunc in enumerate(truncations): - if trunc: - real_next_obs[idx] = infos["final_observation"][idx] - episodes[idx] = [] - rb.add(obs, real_next_obs, actions, rewards, terminations, infos) - for idx, (ob, ac, rew, next_ob) in enumerate(zip(obs, actions, rewards, real_next_obs)): episodes[idx].append(ob) - - # TRY NOT TO MODIFY: CRUCIAL step easy to overlook - obs = next_obs - - - - # ngu TRAINING - if global_step % args.nb_epoch_before_training*max_step == 0 and global_step > args.learning_starts: - mean_ngu_loss = 0.0 - for _ in range(args.ngu_epochs): - # for _ in range(int(args.nb_epoch_before_training*max_step/args.vae_batch_size)): - data = rb.sample(args.batch_size) - ngu_loss = ngu.loss(data.observations, data.next_observations, data.actions, data.dones) - optimizer_ngu.zero_grad() - ngu_loss.backward() - optimizer_ngu.step() - mean_ngu_loss += ngu_loss.item() - wandb.log({ - "losses/ngu_loss" : mean_ngu_loss / args.ngu_epochs, - }, step = global_step) if args.track else None - - - - # ALGO LOGIC: training. 
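Note on the scheduling check in the NGU training hunk above, `if global_step % args.nb_epoch_before_training*max_step == 0 ...` (the same pattern appears in the RND script below): in Python, % and * share precedence and associate left to right, so this parses as (global_step % args.nb_epoch_before_training) * max_step == 0 and therefore fires every nb_epoch_before_training steps, not every nb_epoch_before_training * max_step steps. If the latter was the intent, the condition needs parentheses. A small self-contained check (the values are illustrative, not the scripts' actual configuration):

# `%` and `*` have equal precedence and associate left-to-right in Python.
nb_epoch_before_training, max_step = 8, 200
for global_step in (8, 16, 1600):
    as_written = (global_step % nb_epoch_before_training * max_step == 0)     # (gs % nb) * max_step == 0
    parenthesised = (global_step % (nb_epoch_before_training * max_step) == 0)
    print(global_step, as_written, parenthesised)
# 8 and 16 already trigger the as-written form; only 1600 triggers the parenthesised one.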
- if global_step > args.learning_starts and global_step % args.learning_frequency == 0: - data = rb.sample(args.batch_size) - with torch.no_grad(): - next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) - qf1_next_target = qf1_target(data.next_observations, next_state_actions) - qf2_next_target = qf2_target(data.next_observations, next_state_actions) - min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi - next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) - - qf1_a_values = qf1(data.observations, data.actions).view(-1) - qf2_a_values = qf2(data.observations, data.actions).view(-1) - qf1_loss = F.mse_loss(qf1_a_values, next_q_value) - qf2_loss = F.mse_loss(qf2_a_values, next_q_value) - qf_loss = qf1_loss + qf2_loss - - # optimize the model - q_optimizer.zero_grad() - qf_loss.backward() - q_optimizer.step() - - if global_step % args.policy_frequency == 0: # TD 3 Delayed update support - for _ in range( - args.policy_frequency - ): # compensate for the delay by doing 'actor_update_interval' instead of 1 - pi, log_pi, _ = actor.get_action(data.observations) - qf1_pi = qf1(data.observations, pi) - qf2_pi = qf2(data.observations, pi) - min_qf_pi = torch.min(qf1_pi, qf2_pi) - actor_loss = ((alpha * log_pi) - min_qf_pi).mean() - - actor_optimizer.zero_grad() - actor_loss.backward() - actor_optimizer.step() - - if args.autotune: - with torch.no_grad(): - _, log_pi, _ = actor.get_action(data.observations) - alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() - - a_optimizer.zero_grad() - alpha_loss.backward() - a_optimizer.step() - alpha = log_alpha.exp().item() - - # update the target networks - if global_step % args.target_network_frequency == 0: - for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): - target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) - for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): - target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) - - - if global_step % 100 == 0: - wandb.log({ - "losses/qf1_values" : qf1_a_values.mean().item(), - "losses/qf2_values" : qf2_a_values.mean().item(), - "losses/qf1_loss" : qf1_loss.item(), - "losses/qf2_loss" : qf2_loss.item(), - "losses/qf_loss" : qf_loss.item() / 2.0, - "losses/actor_loss" : actor_loss.item(), - "losses/alpha" : alpha, - "charts/SPS" : int(global_step / (time.time() - start_time)), - "losses/alpha_loss" : alpha_loss.item() if args.autotune else 0.0, - "specific/intrinsic_reward_mean" :data.rewards.mean().item(), - "specific/intrinsic_reward_max" :data.rewards.max().item(), - "specific/intrinsic_reward_min" :data.rewards.min().item(), - }, step = global_step) if args.track else None - - if global_step % args.metric_freq == 0 : - wandb.log({ - "charts/coverage" : env_check.get_coverage(), - "charts/shannon_entropy": env_check.shannon_entropy(), - }, step = global_step) if args.track else None - - if global_step % args.fig_frequency == 0 and global_step > args.learning_starts: - if args.make_gif : - # print('size rho', size_rho) - # print('max x rho', rb.observations[max(rb.pos if not rb.full else rb.buffer_size-size_rho, 0):rb.pos if not rb.full else rb.buffer_size][0][:,0].max()) - image = env_plot.gif(obs_un = rb.observations[np.random.randint(0, rb.pos if not rb.full else rb.buffer_size, 100_000)], - classifier = None, - device= device) - send_matrix(wandb, image, 
"gif", global_step) - - envs.close() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/PA_version/rnd_sac.py b/cleanrl/cleanrl_explo/PA_version/rnd_sac.py deleted file mode 100644 index c37cbcf6..00000000 --- a/cleanrl/cleanrl_explo/PA_version/rnd_sac.py +++ /dev/null @@ -1,453 +0,0 @@ -# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy -import os -import random -import time -from dataclasses import dataclass - -import gymnasium as gym -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import tyro -from stable_baselines3.common.buffers import ReplayBuffer -from envs.wenv import Wenv -from envs.config_env import config -from src.utils.wandb_utils import send_matrix - - -@dataclass -class Args: - exp_name: str = os.path.basename(__file__)[: -len(".py")] - """the name of this experiment""" - seed: int = 1 - """seed of the experiment""" - torch_deterministic: bool = True - """if toggled, `torch.backends.cudnn.deterministic=False`""" - cuda: bool = True - """if toggled, cuda will be enabled by default""" - track: bool = True - """if toggled, this experiment will be tracked with Weights and Biases""" - wandb_project_name: str = "contrastive_test_2" - """the wandb's project name""" - wandb_entity: str = None - """the entity (team) of wandb's project""" - capture_video: bool = False - """whether to capture videos of the agent performances (check out `videos` folder)""" - use_hp_file : bool = False - """if toggled, will load the hyperparameters from file""" - hp_file: str = "hyper_parameters.json" - """the path to the hyperparameters json file""" - sweep_mode: bool = False - """if toggled, will log the sweep id to wandb""" - - # GIF - make_gif: bool = True - """if toggled, will make gif """ - plotly: bool = False - """if toggled, will use plotly instead of matplotlib""" - fig_frequency: int = 1000 - """the frequency of logging the figures""" - metric_freq: int = 1000 - """the frequency of ploting metric""" - - - - # Algorithm specific arguments - env_id: str = "Maze-Ur-v0" - """the environment id of the task""" - total_timesteps: int = 1000000 - """total timesteps of the experiments""" - buffer_size: int = int(1e7) - """the replay memory buffer size""" - gamma: float = 0.99 - """the discount factor gamma""" - tau: float = 0.005 - """target smoothing coefficient (default: 0.005)""" - batch_size: int = 256 - """the batch size of sample from the reply memory""" - learning_starts: int = 5e3 - """timestep to start learning""" - policy_lr: float = 3e-4 - """the learning rate of the policy network optimizer""" - q_lr: float = 1e-3 - """the learning rate of the Q network network optimizer""" - policy_frequency: int = 2 - """the frequency of training policy (delayed)""" - target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. 
- """the frequency of updates for the target nerworks""" - alpha: float = 0.1 - """Entropy regularization coefficient.""" - autotune: bool = False - """automatic tuning of the entropy coefficient""" - num_envs: int = 4 - """ num of parallel envs """ - - - # RND SPECIFIC - rnd_lr: float = 1e-4 - """the learning rate of the RND""" - rnd_epochs: int = 1 - """the number of epochs for the RND""" - nb_epoch_before_training: int = 8 - """ nb epoch between each training """ - - - keep_extrinsic_reward: bool = False - """if toggled, the extrinsic reward will be kept""" - coef_intrinsic : float = 1000.0 - """the coefficient of the intrinsic reward""" - coef_extrinsic : float = 1.0 - """the coefficient of the extrinsic reward""" - - -def make_env(env_id, idx, capture_video, run_name): - def thunk(): - env = Wenv(env_id=env_id, xp_id=run_name, **config[env_id]) - env = gym.wrappers.FlattenObservation(env) # deal with dm_control's Dict observation space - env = gym.wrappers.RecordEpisodeStatistics(env) - if capture_video: - if idx == 0: - env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") - env = gym.wrappers.ClipAction(env) - return env - - return thunk - - -# ALGO LOGIC: initialize agent here: -class SoftQNetwork(nn.Module): - def __init__(self, env): - super().__init__() - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) - self.fc2 = nn.Linear(256, 256) - self.fc3 = nn.Linear(256, 1) - - def forward(self, x, a): - x = torch.cat([x, a], 1) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - -LOG_STD_MAX = 2 -LOG_STD_MIN = -5 - - -class Actor(nn.Module): - def __init__(self, env): - super().__init__() - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) - self.fc2 = nn.Linear(256, 256) - self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) - self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) - # action rescaling - self.register_buffer( - "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) - ) - self.register_buffer( - "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) - ) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - mean = self.fc_mean(x) - log_std = self.fc_logstd(x) - log_std = torch.tanh(log_std) - log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats - - return mean, log_std - - def get_action(self, x): - mean, log_std = self(x) - std = log_std.exp() - normal = torch.distributions.Normal(mean, std) - x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) - y_t = torch.tanh(x_t) - action = y_t * self.action_scale + self.action_bias - log_prob = normal.log_prob(x_t) - # Enforcing Action Bound - log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) - log_prob = log_prob.sum(1, keepdim=True) - mean = torch.tanh(mean) * self.action_scale + self.action_bias - return action, log_prob, mean - -class RND(nn.Module): - def __init__(self, state_dim, device): - super(RND, self).__init__() - # trained network - self.f1 = nn.Linear(state_dim, 256, device=device) - self.f2 = nn.Linear(256, 256, device=device) - self.f3 = nn.Linear(256, 1, device=device) - # target network - self.f1_t = nn.Linear(state_dim, 256, device=device) - self.f2_t = nn.Linear(256, 256, device=device) - self.f3_t = 
nn.Linear(256, 1, device=device) - - def forward(self, x): - x = F.relu(self.f1(x)) - x = F.relu(self.f2(x)) - x = self.f3(x) - return x - - def forward_t(self, x): - with torch.no_grad(): - x = F.relu(self.f1_t(x)) - x = F.relu(self.f2_t(x)) - x = self.f3_t(x) - return x - - def loss(self, x, reduce = True): - return F.mse_loss(self.forward(x), self.forward_t(x)) if reduce else F.mse_loss(self.forward(x), self.forward_t(x), reduction = 'none') - - -if __name__ == "__main__": - import stable_baselines3 as sb3 - - if sb3.__version__ < "2.0": - raise ValueError( - """Ongoing migration: run the following command to install the new dependencies: -poetry run pip install "stable_baselines3==2.0.0a1" -""" - ) - - args = tyro.cli(Args) - if args.use_hp_file: - import json - with open(args.hp_file, "r") as f: - type_id = config[args.env_id]['type_id'] - hp = json.load(f)['hyperparameters'][type_id][args.exp_name] - for k, v in hp.items(): - setattr(args, k, v) - - - run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" - if args.track: - import wandb - if args.sweep_mode: - wandb.init() - # set config from sweep - wandb.config.update(args) - else : - wandb.init( - project=args.wandb_project_name, - entity=args.wandb_entity, - sync_tensorboard=False, - config=vars(args), - name=run_name, - monitor_gym=True, - save_code=True, - ) - - # PLOTTING - if args.make_gif: - env_plot = Wenv(env_id=args.env_id, - render_bool_matplot=True, - xp_id=run_name, - **config[args.env_id]) - if args.plotly: - env_plot = Wenv(env_id=args.env_id, - render_bool_plotly=True, - xp_id=run_name, - **config[args.env_id]) - # coverage check env - env_check = Wenv(env_id=args.env_id, - render_bool_matplot=False, - xp_id=run_name, - **config[args.env_id]) - - # TRY NOT TO MODIFY: seeding - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = args.torch_deterministic - - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") - - # env setup - # env setup - envs = gym.vector.SyncVectorEnv( - [make_env(args.env_id, i, args.capture_video, run_name) for i in range(args.num_envs)] - ) - assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" - - max_step = config[args.env_id]['kwargs']['max_episode_steps'] - max_action = float(envs.single_action_space.high[0]) - - actor = Actor(envs).to(device) - qf1 = SoftQNetwork(envs).to(device) - qf2 = SoftQNetwork(envs).to(device) - qf1_target = SoftQNetwork(envs).to(device) - qf2_target = SoftQNetwork(envs).to(device) - qf1_target.load_state_dict(qf1.state_dict()) - qf2_target.load_state_dict(qf2.state_dict()) - q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) - actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) - rnd = RND(envs.single_observation_space.shape[0], device).to(device) - optimizer_rnd = optim.Adam(rnd.parameters(), lr=args.rnd_lr, eps=1e-5) - - # Automatic entropy tuning - if args.autotune: - target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() - log_alpha = torch.zeros(1, requires_grad=True, device=device) - alpha = log_alpha.exp().item() - a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) - else: - alpha = args.alpha - - envs.single_observation_space.dtype = np.float32 - rb = ReplayBuffer( - args.buffer_size, - envs.single_observation_space, - envs.single_action_space, - device, - 
handle_timeout_termination=False, - n_envs= args.num_envs - ) - start_time = time.time() - - # TRY NOT TO MODIFY: start the game - obs, _ = envs.reset(seed=args.seed) - for global_step in range(args.total_timesteps): - # coverage assessment - env_check.update_coverage(obs) - # ALGO LOGIC: put action logic here - if global_step < args.learning_starts: - actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) - else: - actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) - actions = actions.detach().cpu().numpy() - - # TRY NOT TO MODIFY: execute the game and log data. - next_obs, rewards, terminations, truncations, infos = envs.step(actions) - - # TRY NOT TO MODIFY: record rewards for plotting purposes - if "final_info" in infos: - for info in infos["final_info"]: - print(f"global_step={global_step}, episodic_return={info['episode']['r']}") - wandb.log({ - "charts/episodic_return" : info["episode"]["r"], - "charts/episodic_length" : info["episode"]["l"], - }, step = global_step) if args.track else None - - # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` - real_next_obs = next_obs.copy() - for idx, trunc in enumerate(truncations): - if trunc: - real_next_obs[idx] = infos["final_observation"][idx] - rb.add(obs, real_next_obs, actions, rewards, terminations, infos) - - # TRY NOT TO MODIFY: CRUCIAL step easy to overlook - obs = next_obs - - # RND TRAINING - if global_step % args.nb_epoch_before_training*max_step == 0 and global_step > args.learning_starts: - mean_rnd_loss = 0.0 - for _ in range(args.rnd_epochs): - # for _ in range(int(args.nb_epoch_before_training*max_step/args.vae_batch_size)): - data = rb.sample(args.batch_size) - rnd_loss = rnd.loss(data.observations, reduce = True) - optimizer_rnd.zero_grad() - rnd_loss.backward() - optimizer_rnd.step() - mean_rnd_loss += rnd_loss.item() - wandb.log({ - "losses/rnd_loss" : mean_rnd_loss / args.rnd_epochs, - }, step = global_step) if args.track else None - - - - # ALGO LOGIC: training. 
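Note ahead of the rnd_sac.py training hunk that follows: unlike ngu_sac.py above, which writes its intrinsic reward into the replay buffer at collection time, this script recomputes the bonus from the current RND error for every sampled batch. One shape detail worth flagging: rnd.loss(x, reduce=False) returns a (batch, 1) tensor here (the RND heads end in nn.Linear(256, 1)), while data.rewards.flatten() and the bootstrap term are (batch,), so flattening the error before mixing avoids an unintended (batch, batch) broadcast in the Bellman target. A sketch with stand-in tensors (names and shapes are illustrative, not taken from the diff):

import torch

batch_size = 256
extrinsic_reward = torch.randn(batch_size)           # stands in for data.rewards.flatten()
per_sample_rnd_error = torch.rand(batch_size, 1)     # stands in for rnd.loss(obs, reduce=False)
coef_intrinsic, coef_extrinsic, keep_extrinsic_reward = 1000.0, 1.0, False

# Flatten (batch, 1) -> (batch,) so the later `rewards + (1 - dones) * gamma * target`
# stays (batch,) instead of broadcasting.
intrinsic_reward = per_sample_rnd_error.view(-1)
rewards = (extrinsic_reward * coef_extrinsic + intrinsic_reward * coef_intrinsic
           if keep_extrinsic_reward else intrinsic_reward * coef_intrinsic)
assert rewards.shape == (batch_size,)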
- if global_step > args.learning_starts: - data = rb.sample(args.batch_size) - with torch.no_grad(): - next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) - qf1_next_target = qf1_target(data.next_observations, next_state_actions) - qf2_next_target = qf2_target(data.next_observations, next_state_actions) - min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi - intrinsic_reward = rnd.loss(data.observations, reduce = False) - extrinsic_reward = data.rewards.flatten() - if args.keep_extrinsic_reward: - rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic - else: - rewards = intrinsic_reward*args.coef_intrinsic - next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) - - qf1_a_values = qf1(data.observations, data.actions).view(-1) - qf2_a_values = qf2(data.observations, data.actions).view(-1) - qf1_loss = F.mse_loss(qf1_a_values, next_q_value) - qf2_loss = F.mse_loss(qf2_a_values, next_q_value) - qf_loss = qf1_loss + qf2_loss - - # optimize the model - q_optimizer.zero_grad() - qf_loss.backward() - q_optimizer.step() - - if global_step % args.policy_frequency == 0: # TD 3 Delayed update support - for _ in range( - args.policy_frequency - ): # compensate for the delay by doing 'actor_update_interval' instead of 1 - pi, log_pi, _ = actor.get_action(data.observations) - qf1_pi = qf1(data.observations, pi) - qf2_pi = qf2(data.observations, pi) - min_qf_pi = torch.min(qf1_pi, qf2_pi) - actor_loss = ((alpha * log_pi) - min_qf_pi).mean() - - actor_optimizer.zero_grad() - actor_loss.backward() - actor_optimizer.step() - - if args.autotune: - with torch.no_grad(): - _, log_pi, _ = actor.get_action(data.observations) - alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() - - a_optimizer.zero_grad() - alpha_loss.backward() - a_optimizer.step() - alpha = log_alpha.exp().item() - - # update the target networks - if global_step % args.target_network_frequency == 0: - for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): - target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) - for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): - target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) - - - if global_step % 100 == 0: - wandb.log({ - "losses/qf1_values" : qf1_a_values.mean().item(), - "losses/qf2_values" : qf2_a_values.mean().item(), - "losses/qf1_loss" : qf1_loss.item(), - "losses/qf2_loss" : qf2_loss.item(), - "losses/qf_loss" : qf_loss.item() / 2.0, - "losses/actor_loss" : actor_loss.item(), - "losses/alpha" : alpha, - "charts/SPS" : int(global_step / (time.time() - start_time)), - "losses/alpha_loss" : alpha_loss.item() if args.autotune else 0.0, - "specific/intrinsic_reward_mean" : intrinsic_reward.mean().item(), - "specific/intrinsic_reward_max" : intrinsic_reward.max().item(), - "specific/intrinsic_reward_min" : intrinsic_reward.min().item(), - }, step = global_step) if args.track else None - - if global_step % args.metric_freq == 0 : - wandb.log({ - "charts/coverage" : env_check.get_coverage(), - "charts/shannon_entropy": env_check.shannon_entropy(), - }, step = global_step) if args.track else None - - if global_step % args.fig_frequency == 0 and global_step > args.learning_starts: - if args.make_gif : - # print('size rho', size_rho) - # print('max x rho', rb.observations[max(rb.pos-size_rho, 0):rb.pos][0][:,0].max()) - image = 
env_plot.gif(obs_un = rb.observations[np.random.randint(0, rb.pos, 100_000)], - classifier = None, - device= device) - send_matrix(wandb, image, "gif", global_step) - - envs.close() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/sac_apt.py b/cleanrl/cleanrl_explo/sac_apt.py index 9b5fcde2..ac84cece 100644 --- a/cleanrl/cleanrl_explo/sac_apt.py +++ b/cleanrl/cleanrl_explo/sac_apt.py @@ -13,7 +13,6 @@ import tyro from stable_baselines3.common.buffers import ReplayBuffer from torch.utils.tensorboard import SummaryWriter -from lil_maze import LilMaze @dataclass @@ -100,12 +99,10 @@ class Args: def make_env(env_id, seed, idx, capture_video, run_name): def thunk(): if capture_video and idx == 0: - #env = gym.make(env_id, render_mode="rgb_array") - env = LilMaze(render_mode="rgb_array") + env = gym.make(env_id, render_mode="rgb_array") env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") else: - #env = gym.make(env_id) - env = LilMaze() + env = gym.make(env_id) env = gym.wrappers.RecordEpisodeStatistics(env) env.action_space.seed(seed) return env @@ -327,9 +324,6 @@ def main(seed=None, sweep=False): start_time = time.time() - - pure_exploration_discrete_matrix = np.zeros((50,50)) - # TRY NOT TO MODIFY: start the game obs, _ = envs.reset(seed=args.seed) for global_step in range(args.total_timesteps): @@ -343,21 +337,15 @@ def main(seed=None, sweep=False): # TRY NOT TO MODIFY: execute the game and log data. next_obs, rewards, terminations, truncations, infos = envs.step(actions) - for aaa in range(len(obs)): - pure_exploration_discrete_matrix[min(int(obs[aaa][0]*50),49)][min(int(obs[aaa][1]*50),49)] = min(1, pure_exploration_discrete_matrix[min(int(obs[aaa][0]*50),49)][min(int(obs[aaa][1]*50),49)] +1) - - # TRY NOT TO MODIFY: record rewards for plotting purposes if "final_info" in infos: for info in infos["final_info"]: if info is not None: print(f"global_step={global_step}, episodic_return={info['episode']['r']}") if sweep: - #episodic_returns_list.append(info["episode"]["r"]) - episodic_returns_list.append(np.array([np.mean(pure_exploration_discrete_matrix)])) + episodic_returns_list.append(info["episode"]["r"]) corresponding_steps.append(global_step) else: - writer.add_scalar("charts/mean_exploration", np.mean(pure_exploration_discrete_matrix), global_step) writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) break diff --git a/cleanrl/cleanrl_explo/sac_aux.py b/cleanrl/cleanrl_explo/sac_aux.py index 069f6e3f..e6487f58 100644 --- a/cleanrl/cleanrl_explo/sac_aux.py +++ b/cleanrl/cleanrl_explo/sac_aux.py @@ -13,7 +13,6 @@ import tyro from stable_baselines3.common.buffers import ReplayBuffer from torch.utils.tensorboard import SummaryWriter -from lil_maze import LilMaze @dataclass class Args: @@ -92,12 +91,10 @@ class Args: def make_env(env_id, seed, idx, capture_video, run_name): def thunk(): if capture_video and idx == 0: - #env = gym.make(env_id, render_mode="rgb_array") - env = LilMaze(render_mode="rgb_array") + env = gym.make(env_id, render_mode="rgb_array") env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") else: - #env = gym.make(env_id) - env = LilMaze() + env = gym.make(env_id) env = gym.wrappers.RecordEpisodeStatistics(env) env.action_space.seed(seed) return env @@ -315,8 +312,6 @@ def main(seed=None, sweep=False): ) start_time = time.time() - pure_exploration_discrete_matrix = np.zeros((50,50)) - # TRY NOT TO MODIFY: start the game obs, _ = 
envs.reset(seed=args.seed) for global_step in range(args.total_timesteps): @@ -330,20 +325,15 @@ def main(seed=None, sweep=False): # TRY NOT TO MODIFY: execute the game and log data. next_obs, rewards, terminations, truncations, infos = envs.step(actions) - for aaa in range(len(obs)): - pure_exploration_discrete_matrix[min(int(obs[aaa][0]*50),49)][min(int(obs[aaa][1]*50),49)] = min(1, pure_exploration_discrete_matrix[min(int(obs[aaa][0]*50),49)][min(int(obs[aaa][1]*50),49)] +1) - # TRY NOT TO MODIFY: record rewards for plotting purposes if "final_info" in infos: for info in infos["final_info"]: if info is not None: print(f"global_step={global_step}, episodic_return={info['episode']['r']}") if sweep: - #episodic_returns_list.append(info["episode"]["r"]) - episodic_returns_list.append(np.array([np.mean(pure_exploration_discrete_matrix)])) + episodic_returns_list.append(info["episode"]["r"]) corresponding_steps.append(global_step) else: - writer.add_scalar("charts/mean_exploration", np.mean(pure_exploration_discrete_matrix), global_step) writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) break diff --git a/cleanrl/cleanrl_explo/sac_icm.py b/cleanrl/cleanrl_explo/sac_icm.py index 71815ede..8909ece9 100644 --- a/cleanrl/cleanrl_explo/sac_icm.py +++ b/cleanrl/cleanrl_explo/sac_icm.py @@ -13,7 +13,6 @@ import tyro from stable_baselines3.common.buffers import ReplayBuffer from torch.utils.tensorboard import SummaryWriter -from lil_maze import LilMaze @dataclass @@ -92,12 +91,10 @@ class Args: def make_env(env_id, seed, idx, capture_video, run_name): def thunk(): if capture_video and idx == 0: - #env = gym.make(env_id, render_mode="rgb_array") - env = LilMaze(render_mode="rgb_array") + env = gym.make(env_id, render_mode="rgb_array") env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") else: - #env = gym.make(env_id) - env = LilMaze() + env = gym.make(env_id) env = gym.wrappers.RecordEpisodeStatistics(env) env.action_space.seed(seed) return env @@ -317,9 +314,6 @@ def main(seed=None, sweep=False): ) start_time = time.time() - - pure_exploration_discrete_matrix = np.zeros((50,50)) - # TRY NOT TO MODIFY: start the game obs, _ = envs.reset(seed=args.seed) for global_step in range(args.total_timesteps): @@ -333,10 +327,6 @@ def main(seed=None, sweep=False): # TRY NOT TO MODIFY: execute the game and log data. 
next_obs, rewards, terminations, truncations, infos = envs.step(actions) - for aaa in range(len(obs)): - pure_exploration_discrete_matrix[min(int(obs[aaa][0]*50),49)][min(int(obs[aaa][1]*50),49)] = min(1, pure_exploration_discrete_matrix[min(int(obs[aaa][0]*50),49)][min(int(obs[aaa][1]*50),49)] +1) - - # TRY NOT TO MODIFY: record rewards for plotting purposes if "final_info" in infos: @@ -344,11 +334,9 @@ def main(seed=None, sweep=False): if info is not None: print(f"global_step={global_step}, episodic_return={info['episode']['r']}") if sweep: - #episodic_returns_list.append(info["episode"]["r"]) - episodic_returns_list.append(np.array([np.mean(pure_exploration_discrete_matrix)])) + episodic_returns_list.append(info["episode"]["r"]) corresponding_steps.append(global_step) else: - writer.add_scalar("charts/mean_exploration", np.mean(pure_exploration_discrete_matrix), global_step) writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) break diff --git a/cleanrl/cleanrl_explo/sac_ngu.py b/cleanrl/cleanrl_explo/sac_ngu.py index ac8b6d13..1200bf07 100644 --- a/cleanrl/cleanrl_explo/sac_ngu.py +++ b/cleanrl/cleanrl_explo/sac_ngu.py @@ -13,7 +13,6 @@ import tyro from stable_baselines3.common.buffers import ReplayBuffer from torch.utils.tensorboard import SummaryWriter -from lil_maze import LilMaze @dataclass @@ -98,12 +97,10 @@ class Args: def make_env(env_id, seed, idx, capture_video, run_name): def thunk(): if capture_video and idx == 0: - #env = gym.make(env_id, render_mode="rgb_array") - env = LilMaze(render_mode="rgb_array") + env = gym.make(env_id, render_mode="rgb_array") env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") else: - #env = gym.make(env_id) - env = LilMaze() + env = gym.make(env_id) env = gym.wrappers.RecordEpisodeStatistics(env) env.action_space.seed(seed) return env @@ -408,9 +405,6 @@ def main(seed=None, sweep=False): ) start_time = time.time() - - pure_exploration_discrete_matrix = np.zeros((50,50)) - # TRY NOT TO MODIFY: start the game obs, _ = envs.reset(seed=args.seed) for global_step in range(args.total_timesteps): @@ -424,10 +418,6 @@ def main(seed=None, sweep=False): # TRY NOT TO MODIFY: execute the game and log data. 
next_obs, rewards, terminations, truncations, infos = envs.step(actions) - for aaa in range(len(obs)): - pure_exploration_discrete_matrix[min(int(obs[aaa][0]*50),49)][min(int(obs[aaa][1]*50),49)] = min(1, pure_exploration_discrete_matrix[min(int(obs[aaa][0]*50),49)][min(int(obs[aaa][1]*50),49)] +1) - - # COMPUTE REWARD reward_ngu = torch.zeros(args.num_envs) for idx in range(args.num_envs): @@ -441,11 +431,9 @@ def main(seed=None, sweep=False): if info is not None: print(f"global_step={global_step}, episodic_return={info['episode']['r']}") if sweep: - #episodic_returns_list.append(info["episode"]["r"]) - episodic_returns_list.append(np.array([np.mean(pure_exploration_discrete_matrix)])) + episodic_returns_list.append(info["episode"]["r"]) corresponding_steps.append(global_step) else: - writer.add_scalar("charts/mean_exploration", np.mean(pure_exploration_discrete_matrix), global_step) writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) break diff --git a/cleanrl/cleanrl_explo/sac_our_method.py b/cleanrl/cleanrl_explo/sac_our_method.py deleted file mode 100644 index ce7adbb0..00000000 --- a/cleanrl/cleanrl_explo/sac_our_method.py +++ /dev/null @@ -1,435 +0,0 @@ -# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy -import os -import random -import time -from dataclasses import dataclass - -import gymnasium as gym -import numpy as np -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -import tyro -from stable_baselines3.common.buffers import ReplayBuffer -from torch.utils.tensorboard import SummaryWriter -from lil_maze import LilMaze - -@dataclass -class Args: - exp_name: str = os.path.basename(__file__)[: -len(".py")] - """the name of this experiment""" - seed: int = 12 - """seed of the experiment""" - torch_deterministic: bool = True - """if toggled, `torch.backends.cudnn.deterministic=False`""" - cuda: bool = True - """if toggled, cuda will be enabled by default""" - track: bool = True - """if toggled, this experiment will be tracked with Weights and Biases""" - wandb_project_name: str = "SAC - exploration with auxiliary classifier" - """the wandb's project name""" - wandb_entity: str = None - """the entity (team) of wandb's project""" - capture_video: bool = True - """whether to capture videos of the agent performances (check out `videos` folder)""" - - # Algorithm specific arguments - env_id: str = "Hopper-v4" - """the environment id of the task""" - total_timesteps: int = 200000 - """total timesteps of the experiments""" - num_envs: int = 4 - """the number of parallel game environments to run""" - buffer_size: int = int(1e6) - """the replay memory buffer size""" - gamma: float = 0.99 - """the discount factor gamma""" - tau: float = 0.005 - """target smoothing coefficient (default: 0.005)""" - batch_size: int = 256 - """the batch size of sample from the reply memory""" - learning_starts: int = 5e3 - """timestep to start learning""" - policy_lr: float = 3e-4 - """the learning rate of the policy network optimizer""" - q_lr: float = 1e-3 - """the learning rate of the Q network network optimizer""" - policy_frequency: int = 2 - """the frequency of training policy (delayed)""" - target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. 
- """the frequency of updates for the target nerworks""" - alpha: float = 0.2 - """Entropy regularization coefficient.""" - autotune: bool = True - """automatic tuning of the entropy coefficient""" - - - - # classifier specific arguments - classifier_lr: float = 0.005497 - """the learning rate of the classifier""" - classifier_epochs: int = 4 - """the number of epochs for the classifier""" - classifier_frequency: int = 800 - """the frequency of training classifier""" - classifier_lim : float = 10.0 - """the limit of the classifier output""" - - - keep_extrinsic_reward: bool = False - """if toggled, the extrinsic reward will be kept""" - coef_intrinsic : float = 0.6502 - """the coefficient of the intrinsic reward""" - coef_extrinsic : float = 10.59 - """the coefficient of the extrinsic reward""" - - -def make_env(env_id, seed, idx, capture_video, run_name): - def thunk(): - if capture_video and idx == 0: - #env = gym.make(env_id, render_mode="rgb_array") - env = LilMaze(render_mode="rgb_array") - env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") - else: - #env = gym.make(env_id) - env = LilMaze() - env = gym.wrappers.RecordEpisodeStatistics(env) - env.action_space.seed(seed) - return env - - return thunk - - -# ALGO LOGIC: initialize agent here: -class SoftQNetwork(nn.Module): - def __init__(self, env): - super().__init__() - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) - self.fc2 = nn.Linear(256, 256) - self.fc3 = nn.Linear(256, 1) - - def forward(self, x, a): - x = torch.cat([x, a], 1) - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = self.fc3(x) - return x - - -LOG_STD_MAX = 2 -LOG_STD_MIN = -5 - - -class Actor(nn.Module): - def __init__(self, env): - super().__init__() - self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) - self.fc2 = nn.Linear(256, 256) - self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) - self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) - # action rescaling - self.register_buffer( - "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) - ) - self.register_buffer( - "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) - ) - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - mean = self.fc_mean(x) - log_std = self.fc_logstd(x) - log_std = torch.tanh(log_std) - log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats - - return mean, log_std - - def get_action(self, x): - mean, log_std = self(x) - std = log_std.exp() - normal = torch.distributions.Normal(mean, std) - x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) - y_t = torch.tanh(x_t) - action = y_t * self.action_scale + self.action_bias - log_prob = normal.log_prob(x_t) - # Enforcing Action Bound - log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) - log_prob = log_prob.sum(1, keepdim=True) - mean = torch.tanh(mean) * self.action_scale + self.action_bias - return action, log_prob, mean - - -class Classifier(nn.Module): - def __init__(self, input_size, lim = 10): - super().__init__() - self.fc1 = nn.Linear(input_size, 256) - self.fc2 = nn.Linear(256, 256) - self.fc3 = nn.Linear(256, 1) - self.sigmoid = nn.Sigmoid() - self.lim = lim - - def forward(self, x): - x = F.relu(self.fc1(x)) - x = F.relu(self.fc2(x)) - x = 
self.fc3(x) - return torch.clamp(x, -self.lim, self.lim) - - def loss(self, s : torch.Tensor): - imaginary_sample = torch.rand_like(s) - obj = (torch.log(1-self.sigmoid(self(s))) + torch.log(self.sigmoid(self(imaginary_sample)))).mean() - return obj - -def main(seed=None, sweep=False): - - import stable_baselines3 as sb3 - - if sb3.__version__ < "2.0": - raise ValueError( - """Ongoing migration: run the following command to install the new dependencies: -poetry run pip install "stable_baselines3==2.0.0a1" -""" - ) - - args = tyro.cli(Args) - if seed is not None: - args.seed = seed - run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" - - - # For hyperparameter optimization, see trainer.py file - if sweep: - episodic_returns_list = [] - corresponding_steps = [] - - import wandb - wandb.init() - - config = wandb.config - - for key, value in vars(args).items(): - if key in config: - setattr(args, key, config[key]) - - - else : - - if args.track: - import wandb - - wandb.init( - project=args.wandb_project_name, - entity=args.wandb_entity, - sync_tensorboard=True, - config=vars(args), - name=run_name, - monitor_gym=True, - save_code=True, - ) - writer = SummaryWriter(f"runs/{run_name}") - writer.add_text( - "hyperparameters", - "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), - ) - - # TRY NOT TO MODIFY: seeding - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - torch.backends.cudnn.deterministic = args.torch_deterministic - - device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") - - # env setup - envs = gym.vector.SyncVectorEnv( - [make_env(args.env_id, args.seed, i, args.capture_video, run_name) for i in range(args.num_envs)] - ) - assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" - - max_action = float(envs.single_action_space.high[0]) - - actor = Actor(envs).to(device) - qf1 = SoftQNetwork(envs).to(device) - qf2 = SoftQNetwork(envs).to(device) - qf1_target = SoftQNetwork(envs).to(device) - qf2_target = SoftQNetwork(envs).to(device) - qf1_target.load_state_dict(qf1.state_dict()) - qf2_target.load_state_dict(qf2.state_dict()) - q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) - actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) - - classifier = Classifier(np.prod(envs.single_observation_space.shape), args.classifier_lim).to(device) - classifier_optimizer = optim.Adam(classifier.parameters(), lr=args.classifier_lr) - - # Automatic entropy tuning - if args.autotune: - target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() - log_alpha = torch.zeros(1, requires_grad=True, device=device) - alpha = log_alpha.exp().item() - a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) - else: - alpha = args.alpha - - envs.single_observation_space.dtype = np.float32 - - # The replay buffer parameters have been updated to handle multiple envs - rb = ReplayBuffer( - args.buffer_size, - envs.single_observation_space, - envs.single_action_space, - device, - handle_timeout_termination=False, - n_envs=args.num_envs - ) - start_time = time.time() - - - pure_exploration_discrete_matrix = np.zeros((50,50)) - - - # TRY NOT TO MODIFY: start the game - obs, _ = envs.reset(seed=args.seed) - for global_step in range(args.total_timesteps): - # ALGO LOGIC: put action logic here - if global_step < args.learning_starts: - actions = 
np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) - else: - actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) - actions = actions.detach().cpu().numpy() - - # TRY NOT TO MODIFY: execute the game and log data. - next_obs, rewards, terminations, truncations, infos = envs.step(actions) - - for aaa in range(len(obs)): - pure_exploration_discrete_matrix[min(int(obs[aaa][0]*50),49)][min(int(obs[aaa][1]*50),49)] = min(1, pure_exploration_discrete_matrix[min(int(obs[aaa][0]*50),49)][min(int(obs[aaa][1]*50),49)] +1) - - - # TRY NOT TO MODIFY: record rewards for plotting purposes - if "final_info" in infos: - for info in infos["final_info"]: - if info is not None: - print(f"global_step={global_step}, episodic_return={info['episode']['r']}") - if sweep: - #episodic_returns_list.append(info["episode"]["r"]) - episodic_returns_list.append(np.array([np.mean(pure_exploration_discrete_matrix)])) - corresponding_steps.append(global_step) - else: - writer.add_scalar("charts/mean_exploration", np.mean(pure_exploration_discrete_matrix), global_step) - writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) - writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) - break - - # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` - real_next_obs = next_obs.copy() - for idx, trunc in enumerate(truncations): - if trunc: - real_next_obs[idx] = infos["final_observation"][idx] - rb.add(obs, real_next_obs, actions, rewards, terminations, infos) - - # TRY NOT TO MODIFY: CRUCIAL step easy to overlook - obs = next_obs - - # ALGO LOGIC: training. - if global_step > args.learning_starts: - - if global_step % args.classifier_frequency == 0: - mean_classifier_loss = 0.0 - for _ in range(args.classifier_epochs): - data = rb.sample(args.batch_size) - - classifier_loss = classifier.loss(data.observations) - classifier_optimizer.zero_grad() - classifier_loss.backward() - classifier_optimizer.step() - mean_classifier_loss += classifier_loss.item() - - mean_classifier_loss /= args.classifier_epochs - if not sweep: - writer.add_scalar("losses/classifier_loss", mean_classifier_loss, global_step) - - - - data = rb.sample(args.batch_size) - with torch.no_grad(): - next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) - qf1_next_target = qf1_target(data.next_observations, next_state_actions) - qf2_next_target = qf2_target(data.next_observations, next_state_actions) - min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi - intrinsic_reward = classifier(data.observations).flatten() - extrinsic_reward = data.rewards.flatten() - assert intrinsic_reward.shape == extrinsic_reward.shape - if args.keep_extrinsic_reward: - rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic - else: - rewards = intrinsic_reward.flatten() *args.coef_intrinsic - next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) - - - qf1_a_values = qf1(data.observations, data.actions).view(-1) - qf2_a_values = qf2(data.observations, data.actions).view(-1) - - - qf1_loss = F.mse_loss(qf1_a_values, next_q_value) - qf2_loss = F.mse_loss(qf2_a_values, next_q_value) - qf_loss = qf1_loss + qf2_loss - - # optimize the model - q_optimizer.zero_grad() - qf_loss.backward() - q_optimizer.step() - - if global_step % args.policy_frequency == 0: # TD 3 Delayed update support - for _ in range( - args.policy_frequency - ): # compensate for the 
delay by doing 'actor_update_interval' instead of 1 - pi, log_pi, _ = actor.get_action(data.observations) - qf1_pi = qf1(data.observations, pi) - qf2_pi = qf2(data.observations, pi) - min_qf_pi = torch.min(qf1_pi, qf2_pi) - actor_loss = ((alpha * log_pi) - min_qf_pi).mean() - - actor_optimizer.zero_grad() - actor_loss.backward() - actor_optimizer.step() - - if args.autotune: - with torch.no_grad(): - _, log_pi, _ = actor.get_action(data.observations) - alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() - - a_optimizer.zero_grad() - alpha_loss.backward() - a_optimizer.step() - alpha = log_alpha.exp().item() - - # update the target networks - if global_step % args.target_network_frequency == 0: - for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): - target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) - for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): - target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) - - if global_step % 100 == 0 and not sweep: - writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) - writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) - writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) - writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) - writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) - writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) - writer.add_scalar("losses/alpha", alpha, global_step) - print("SPS:", int(global_step / (time.time() - start_time))) - writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) - if args.autotune: - writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) - writer.add_scalar("specific/intrinsic_reward_mean", intrinsic_reward.mean().item(), global_step) - writer.add_scalar("specific/intrinsic_reward_max", intrinsic_reward.max().item(), global_step) - writer.add_scalar("specific/intrinsic_reward_min", intrinsic_reward.min().item(), global_step) - - envs.close() - if sweep: - return episodic_returns_list, corresponding_steps - writer.close() - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/sac_rnd.py b/cleanrl/cleanrl_explo/sac_rnd.py index 3dd2ec58..98462e98 100644 --- a/cleanrl/cleanrl_explo/sac_rnd.py +++ b/cleanrl/cleanrl_explo/sac_rnd.py @@ -13,7 +13,6 @@ import tyro from stable_baselines3.common.buffers import ReplayBuffer from torch.utils.tensorboard import SummaryWriter -from lil_maze import LilMaze @dataclass class Args: @@ -85,12 +84,10 @@ class Args: def make_env(env_id, seed, idx, capture_video, run_name): def thunk(): if capture_video and idx == 0: - #env = gym.make(env_id, render_mode="rgb_array") - env = LilMaze(render_mode="rgb_array") + env = gym.make(env_id, render_mode="rgb_array") env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") else: - #env = gym.make(env_id) - env = LilMaze() + env = gym.make(env_id) env = gym.wrappers.RecordEpisodeStatistics(env) env.action_space.seed(seed) return env @@ -290,9 +287,6 @@ def main(seed=None, sweep=False): ) start_time = time.time() - - pure_exploration_discrete_matrix = np.zeros((50,50)) - # TRY NOT TO MODIFY: start the game obs, _ = envs.reset(seed=args.seed) for global_step in range(args.total_timesteps): @@ -306,20 +300,15 @@ def main(seed=None, sweep=False): # TRY NOT TO MODIFY: execute the game and log data. 
next_obs, rewards, terminations, truncations, infos = envs.step(actions) - for aaa in range(len(obs)): - pure_exploration_discrete_matrix[min(int(obs[aaa][0]*50),49)][min(int(obs[aaa][1]*50),49)] = min(1, pure_exploration_discrete_matrix[min(int(obs[aaa][0]*50),49)][min(int(obs[aaa][1]*50),49)] +1) - # TRY NOT TO MODIFY: record rewards for plotting purposes if "final_info" in infos: for info in infos["final_info"]: if info is not None: print(f"global_step={global_step}, episodic_return={info['episode']['r']}") if sweep: - #episodic_returns_list.append(info["episode"]["r"]) - episodic_returns_list.append(np.array([np.mean(pure_exploration_discrete_matrix)])) + episodic_returns_list.append(info["episode"]["r"]) corresponding_steps.append(global_step) else: - writer.add_scalar("charts/mean_exploration", np.mean(pure_exploration_discrete_matrix), global_step) writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) break
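Note on the pure_exploration_discrete_matrix removed from sac_apt.py, sac_aux.py, sac_icm.py, sac_ngu.py and sac_rnd.py (and present in the deleted sac_our_method.py) in the hunks above: it binned the first two observation dimensions (assumed to lie in [0, 1]) into a 50x50 occupancy grid, marked each visited cell with 1, and logged np.mean(matrix), i.e. the fraction of visited cells, as charts/mean_exploration. A self-contained sketch of that coverage proxy:

import numpy as np

class CoverageGrid:
    # Fraction of visited cells over a bins x bins discretisation of (obs[0], obs[1]) in [0, 1].
    def __init__(self, bins: int = 50):
        self.bins = bins
        self.grid = np.zeros((bins, bins), dtype=np.float32)

    def update(self, obs_batch: np.ndarray) -> None:
        # obs_batch: (num_envs, obs_dim); only the first two dimensions are binned
        for ob in obs_batch:
            i = min(int(ob[0] * self.bins), self.bins - 1)
            j = min(int(ob[1] * self.bins), self.bins - 1)
            self.grid[i, j] = 1.0

    def coverage(self) -> float:
        return float(self.grid.mean())

# Illustrative usage with random observations in [0, 1]^2:
cov = CoverageGrid()
cov.update(np.random.rand(4, 2))
print(cov.coverage())

The PA_version scripts earlier in this diff track the analogous quantity through env_check.get_coverage() from Wenv instead of a hand-rolled grid.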