diff --git a/cleanrl/cleanrl_explo/lil_maze.py b/cleanrl/cleanrl_explo/lil_maze.py
new file mode 100644
index 00000000..40042e60
--- /dev/null
+++ b/cleanrl/cleanrl_explo/lil_maze.py
@@ -0,0 +1,104 @@
+import numpy as np
+import pygame
+import math
+import os, imageio
+import matplotlib.pyplot as plt
+import torch
+import gymnasium as gym
+from gymnasium import spaces
+
+
+class LilMaze(gym.Env):
+    metadata = {"render_modes": ["rgb_array"]}
+    def __init__(self, render_mode = None):
+        super(LilMaze, self).__init__()
+
+        # Define the action and observation spaces
+        self.action_space = spaces.Box(low=-1, high=1, shape=(2,), dtype=np.float32)
+        self.observation_space = spaces.Box(low=0, high=1, shape=(2,), dtype=np.float32)
+
+        # The size of a step
+        self.step_size = 0.01
+
+        # The maximum number of steps
+        self.max_steps = 200
+
+        # Define the initial position of the agent
+        self.initial_agent_position = np.array([0.25, 0.25])
+
+        # Define the goal position
+        self.goal_position = np.array([0.25, 0.75])
+
+        # Wall positions
+        self.wall_positions = [
+            [(0.0,0.5),(0.5, 0.5)],
+        ]
+
+        self.world = np.zeros((1000, 1000, 3), dtype=np.uint8)
+        self.world.fill(255)
+
+        self.draw(self.world, self.goal_position, (0, 255, 0))
+        self.world_copy = self.world.copy()
+
+        assert render_mode is None or render_mode in self.metadata["render_modes"]
+        self.render_mode = render_mode
+
+    def draw(self, world, position, color):
+        pos = (int(position[0] * 1000), int(position[1] * 1000))
+        world[pos[1]-2:pos[1]+2, pos[0]-2:pos[0]+2] = color
+
+
+    def reset(self, seed=None, options=None):
+        super().reset(seed=seed)
+        self.num_steps = 0
+        self.world = self.world_copy.copy()
+
+        # Reset the agent's position to the initial position
+        self.agent_position = self.initial_agent_position
+        self.draw(self.world, self.initial_agent_position, (255, 0, 0))
+        self.draw(self.world_copy, self.initial_agent_position, (0, 0, 255))
+
+        infos = self._get_info()
+        return self.agent_position, infos
+
+    def _get_info(self):
+        return {}
+
+    def step(self, action):
+        self.num_steps += 1
+
+        action = np.clip(action, -1, 1)
+        new_position = self.agent_position + action * self.step_size
+
+        # only made for 1 wall
+        cond1 = new_position[1] >= 0.5 and self.agent_position[1] < 0.5
+        cond2 = new_position[1] < 0.5 and self.agent_position[1] >= 0.5
+
+        if cond1 and self.agent_position[0] + (new_position[0] - self.agent_position[0])/(new_position[1] - self.agent_position[1]) * (0.5 - self.agent_position[1]) < 0.5 :
+            new_position = [self.agent_position[0] + (new_position[0] - self.agent_position[0])/(new_position[1] - self.agent_position[1]) * (0.5 - self.agent_position[1]), 0.5 - 0.001]
+        if cond2 and self.agent_position[0] + (new_position[0] - self.agent_position[0])/(new_position[1] - self.agent_position[1]) * (0.5 - self.agent_position[1]) < 0.5 :
+            new_position = [self.agent_position[0] + (new_position[0] - self.agent_position[0])/(new_position[1] - self.agent_position[1]) * (0.5 - self.agent_position[1]), 0.5 + 0.001]
+
+
+        self.agent_position = np.clip(np.array(new_position), 0,1)
+
+        self.draw(self.world, self.agent_position, (255, 0, 0))
+        self.draw(self.world_copy, self.agent_position, (0, 0, 255))
+
+
+        # Compute the reward
+        reward = - np.linalg.norm(self.agent_position - self.goal_position)
+
+        done = self.num_steps >= self.max_steps
+
+        infos = self._get_info()
+
+        return self.agent_position, reward, done, None, infos
+
+    def render(self):
+        if self.render_mode == 'rgb_array':
+            return self.world.copy()
+        else:
+            raise 
NotImplementedError() + \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/sac_apt.py b/cleanrl/cleanrl_explo/sac_apt.py new file mode 100644 index 00000000..ac84cece --- /dev/null +++ b/cleanrl/cleanrl_explo/sac_apt.py @@ -0,0 +1,474 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import SummaryWriter + + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 12 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = True + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "SAC - exploration with APT" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = True + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Hopper-v4" + """the environment id of the task""" + total_timesteps: int = 200000 + """total timesteps of the experiments""" + num_envs: int = 4 + """the number of parallel game environments to run""" + buffer_size: int = int(1e6) + """the replay memory buffer size""" + gamma: float = 0.99 + """the discount factor gamma""" + tau: float = 0.005 + """target smoothing coefficient (default: 0.005)""" + batch_size: int = 256 + """the batch size of sample from the reply memory""" + learning_starts: int = 5e3 + """timestep to start learning""" + policy_lr: float = 3e-4 + """the learning rate of the policy network optimizer""" + q_lr: float = 1e-3 + """the learning rate of the Q network network optimizer""" + policy_frequency: int = 2 + """the frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. 
+ """the frequency of updates for the target nerworks""" + alpha: float = 0.2 + """Entropy regularization coefficient.""" + autotune: bool = True + """automatic tuning of the entropy coefficient""" + + + + # encoder specific arguments + encoder_lr: float = 0.00001611 + """the learning rate of the encoder""" + encoder_epochs: int = 4 + """the number of epochs for the encoder""" + encoder_frequency: int = 300 + """the frequency of training encoder""" + latent_dim: int = 8 + """the dimension of the latent space""" + sigma: float = 0.048 + """the sigma for the data augmentation""" + k_nearest: int = 4 + """the number of nearest neighbors""" + + # intrinsic reward specific arguments + normalize_reward: bool = True + """if toggled, the intrinsic reward will be normalized""" + reward_update_rate : float = 0.001 + """the update rate of the runnign estimators of the reward""" + + + + keep_extrinsic_reward: bool = False + """if toggled, the extrinsic reward will be kept""" + coef_intrinsic : float = 0.1256 + """the coefficient of the intrinsic reward""" + coef_extrinsic : float = 0.5422 + """the coefficient of the extrinsic reward""" + + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + + +class Encoder(nn.Module): + def __init__(self, envs, 
latent_dim, sigma, k_nearest): + super(Encoder, self).__init__() + state_dim = np.prod(envs.single_observation_space.shape) + action_dim = np.prod(envs.single_action_space.shape) + + # encoder network + self.f1 = nn.Linear(state_dim, 256) + self.f2 = nn.Linear(256, 64) + self.f3 = nn.Linear(64, latent_dim) + + + self.latent_dim = latent_dim + self.sigma = sigma + self.k_nearest = k_nearest + + + def forward(self, x): + x = F.relu(self.f1(x)) + x = F.relu(self.f2(x)) + x = self.f3(x) + return x + + def data_augmentation(self, x): + noise = torch.randn_like(x) * self.sigma + return x + noise + + def normalize(self, x): + return nn.functional.normalize(x, p=2, dim=1) + + def constrastive_loss(self, x, x_augmented): + x_norm = self.normalize(x) + x_augmented_norm = self.normalize(x_augmented) + numerator = torch.exp(torch.einsum('ij,ij->i', x_norm, x_augmented_norm)) + denominator = torch.sum(torch.exp(torch.einsum('ik,jk->ij', x_norm, x_norm)), dim=1) + return -torch.log(numerator/denominator).mean() + + def get_knn_sum(self, x, x_augmented): + x_encoded = self(x) + x_augmented_encoded = self(x_augmented) + + distances = torch.cdist(x_encoded, x_augmented_encoded) + knn_sum = torch.topk(distances, self.k_nearest, largest=False, sorted=False, dim=1).values.sum(dim=1) + return knn_sum + + + +def main(seed=None, sweep=False): + + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + if seed is not None: + args.seed = seed + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + + + # For hyperparameter optimization, see trainer.py file + if sweep: + episodic_returns_list = [] + corresponding_steps = [] + + import wandb + wandb.init() + + config = wandb.config + + for key, value in vars(args).items(): + if key in config: + setattr(args, key, config[key]) + + + else : + + if args.track: + import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv( + [make_env(args.env_id, args.seed, i, args.capture_video, run_name) for i in range(args.num_envs)] + ) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + encoder = Encoder(envs, args.latent_dim, args.sigma, 
args.k_nearest).to(device) + encoder_optimizer = optim.Adam(encoder.parameters(), lr=args.encoder_lr) + + # Automatic entropy tuning + if args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + + # The replay buffer parameters have been updated to handle multiple envs + rb = ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + n_envs=args.num_envs + ) + + intrinsic_reward_running_mean = torch.zeros(1, device=device, dtype=torch.float32) + intrinsic_reward_running_std = torch.ones(1, device=device, dtype=torch.float32) + + + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. + next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + if info is not None: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + if sweep: + episodic_returns_list.append(info["episode"]["r"]) + corresponding_steps.append(global_step) + else: + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, trunc in enumerate(truncations): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + rb.add(obs, real_next_obs, actions, rewards, terminations, infos) + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. 
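+        # Editor's note (descriptive comment, no behavioural change): the block below is
+        # the APT-style update. Every `encoder_frequency` steps the contrastive encoder is
+        # refit for `encoder_epochs` minibatches, pulling each observation towards its
+        # Gaussian-noise augmentation (std `sigma`) with an InfoNCE-like loss. The SAC
+        # update then uses, as intrinsic reward, the summed distance from each encoded
+        # observation to its `k_nearest` neighbours among the encoded next observations
+        # of the sampled batch (divided by `latent_dim`), i.e. roughly a particle-based
+        # entropy estimate: states far from their neighbours in latent space look novel.
+        # When `normalize_reward` is set, the bonus is standardised with the running
+        # mean/std estimators initialised above.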
+ if global_step > args.learning_starts: + + if global_step % args.encoder_frequency == 0: + mean_encoder_loss = 0.0 + for _ in range(args.encoder_epochs): + data = rb.sample(args.batch_size) + obs_augmented = encoder.data_augmentation(data.observations) + obs_encoded = encoder(data.observations) + obs_augmented_encoded = encoder(obs_augmented) + + encoder_loss = encoder.constrastive_loss(obs_encoded, obs_augmented_encoded) + + encoder_optimizer.zero_grad() + encoder_loss.backward() + encoder_optimizer.step() + + mean_encoder_loss += encoder_loss.item() + + mean_encoder_loss /= args.encoder_epochs + if not sweep: + writer.add_scalar("losses/encoder_loss", mean_encoder_loss, global_step) + + + + data = rb.sample(args.batch_size) + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) + qf1_next_target = qf1_target(data.next_observations, next_state_actions) + qf2_next_target = qf2_target(data.next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + intrinsic_reward = encoder.get_knn_sum(data.observations, data.next_observations)/args.latent_dim + extrinsic_reward = data.rewards.flatten() + + if args.normalize_reward: + intrinsic_reward = (intrinsic_reward - intrinsic_reward_running_mean) / (intrinsic_reward_running_std + 1e-6) + intrinsic_reward_running_mean = intrinsic_reward_running_mean + args.reward_update_rate * (intrinsic_reward.mean() - intrinsic_reward_running_mean) + intrinsic_reward_running_std = intrinsic_reward_running_std + args.reward_update_rate * (intrinsic_reward.std() - intrinsic_reward_running_std) + + if args.keep_extrinsic_reward: + rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic + else: + rewards = intrinsic_reward.flatten() *args.coef_intrinsic + next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + qf1_a_values = qf1(data.observations, data.actions).view(-1) + qf2_a_values = qf2(data.observations, data.actions).view(-1) + + + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data.observations) + qf1_pi = qf1(data.observations, pi) + qf2_pi = qf2(data.observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data.observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target networks + if global_step % args.target_network_frequency == 0: + for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0 and not 
sweep: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + writer.add_scalar("specific/intrinsic_reward_mean", intrinsic_reward.mean().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_max", intrinsic_reward.max().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_min", intrinsic_reward.min().item(), global_step) + + envs.close() + if sweep: + return episodic_returns_list, corresponding_steps + writer.close() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/sac_aux.py b/cleanrl/cleanrl_explo/sac_aux.py new file mode 100644 index 00000000..e6487f58 --- /dev/null +++ b/cleanrl/cleanrl_explo/sac_aux.py @@ -0,0 +1,452 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import SummaryWriter + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 12 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = True + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "SAC - exploration with auxiliary VAE" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = True + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Hopper-v4" + """the environment id of the task""" + total_timesteps: int = 200000 + """total timesteps of the experiments""" + num_envs: int = 4 + """the number of parallel game environments to run""" + buffer_size: int = int(1e6) + """the replay memory buffer size""" + gamma: float = 0.99 + """the discount factor gamma""" + tau: float = 0.005 + """target smoothing coefficient (default: 0.005)""" + batch_size: int = 256 + """the batch size of sample from the reply memory""" + learning_starts: int = 5e3 + """timestep to start learning""" + policy_lr: float = 3e-4 + """the learning rate of the policy network optimizer""" + q_lr: float = 1e-3 + """the learning rate of the Q network network optimizer""" + policy_frequency: int = 2 + """the frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. 
+ """the frequency of updates for the target nerworks""" + alpha: float = 0.2 + """Entropy regularization coefficient.""" + autotune: bool = True + """automatic tuning of the entropy coefficient""" + + + + # VAE specific arguments + vae_lr: float = 0.001139 + """the learning rate of the VAE""" + vae_epochs: int = 4 + """the number of epochs for the VAE""" + vae_frequency: int = 800 + """the frequency of training VAE""" + vae_latent_dim: int = 32 + """the latent dimension of the VAE""" + clip_vae: float = 120.0 + """the clipping of the VAE""" + vae_batch_size: int = 128 + """the batch size of the VAE""" + + + keep_extrinsic_reward: bool = False + """if toggled, the extrinsic reward will be kept""" + coef_intrinsic : float = 0.3472 + """the coefficient of the intrinsic reward""" + coef_extrinsic : float = 0.5422 + """the coefficient of the extrinsic reward""" + + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + +class VAE(nn.Module): + def __init__(self, envs, latent_dim, clip_vae=120.0, scale_l = 1000.0): + super().__init__() + input_dim = np.prod(envs.single_observation_space.shape) + self.clip_vae = clip_vae + self.scale_l = scale_l + self.encoder = nn.Sequential( + nn.Linear(input_dim, 256), + nn.ReLU(), + nn.Linear(256, 256), + 
nn.ReLU(), + ) + self.mean_layer = nn.Linear(256, latent_dim) + self.logstd_layer = nn.Linear(256, latent_dim) + self.decoder = nn.Sequential( + nn.Linear(latent_dim, 256), + nn.ReLU(), + nn.Linear(256, 256), + nn.ReLU(), + nn.Linear(256, input_dim), + ) + def encode(self, x): + x = self.encoder(x) + mean = self.mean_layer(x) + logstd = self.logstd_layer(x) + return mean, logstd + + def decode(self, z): + return self.decoder(z) + + def forward(self, x): + mean, logstd = self.encode(x/self.scale_l) + z = mean + torch.randn_like(mean) * torch.exp(logstd) + x_recon = torch.clamp(self.decode(z), -self.clip_vae, self.clip_vae) + return x_recon, mean, logstd + + def loss(self, x, reduce=True): + x_recon, mean, logstd = self(x) + x = x/self.scale_l + recon_loss = F.mse_loss(x_recon, x, reduction='none').sum(1) + kl_loss = -0.5 * (1 + 2 * logstd - mean ** 2 - torch.exp(2 * logstd)).sum(1) + loss = recon_loss + kl_loss + if reduce: + return loss.mean() + return loss + +def main(seed=None, sweep=False): + + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + if seed is not None: + args.seed = seed + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + + + # For hyperparameter optimization, see trainer.py file + if sweep: + episodic_returns_list = [] + corresponding_steps = [] + + import wandb + wandb.init() + + config = wandb.config + + for key, value in vars(args).items(): + if key in config: + setattr(args, key, config[key]) + + + else : + + if args.track: + import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv( + [make_env(args.env_id, args.seed, i, args.capture_video, run_name) for i in range(args.num_envs)] + ) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + + vae = VAE(envs, + latent_dim=args.vae_latent_dim, + clip_vae=args.clip_vae).to(device) + vae_optimizer = optim.Adam(vae.parameters(), lr=args.vae_lr) + + # Automatic entropy tuning + if args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + 
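+        # Editor's note: standard SAC entropy auto-tuning. `alpha` is exp(log_alpha),
+        # and log_alpha is trained (optimizer created just below) so that the policy
+        # entropy stays near target_entropy = -|action_dim|; the corresponding
+        # alpha_loss is computed inside the training loop.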
a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + + # The replay buffer parameters have been updated to handle multiple envs + rb = ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + n_envs=args.num_envs + ) + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. + next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + if info is not None: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + if sweep: + episodic_returns_list.append(info["episode"]["r"]) + corresponding_steps.append(global_step) + else: + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, trunc in enumerate(truncations): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + rb.add(obs, real_next_obs, actions, rewards, terminations, infos) + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. 
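+        # Editor's note (descriptive comment): the auxiliary-VAE exploration bonus works
+        # as follows. Every `vae_frequency` steps the VAE is refit for `vae_epochs`
+        # minibatches on replayed observations (inputs scaled by `scale_l`, reconstructions
+        # clamped to +/- `clip_vae`). During the SAC update the per-sample, un-reduced VAE
+        # loss, reconstruction MSE + KL(q(z|s) || N(0, I)), is reused as the intrinsic
+        # reward: poorly modelled (rarely visited) states receive a larger bonus, scaled
+        # by `coef_intrinsic` and optionally mixed with `coef_extrinsic` * extrinsic reward.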
+ if global_step > args.learning_starts: + + if global_step % args.vae_frequency == 0: + mean_vae_loss = 0.0 + for _ in range(args.vae_epochs): + data = rb.sample(args.batch_size) + + vae_loss = vae.loss(data.observations, reduce=True) + vae_optimizer.zero_grad() + vae_loss.backward() + vae_optimizer.step() + mean_vae_loss += vae_loss.item() + + mean_vae_loss /= args.vae_epochs + if not sweep: + writer.add_scalar("losses/vae_loss", mean_vae_loss, global_step) + + + + data = rb.sample(args.batch_size) + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) + qf1_next_target = qf1_target(data.next_observations, next_state_actions) + qf2_next_target = qf2_target(data.next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + intrinsic_reward = vae.loss(data.observations, reduce = False) + extrinsic_reward = data.rewards.flatten() + if args.keep_extrinsic_reward: + rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic + else: + rewards = intrinsic_reward.flatten() *args.coef_intrinsic + next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + + qf1_a_values = qf1(data.observations, data.actions).view(-1) + qf2_a_values = qf2(data.observations, data.actions).view(-1) + + + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data.observations) + qf1_pi = qf1(data.observations, pi) + qf2_pi = qf2(data.observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data.observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target networks + if global_step % args.target_network_frequency == 0: + for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0 and not sweep: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + 
writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + writer.add_scalar("specific/intrinsic_reward_mean", intrinsic_reward.mean().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_max", intrinsic_reward.max().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_min", intrinsic_reward.min().item(), global_step) + + envs.close() + if sweep: + return episodic_returns_list, corresponding_steps + writer.close() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/sac_continuous_action.py b/cleanrl/cleanrl_explo/sac_continuous_action.py new file mode 100644 index 00000000..d28cb8e2 --- /dev/null +++ b/cleanrl/cleanrl_explo/sac_continuous_action.py @@ -0,0 +1,310 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import SummaryWriter + + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 1 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = False + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "cleanRL" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = False + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Hopper-v4" + """the environment id of the task""" + total_timesteps: int = 1000000 + """total timesteps of the experiments""" + buffer_size: int = int(1e6) + """the replay memory buffer size""" + gamma: float = 0.99 + """the discount factor gamma""" + tau: float = 0.005 + """target smoothing coefficient (default: 0.005)""" + batch_size: int = 256 + """the batch size of sample from the reply memory""" + learning_starts: int = 5e3 + """timestep to start learning""" + policy_lr: float = 3e-4 + """the learning rate of the policy network optimizer""" + q_lr: float = 1e-3 + """the learning rate of the Q network network optimizer""" + policy_frequency: int = 2 + """the frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. 
+ """the frequency of updates for the target nerworks""" + alpha: float = 0.2 + """Entropy regularization coefficient.""" + autotune: bool = True + """automatic tuning of the entropy coefficient""" + + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.action_space.high - env.action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.action_space.high + env.action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + + +if __name__ == "__main__": + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + if args.track: + import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda 
else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv([make_env(args.env_id, args.seed, 0, args.capture_video, run_name)]) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + + # Automatic entropy tuning + if args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + rb = ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + ) + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. + next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, trunc in enumerate(truncations): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + rb.add(obs, real_next_obs, actions, rewards, terminations, infos) + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. 
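+        # Editor's note: this file is the plain SAC baseline that the exploration variants
+        # in this folder build on. The critic target below is
+        #     y = r + gamma * (1 - d) * ( min(Q1', Q2')(s', a') - alpha * log pi(a'|s') ),  a' ~ pi(.|s'),
+        # both Q networks regress onto y, the actor minimises alpha * log pi - min(Q1, Q2)
+        # every `policy_frequency` steps (with `policy_frequency` gradient steps to
+        # compensate for the delay), and the target networks are Polyak-averaged with
+        # coefficient `tau`.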
+ if global_step > args.learning_starts: + data = rb.sample(args.batch_size) + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) + qf1_next_target = qf1_target(data.next_observations, next_state_actions) + qf2_next_target = qf2_target(data.next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + next_q_value = data.rewards.flatten() + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + qf1_a_values = qf1(data.observations, data.actions).view(-1) + qf2_a_values = qf2(data.observations, data.actions).view(-1) + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data.observations) + qf1_pi = qf1(data.observations, pi) + qf2_pi = qf2(data.observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data.observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target networks + if global_step % args.target_network_frequency == 0: + for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + + envs.close() + writer.close() diff --git a/cleanrl/cleanrl_explo/sac_icm.py b/cleanrl/cleanrl_explo/sac_icm.py new file mode 100644 index 00000000..8909ece9 --- /dev/null +++ b/cleanrl/cleanrl_explo/sac_icm.py @@ -0,0 +1,454 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import 
SummaryWriter + + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 12 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = True + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "SAC - exploration with ICM" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = True + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Hopper-v4" + """the environment id of the task""" + total_timesteps: int = 200000 + """total timesteps of the experiments""" + num_envs: int = 4 + """the number of parallel game environments to run""" + buffer_size: int = int(1e6) + """the replay memory buffer size""" + gamma: float = 0.99 + """the discount factor gamma""" + tau: float = 0.005 + """target smoothing coefficient (default: 0.005)""" + batch_size: int = 256 + """the batch size of sample from the reply memory""" + learning_starts: int = 5e3 + """timestep to start learning""" + policy_lr: float = 3e-4 + """the learning rate of the policy network optimizer""" + q_lr: float = 1e-3 + """the learning rate of the Q network network optimizer""" + policy_frequency: int = 2 + """the frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. + """the frequency of updates for the target nerworks""" + alpha: float = 0.2 + """Entropy regularization coefficient.""" + autotune: bool = True + """automatic tuning of the entropy coefficient""" + + + + # icm specific arguments + icm_lr: float = 0.00082 + """the learning rate of the icm""" + icm_epochs: int = 4 + """the number of epochs for the icm""" + icm_frequency: int = 1000 + """the frequency of training icm""" + beta: float = 0.1083 + """the beta of the icm""" + clip_intrinsic_reward: float = 10.0 + """the clipping of the intrinsic reward""" + feature_dim: int = 64 + + + keep_extrinsic_reward: bool = False + """if toggled, the extrinsic reward will be kept""" + coef_intrinsic : float = 84.185 + """the coefficient of the intrinsic reward""" + coef_extrinsic : float = 1.96 + """the coefficient of the extrinsic reward""" + + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = 
nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + + +class ICM(nn.Module): + def __init__(self, envs, feature_dim = 64, beta = 0.2): + super(ICM, self).__init__() + state_dim = np.prod(envs.single_observation_space.shape) + action_dim = np.prod(envs.single_action_space.shape) + # feature network + self.f1 = nn.Linear(state_dim, 256) + self.f2 = nn.Linear(256, 64) + self.f3 = nn.Linear(64, feature_dim) + # inverse model + self.i1 = nn.Linear(2*feature_dim, 64) + self.i2 = nn.Linear(64, action_dim) + # forward model + self.fo1 = nn.Linear(feature_dim + action_dim, 64) + self.fo2 = nn.Linear(64, feature_dim) + # beta + self.beta = beta + + def feature(self, x): + x = F.relu(self.f1(x)) + x = F.relu(self.f2(x)) + x = self.f3(x) + return x + def inverse(self, f1, f2): + x = torch.cat([f1, f2], dim = 1) + x = F.relu(self.i1(x)) + x = self.i2(x) + return x + def forward_t(self, f1, a): + x = torch.cat([f1, a], dim = 1) + x = F.relu(self.fo1(x)) + x = self.fo2(x) + return x + + def loss(self, obs, next_obs, action, reduce = True): + # feature + f = self.feature(obs) + f_next = self.feature(next_obs) + # inverse + a_pred = self.inverse(f, f_next) + # forward + f_next_pred = self.forward_t(f, action) + # loss + loss_inverse = F.mse_loss(a_pred, action, reduction = 'none').sum(1) if not reduce else F.mse_loss(a_pred, action) + loss_forward = F.mse_loss(f_next_pred, f_next, reduction = 'none').sum(1) if not reduce else F.mse_loss(f_next_pred, f_next) + return self.beta * loss_forward + (1 - self.beta) * loss_inverse + + +def main(seed=None, sweep=False): + + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + if seed is not None: + args.seed = seed + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + + + # For hyperparameter optimization, see trainer.py file + if sweep: + episodic_returns_list = [] + corresponding_steps = [] + + import wandb + wandb.init() + + config = wandb.config + + for key, value in vars(args).items(): + if key in config: + setattr(args, key, config[key]) + + + else : + + if args.track: + import wandb + + wandb.init( + 
project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv( + [make_env(args.env_id, args.seed, i, args.capture_video, run_name) for i in range(args.num_envs)] + ) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + icm = ICM(envs, + feature_dim=64, + beta=args.beta).to(device) + icm_optimizer = optim.Adam(icm.parameters(), lr=args.icm_lr) + + # Automatic entropy tuning + if args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + + # The replay buffer parameters have been updated to handle multiple envs + rb = ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + n_envs=args.num_envs + ) + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. 
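+        # Editor's note: transitions collected below go into the replay buffer and are
+        # reused both for the SAC update and, every `icm_frequency` steps, for refitting
+        # the ICM. The per-sample ICM loss
+        #     beta * ||f(phi(s), a) - phi(s')||^2 + (1 - beta) * ||g(phi(s), phi(s')) - a||^2
+        # then serves directly as the intrinsic reward (the original ICM formulation uses
+        # only the forward-model term as the bonus; this implementation keeps both terms).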
+ next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + if info is not None: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + if sweep: + episodic_returns_list.append(info["episode"]["r"]) + corresponding_steps.append(global_step) + else: + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, trunc in enumerate(truncations): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + rb.add(obs, real_next_obs, actions, rewards, terminations, infos) + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. + if global_step > args.learning_starts: + + if global_step % args.icm_frequency == 0: + mean_icm_loss = 0.0 + for _ in range(args.icm_epochs): + data = rb.sample(args.batch_size) + + icm_loss = icm.loss(data.observations, data.next_observations, data.actions, reduce = True) + icm_optimizer.zero_grad() + icm_loss.backward() + icm_optimizer.step() + mean_icm_loss += icm_loss.item() + + mean_icm_loss /= args.icm_epochs + if not sweep: + writer.add_scalar("losses/icm_loss", mean_icm_loss, global_step) + + + + data = rb.sample(args.batch_size) + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) + qf1_next_target = qf1_target(data.next_observations, next_state_actions) + qf2_next_target = qf2_target(data.next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + intrinsic_reward = icm.loss(data.observations, data.next_observations, data.actions, reduce = False) + extrinsic_reward = data.rewards.flatten() + if args.keep_extrinsic_reward: + rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic + else: + rewards = intrinsic_reward.flatten() *args.coef_intrinsic + next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + qf1_a_values = qf1(data.observations, data.actions).view(-1) + qf2_a_values = qf2(data.observations, data.actions).view(-1) + + + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data.observations) + qf1_pi = qf1(data.observations, pi) + qf2_pi = qf2(data.observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data.observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target networks + if global_step % args.target_network_frequency == 0: + for param, 
target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0 and not sweep: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + writer.add_scalar("specific/intrinsic_reward_mean", intrinsic_reward.mean().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_max", intrinsic_reward.max().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_min", intrinsic_reward.min().item(), global_step) + + envs.close() + if sweep: + return episodic_returns_list, corresponding_steps + writer.close() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/sac_ngu.py b/cleanrl/cleanrl_explo/sac_ngu.py new file mode 100644 index 00000000..1200bf07 --- /dev/null +++ b/cleanrl/cleanrl_explo/sac_ngu.py @@ -0,0 +1,572 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import SummaryWriter + + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 12 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = True + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "SAC - exploration with NGU" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = True + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Hopper-v4" + """the environment id of the task""" + total_timesteps: int = 200000 + """total timesteps of the experiments""" + num_envs: int = 4 + """the number of parallel game environments to run""" + buffer_size: int = int(1e6) + """the replay memory buffer size""" + gamma: float = 0.99 + """the discount factor gamma""" + tau: float = 0.005 + """target smoothing coefficient (default: 0.005)""" + batch_size: int = 256 + """the batch size of sample from the reply memory""" + learning_starts: int = 5e3 + """timestep to start learning""" + policy_lr: 
float = 3e-4 + """the learning rate of the policy network optimizer""" + q_lr: float = 1e-3 + """the learning rate of the Q network network optimizer""" + policy_frequency: int = 2 + """the frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. + """the frequency of updates for the target nerworks""" + alpha: float = 0.2 + """Entropy regularization coefficient.""" + autotune: bool = True + """automatic tuning of the entropy coefficient""" + + + + # NGU specific arguments + ngu_lr: float = 0.00004501 + """the learning rate of the NGU""" + ngu_epochs: int = 4 + """the number of epochs for the NGU""" + ngu_frequency: int = 900 + """the frequency of training NGU""" + ngu_feature_dim: int = 64 + """the feature dimension of the NGU""" + k_nearest: int = 6 + """the number of nearest neighbors for the NGU""" + clip_reward: float = 0.3656 + """the clipping value of the reward""" + c: float = 0.001 + """the constant used not to divide by zero""" + L: float = 5.0 + """the maximum value for the multiplier in the intrinsic reward of NGU""" + epsilon_kernel: float = 1e-3 + """the epsilon value for the kernel of the NGU""" + + + keep_extrinsic_reward: bool = False + """if toggled, the extrinsic reward will be kept""" + coef_intrinsic : float = 48.311 + """the coefficient of the intrinsic reward""" + coef_extrinsic : float = 7.099 + """the coefficient of the extrinsic reward""" + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + +class NGU_ReplayBuffer(): + def __init__(self, buffer_size, observation_space, action_space, device, handle_timeout_termination=False, n_envs=1): + self.buffer_size = buffer_size + self.device = device + self.handle_timeout_termination = handle_timeout_termination + self.n_envs = n_envs + + self.observations = np.zeros((buffer_size, n_envs) + observation_space.shape, dtype=np.float32) + self.next_observations = np.zeros((buffer_size, n_envs) + observation_space.shape, dtype=np.float32) + self.actions = np.zeros((buffer_size, n_envs) + action_space.shape, dtype=np.float32) + self.rewards = np.zeros((buffer_size, n_envs), dtype=np.float32) + self.rewards_ngu = np.zeros((buffer_size, n_envs), dtype=np.float32) + self.dones = np.zeros((buffer_size, n_envs), dtype=np.float32) + self.ptr, self.size, self.max_size = 0, 0, buffer_size + + def add(self, obs, next_obs, action, reward, reward_ngu, done, info): + self.observations[self.ptr] = obs + self.next_observations[self.ptr] = next_obs + self.actions[self.ptr] = action + self.rewards[self.ptr] = reward + self.rewards_ngu[self.ptr] = reward_ngu + self.dones[self.ptr] = done + self.ptr = (self.ptr + 1) % self.max_size + self.size = min(self.size + 1, self.max_size) + + def sample(self, batch_size): + idxs = np.random.randint(0, self.size, size=batch_size) + idxs_2 = np.random.randint(0, self.n_envs, size=batch_size) + return ( + torch.as_tensor(self.observations[idxs,idxs_2,:], device=self.device), + torch.as_tensor(self.next_observations[idxs,idxs_2,:], device=self.device), + torch.as_tensor(self.actions[idxs,idxs_2,:], device=self.device), + torch.as_tensor(self.rewards[idxs,idxs_2], device=self.device), + torch.as_tensor(self.rewards_ngu[idxs,idxs_2], 
device=self.device), + torch.as_tensor(self.dones[idxs,idxs_2], device=self.device), + ) + + def __len__(self): + return self.size + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + +class NGU(nn.Module): + def __init__(self, envs, feature_dim, k_nearest, clip_reward, c, L, epsilon_kernel): + super().__init__() + state_dim = np.prod(envs.single_observation_space.shape) + action_dim = np.prod(envs.single_action_space.shape) + self.feature_dim = feature_dim + self.k_nearest = k_nearest + self.clip_reward = clip_reward + self.c = c + self.L = L + self.epsilon_kernel = epsilon_kernel + + # RND + # trained network + self.f1 = nn.Linear(state_dim, 128) + self.f2 = nn.Linear(128, 64) + self.f3 = nn.Linear(64, 1) + # target network + self.f1_t = nn.Linear(state_dim, 128) + self.f2_t = nn.Linear(128, 64) + self.f3_t = nn.Linear(64, 1) + # embedding network + self.f1_z = nn.Linear(state_dim, 128) + self.f2_z = nn.Linear(128, 64) + self.f3_z = nn.Linear(64, feature_dim) + # action network + self.f1_a = nn.Linear(feature_dim*2 , 128) + self.f2_a = nn.Linear(128, 64) + self.f3_a = nn.Linear(64, action_dim) + # running average of the squared Euclidean distance of the k-th nearest neighbors + self.dm2 = 0.0 + + def forward(self, x): + x = F.relu(self.f1(x)) + x = F.relu(self.f2(x)) + x = self.f3(x) + return x + + def forward_t(self, x): + with torch.no_grad(): + x = F.relu(self.f1_t(x)) + x = F.relu(self.f2_t(x)) + x = self.f3_t(x) + return x + + def rnd_loss(self, x, reduce = True): + return F.mse_loss(self.forward(x), self.forward_t(x)) if reduce else F.mse_loss(self.forward(x), self.forward_t(x), 
reduction = 'none') + + def embedding(self, s): + x = F.relu(self.f1_z(s)) + x = F.relu(self.f2_z(x)) + x = self.f3_z(x) + return x + + def action_pred(self, s0, s1): + x = torch.cat([s0, s1], 1) + x = F.relu(self.f1_a(x)) + x = F.relu(self.f2_a(x)) + x = self.f3_a(x) + return x + + def reward_episode(self, s, episode): + z_s = self.embedding(s) + z_episode = self.embedding(episode) + + dist = torch.norm(z_s - z_episode, dim=1) + kernel = self.epsilon_kernel/(dist/self.dm2 + self.epsilon_kernel) + top_k_kernel = torch.topk(kernel, self.k_nearest, largest = True) + top_k = torch.topk(dist, self.k_nearest, largest = False) + self.dm2 = 0.99 * self.dm2 + 0.01 * top_k.values.mean().item() + reward_episodic = (1/(torch.sqrt(top_k_kernel.values.mean()) + self.c)).item() + + return reward_episodic + + + + def loss(self,s,s_next,a,d): + rnd_loss = self.rnd_loss(s) + + s0 = self.embedding(s) + s1 = self.embedding(s_next) + h_loss = torch.norm(self.action_pred(s0, s1) - a, dim=1) * (1-d) + + return rnd_loss + h_loss.mean() + +def main(seed=None, sweep=False): + + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + if seed is not None: + args.seed = seed + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + + + # For hyperparameter optimization, see trainer.py file + if sweep: + episodic_returns_list = [] + corresponding_steps = [] + + import wandb + wandb.init() + + config = wandb.config + + for key, value in vars(args).items(): + if key in config: + setattr(args, key, config[key]) + + + else : + + if args.track: + import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv( + [make_env(args.env_id, args.seed, i, args.capture_video, run_name) for i in range(args.num_envs)] + ) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + + ngu = NGU(envs, + feature_dim = args.ngu_feature_dim, + k_nearest = args.k_nearest, + clip_reward = args.clip_reward, + c = args.c, + L = args.L, + epsilon_kernel = args.epsilon_kernel + ).to(device) + ngu_optimizer = optim.Adam(ngu.parameters(), lr=args.ngu_lr) + episodes = [ [] for _ in range(args.num_envs)] + + # Automatic entropy tuning + if 
args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + + + # This replay buffer is hand designed for NGU + # The replay buffer parameters have been updated to handle multiple envs + rb = NGU_ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + n_envs=args.num_envs + ) + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, _ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. + next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + # COMPUTE REWARD + reward_ngu = torch.zeros(args.num_envs) + for idx in range(args.num_envs): + with torch.no_grad(): + reward_ngu[idx] = ngu.reward_episode(torch.tensor(obs[idx]).unsqueeze(0).float().to(device), torch.tensor(np.array(episodes[idx])).float().to(device)) if len(episodes[idx]) > args.k_nearest else 0.0 + + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + if info is not None: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + if sweep: + episodic_returns_list.append(info["episode"]["r"]) + corresponding_steps.append(global_step) + else: + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, (done_, trunc) in enumerate(zip(terminations,truncations)): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + if done_ or trunc: + episodes[idx] = [] + rb.add(obs, real_next_obs, actions, rewards, reward_ngu, terminations, infos) + + for idx, ob in enumerate(obs): + episodes[idx].append(ob) + + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. 
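Before the training branch marked above (it resumes immediately after this note), it may help to see the episodic bonus that reward_ngu carries: NGU.reward_episode embeds the current state and the states already visited in the episode, then turns the k nearest embedding distances into an inverse kernel count. Below is a self-contained, hedged sketch with toy tensors; no name or value in it comes from the patch.

import torch

k, c, eps = 6, 0.001, 1e-3
dm2 = 1.0                             # running average of k-NN distances, updated on every call
z_s = torch.randn(16)                 # embedding of the current state
z_episode = torch.randn(50, 16)       # embeddings of the states seen so far this episode

dist = torch.norm(z_s - z_episode, dim=1)
kernel = eps / (dist / dm2 + eps)     # close to 1 for near-duplicate states, near 0 for distant ones
top_k_kernel = torch.topk(kernel, k, largest=True).values
bonus = 1.0 / (torch.sqrt(top_k_kernel.mean()) + c)   # large when nothing similar was visited
dm2 = 0.99 * dm2 + 0.01 * torch.topk(dist, k, largest=False).values.mean().item()
print(bonus.item(), dm2)

In the Q-target computation below, this bonus is then multiplied by the RND error clipped to the range [1, L], and the product is clipped to plus or minus clip_reward before being scaled by coef_intrinsic.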
+ if global_step > args.learning_starts: + + if global_step % args.ngu_frequency == 0: + mean_ngu_loss = 0.0 + for _ in range(args.ngu_epochs): + data = rb.sample(args.batch_size) + data_observations = data[0] + data_next_observations = data[1] + data_actions = data[2] + data_rewards = data[3] + data_rewards_ngu = data[4] + data_dones = data[5] + + ngu_loss = ngu.loss(data_observations, data_next_observations, data_actions, data_dones) + ngu_optimizer.zero_grad() + ngu_loss.backward() + ngu_optimizer.step() + mean_ngu_loss += ngu_loss.item() + + mean_ngu_loss /= args.ngu_epochs + if not sweep: + writer.add_scalar("losses/ngu_loss", mean_ngu_loss, global_step) + + + + data = rb.sample(args.batch_size) + data_observations = data[0] + data_next_observations = data[1] + data_actions = data[2] + data_rewards = data[3] + data_rewards_ngu = data[4] + data_dones = data[5] + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data_next_observations) + qf1_next_target = qf1_target(data_next_observations, next_state_actions) + qf2_next_target = qf2_target(data_next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + rnd_loss = ngu.rnd_loss(data_observations, reduce = False) + intrinsic_reward = data_rewards_ngu * torch.min(torch.max(rnd_loss.flatten(), torch.tensor(1).to(device)), torch.tensor(args.L).to(device)) + intrinsic_reward = torch.clip(intrinsic_reward, -args.clip_reward, args.clip_reward) + extrinsic_reward = data_rewards.flatten() + if args.keep_extrinsic_reward: + rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward*args.coef_intrinsic + else: + rewards = intrinsic_reward *args.coef_intrinsic + next_q_value = rewards + (1 - data_dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + + qf1_a_values = qf1(data_observations, data_actions).view(-1) + qf2_a_values = qf2(data_observations, data_actions).view(-1) + + + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data_observations) + qf1_pi = qf1(data_observations, pi) + qf2_pi = qf2(data_observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data_observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target networks + if global_step % args.target_network_frequency == 0: + for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0 and not sweep: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + 
writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + writer.add_scalar("specific/intrinsic_reward_mean", intrinsic_reward.mean().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_max", intrinsic_reward.max().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_min", intrinsic_reward.min().item(), global_step) + + envs.close() + if sweep: + return episodic_returns_list, corresponding_steps + writer.close() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/sac_rnd.py b/cleanrl/cleanrl_explo/sac_rnd.py new file mode 100644 index 00000000..98462e98 --- /dev/null +++ b/cleanrl/cleanrl_explo/sac_rnd.py @@ -0,0 +1,426 @@ +# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/sac/#sac_continuous_actionpy +import os +import random +import time +from dataclasses import dataclass + +import gymnasium as gym +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +import tyro +from stable_baselines3.common.buffers import ReplayBuffer +from torch.utils.tensorboard import SummaryWriter + +@dataclass +class Args: + exp_name: str = os.path.basename(__file__)[: -len(".py")] + """the name of this experiment""" + seed: int = 12 + """seed of the experiment""" + torch_deterministic: bool = True + """if toggled, `torch.backends.cudnn.deterministic=False`""" + cuda: bool = True + """if toggled, cuda will be enabled by default""" + track: bool = True + """if toggled, this experiment will be tracked with Weights and Biases""" + wandb_project_name: str = "SAC - exploration with RND" + """the wandb's project name""" + wandb_entity: str = None + """the entity (team) of wandb's project""" + capture_video: bool = True + """whether to capture videos of the agent performances (check out `videos` folder)""" + + # Algorithm specific arguments + env_id: str = "Hopper-v4" + """the environment id of the task""" + total_timesteps: int = 200000 + """total timesteps of the experiments""" + num_envs: int = 4 + """the number of parallel game environments to run""" + buffer_size: int = int(1e6) + """the replay memory buffer size""" + gamma: float = 0.99 + """the discount factor gamma""" + tau: float = 0.005 + """target smoothing coefficient (default: 0.005)""" + batch_size: int = 256 + """the batch size of sample from the reply memory""" + learning_starts: int = 5e3 + """timestep to start learning""" + policy_lr: float = 3e-4 + """the learning rate of the policy network optimizer""" + q_lr: float = 1e-3 + """the learning rate of the Q network network optimizer""" + policy_frequency: int = 2 + """the frequency of training policy (delayed)""" + target_network_frequency: int = 1 # Denis Yarats' implementation delays this by 2. 
+ """the frequency of updates for the target nerworks""" + alpha: float = 0.2 + """Entropy regularization coefficient.""" + autotune: bool = True + """automatic tuning of the entropy coefficient""" + + + + # RND specific arguments + rnd_lr: float = 0.004866 + """the learning rate of the RND""" + rnd_epochs: int = 4 + """the number of epochs for the RND""" + rnd_frequency: int = 900 + """the frequency of training RND""" + + + keep_extrinsic_reward: bool = False + """if toggled, the extrinsic reward will be kept""" + coef_intrinsic : float = 47.016 + """the coefficient of the intrinsic reward""" + coef_extrinsic : float = 1.631 + """the coefficient of the extrinsic reward""" + +def make_env(env_id, seed, idx, capture_video, run_name): + def thunk(): + if capture_video and idx == 0: + env = gym.make(env_id, render_mode="rgb_array") + env = gym.wrappers.RecordVideo(env, f"videos/{run_name}") + else: + env = gym.make(env_id) + env = gym.wrappers.RecordEpisodeStatistics(env) + env.action_space.seed(seed) + return env + + return thunk + + +# ALGO LOGIC: initialize agent here: +class SoftQNetwork(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod() + np.prod(env.single_action_space.shape), 256) + self.fc2 = nn.Linear(256, 256) + self.fc3 = nn.Linear(256, 1) + + def forward(self, x, a): + x = torch.cat([x, a], 1) + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +LOG_STD_MAX = 2 +LOG_STD_MIN = -5 + + +class Actor(nn.Module): + def __init__(self, env): + super().__init__() + self.fc1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.fc2 = nn.Linear(256, 256) + self.fc_mean = nn.Linear(256, np.prod(env.single_action_space.shape)) + self.fc_logstd = nn.Linear(256, np.prod(env.single_action_space.shape)) + # action rescaling + self.register_buffer( + "action_scale", torch.tensor((env.single_action_space.high - env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + self.register_buffer( + "action_bias", torch.tensor((env.single_action_space.high + env.single_action_space.low) / 2.0, dtype=torch.float32) + ) + + def forward(self, x): + x = F.relu(self.fc1(x)) + x = F.relu(self.fc2(x)) + mean = self.fc_mean(x) + log_std = self.fc_logstd(x) + log_std = torch.tanh(log_std) + log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) # From SpinUp / Denis Yarats + + return mean, log_std + + def get_action(self, x): + mean, log_std = self(x) + std = log_std.exp() + normal = torch.distributions.Normal(mean, std) + x_t = normal.rsample() # for reparameterization trick (mean + std * N(0,1)) + y_t = torch.tanh(x_t) + action = y_t * self.action_scale + self.action_bias + log_prob = normal.log_prob(x_t) + # Enforcing Action Bound + log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + 1e-6) + log_prob = log_prob.sum(1, keepdim=True) + mean = torch.tanh(mean) * self.action_scale + self.action_bias + return action, log_prob, mean + + +class RND(nn.Module): + def __init__(self, env): + super(RND, self).__init__() + # trained network + self.f1 = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.f2 = nn.Linear(256, 256) + self.f3 = nn.Linear(256, 1) + # target network + self.f1_t = nn.Linear(np.array(env.single_observation_space.shape).prod(), 256) + self.f2_t = nn.Linear(256, 256) + self.f3_t = nn.Linear(256, 1) + + def forward(self, x): + x = F.relu(self.f1(x)) + x = F.relu(self.f2(x)) + x = self.f3(x) + return x + + def 
forward_t(self, x): + with torch.no_grad(): + x = F.relu(self.f1_t(x)) + x = F.relu(self.f2_t(x)) + x = self.f3_t(x) + return x + + def loss(self, x, reduce = True): + return F.mse_loss(self.forward(x), self.forward_t(x)) if reduce else F.mse_loss(self.forward(x), self.forward_t(x), reduction = 'none') + + +def main(seed=None, sweep=False): + + import stable_baselines3 as sb3 + + if sb3.__version__ < "2.0": + raise ValueError( + """Ongoing migration: run the following command to install the new dependencies: +poetry run pip install "stable_baselines3==2.0.0a1" +""" + ) + + args = tyro.cli(Args) + if seed is not None: + args.seed = seed + run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}" + + + # For hyperparameter optimization, see trainer.py file + if sweep: + episodic_returns_list = [] + corresponding_steps = [] + + import wandb + wandb.init() + + config = wandb.config + + for key, value in vars(args).items(): + if key in config: + setattr(args, key, config[key]) + + + else : + + if args.track: + import wandb + + wandb.init( + project=args.wandb_project_name, + entity=args.wandb_entity, + sync_tensorboard=True, + config=vars(args), + name=run_name, + monitor_gym=True, + save_code=True, + ) + writer = SummaryWriter(f"runs/{run_name}") + writer.add_text( + "hyperparameters", + "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])), + ) + + # TRY NOT TO MODIFY: seeding + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.backends.cudnn.deterministic = args.torch_deterministic + + device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu") + + # env setup + envs = gym.vector.SyncVectorEnv( + [make_env(args.env_id, args.seed, i, args.capture_video, run_name) for i in range(args.num_envs)] + ) + assert isinstance(envs.single_action_space, gym.spaces.Box), "only continuous action space is supported" + + max_action = float(envs.single_action_space.high[0]) + + actor = Actor(envs).to(device) + qf1 = SoftQNetwork(envs).to(device) + qf2 = SoftQNetwork(envs).to(device) + qf1_target = SoftQNetwork(envs).to(device) + qf2_target = SoftQNetwork(envs).to(device) + qf1_target.load_state_dict(qf1.state_dict()) + qf2_target.load_state_dict(qf2.state_dict()) + q_optimizer = optim.Adam(list(qf1.parameters()) + list(qf2.parameters()), lr=args.q_lr) + actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.policy_lr) + + rnd = RND(envs).to(device) + rnd_optimizer = optim.Adam(list(rnd.parameters()), lr=args.rnd_lr) + + # Automatic entropy tuning + if args.autotune: + target_entropy = -torch.prod(torch.Tensor(envs.single_action_space.shape).to(device)).item() + log_alpha = torch.zeros(1, requires_grad=True, device=device) + alpha = log_alpha.exp().item() + a_optimizer = optim.Adam([log_alpha], lr=args.q_lr) + else: + alpha = args.alpha + + envs.single_observation_space.dtype = np.float32 + + # The replay buffer parameters have been updated to handle multiple envs + rb = ReplayBuffer( + args.buffer_size, + envs.single_observation_space, + envs.single_action_space, + device, + handle_timeout_termination=False, + n_envs=args.num_envs + ) + start_time = time.time() + + # TRY NOT TO MODIFY: start the game + obs, _ = envs.reset(seed=args.seed) + for global_step in range(args.total_timesteps): + # ALGO LOGIC: put action logic here + if global_step < args.learning_starts: + actions = np.array([envs.single_action_space.sample() for _ in range(envs.num_envs)]) + else: + actions, _, 
_ = actor.get_action(torch.Tensor(obs).to(device)) + actions = actions.detach().cpu().numpy() + + # TRY NOT TO MODIFY: execute the game and log data. + next_obs, rewards, terminations, truncations, infos = envs.step(actions) + + # TRY NOT TO MODIFY: record rewards for plotting purposes + if "final_info" in infos: + for info in infos["final_info"]: + if info is not None: + print(f"global_step={global_step}, episodic_return={info['episode']['r']}") + if sweep: + episodic_returns_list.append(info["episode"]["r"]) + corresponding_steps.append(global_step) + else: + writer.add_scalar("charts/episodic_return", info["episode"]["r"], global_step) + writer.add_scalar("charts/episodic_length", info["episode"]["l"], global_step) + break + + # TRY NOT TO MODIFY: save data to reply buffer; handle `final_observation` + real_next_obs = next_obs.copy() + for idx, trunc in enumerate(truncations): + if trunc: + real_next_obs[idx] = infos["final_observation"][idx] + rb.add(obs, real_next_obs, actions, rewards, terminations, infos) + + # TRY NOT TO MODIFY: CRUCIAL step easy to overlook + obs = next_obs + + # ALGO LOGIC: training. + if global_step > args.learning_starts: + + if global_step % args.rnd_frequency == 0: + mean_rnd_loss = 0.0 + for _ in range(args.rnd_epochs): + data = rb.sample(args.batch_size) + + rnd_loss = rnd.loss(data.observations).mean() + rnd_optimizer.zero_grad() + rnd_loss.backward() + rnd_optimizer.step() + mean_rnd_loss += rnd_loss.item() + + mean_rnd_loss /= args.rnd_epochs + if not sweep: + writer.add_scalar("losses/vae_loss", mean_rnd_loss, global_step) + + + data = rb.sample(args.batch_size) + with torch.no_grad(): + next_state_actions, next_state_log_pi, _ = actor.get_action(data.next_observations) + qf1_next_target = qf1_target(data.next_observations, next_state_actions) + qf2_next_target = qf2_target(data.next_observations, next_state_actions) + min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - alpha * next_state_log_pi + intrinsic_reward = rnd.loss(data.observations, reduce = False) + extrinsic_reward = data.rewards.flatten() + if args.keep_extrinsic_reward: + rewards = extrinsic_reward*args.coef_extrinsic + intrinsic_reward.flatten()*args.coef_intrinsic + else: + rewards = intrinsic_reward.flatten() *args.coef_intrinsic + next_q_value = rewards + (1 - data.dones.flatten()) * args.gamma * (min_qf_next_target).view(-1) + + qf1_a_values = qf1(data.observations, data.actions).view(-1) + qf2_a_values = qf2(data.observations, data.actions).view(-1) + + + qf1_loss = F.mse_loss(qf1_a_values, next_q_value) + qf2_loss = F.mse_loss(qf2_a_values, next_q_value) + qf_loss = qf1_loss + qf2_loss + + # optimize the model + q_optimizer.zero_grad() + qf_loss.backward() + q_optimizer.step() + + if global_step % args.policy_frequency == 0: # TD 3 Delayed update support + for _ in range( + args.policy_frequency + ): # compensate for the delay by doing 'actor_update_interval' instead of 1 + pi, log_pi, _ = actor.get_action(data.observations) + qf1_pi = qf1(data.observations, pi) + qf2_pi = qf2(data.observations, pi) + min_qf_pi = torch.min(qf1_pi, qf2_pi) + actor_loss = ((alpha * log_pi) - min_qf_pi).mean() + + actor_optimizer.zero_grad() + actor_loss.backward() + actor_optimizer.step() + + if args.autotune: + with torch.no_grad(): + _, log_pi, _ = actor.get_action(data.observations) + alpha_loss = (-log_alpha.exp() * (log_pi + target_entropy)).mean() + + a_optimizer.zero_grad() + alpha_loss.backward() + a_optimizer.step() + alpha = log_alpha.exp().item() + + # update the target 
networks + if global_step % args.target_network_frequency == 0: + for param, target_param in zip(qf1.parameters(), qf1_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + for param, target_param in zip(qf2.parameters(), qf2_target.parameters()): + target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) + + if global_step % 100 == 0 and not sweep: + writer.add_scalar("losses/qf1_values", qf1_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf2_values", qf2_a_values.mean().item(), global_step) + writer.add_scalar("losses/qf1_loss", qf1_loss.item(), global_step) + writer.add_scalar("losses/qf2_loss", qf2_loss.item(), global_step) + writer.add_scalar("losses/qf_loss", qf_loss.item() / 2.0, global_step) + writer.add_scalar("losses/actor_loss", actor_loss.item(), global_step) + writer.add_scalar("losses/alpha", alpha, global_step) + print("SPS:", int(global_step / (time.time() - start_time))) + writer.add_scalar("charts/SPS", int(global_step / (time.time() - start_time)), global_step) + if args.autotune: + writer.add_scalar("losses/alpha_loss", alpha_loss.item(), global_step) + writer.add_scalar("specific/intrinsic_reward_mean", intrinsic_reward.mean().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_max", intrinsic_reward.max().item(), global_step) + writer.add_scalar("specific/intrinsic_reward_min", intrinsic_reward.min().item(), global_step) + + envs.close() + if sweep: + return episodic_returns_list, corresponding_steps + writer.close() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/cleanrl/cleanrl_explo/trainer.py b/cleanrl/cleanrl_explo/trainer.py new file mode 100644 index 00000000..d453aeba --- /dev/null +++ b/cleanrl/cleanrl_explo/trainer.py @@ -0,0 +1,172 @@ +import wandb +import importlib +import multiprocessing +import time +from dataclasses import dataclass + +import numpy as np + + +@dataclass +class Sweep_Args(): + + + ############################### IMPORTANT ################################ + """ + This code produces a sweep for training a SAC-exploration agent with different hyperparameters. + It will print an id. This id can be used to run the same sweep in parellel on different machines. + Thus, you can run the same sweep on different machines and the results will be aggregated in the same wandb project, + therefore speeding up the hyperparameter search. + + To do so you must run same script than this one on the other machines, but with the same sweep id. + So you must copy the sweep id from the output of this script and paste it in the other scripts. 
+ """ + + ########################################################################### + + available_methods = ["aux", "icm", "ngu", "rnd", "apt", "our_method"] + "All the methods available for training" + + + method = "our_method" + "The method to use for training" + environment = "LilMaze" + "The environment to use for training" + nb_of_attempts: int = 1 + "Every hyperparameter combination will be tried this many times, the average will be used" + nb_of_parallel_jobs: int = 1 + "The number of parallel agents to run (remember that several environments will already be run for every single agent)" + count: int = 10 + "The number of hyperparameter combinations to try per agent" + + + fichier = f"sac_{method}" + "The file to run for training" + project: str = f"{method} sweep {environment}" + "The project name to use in wandb" + + + + """ + In order to run the sweep, you must create a sweep configuration dictionnary. + The documentation for the sweep configuration can be found here: https://docs.wandb.ai/guides/sweeps/configuration + """ + + sweep_config = { + "method": "bayes", + "metric": {"goal": "maximize", "name": "episodic_return"}, + "parameters": { + "classifier_lr": { + "distribution": "log_uniform_values", + "max": 1e-2, + "min": 1e-5, + }, + "coef_intrinsic": { + "distribution": "log_uniform_values", + "max": 100.0, + "min": 0.1, + }, + "coef_extrinsic": { + "distribution": "log_uniform_values", + "max": 100.0, + "min": 0.1, + }, + "total_timesteps": { + 'value': 200000, + }, + 'capture_video': { + 'value': False + }, + 'keep_extrinsic_reward': { + 'value': False + }, + 'env_id': { + 'value': f"{environment}" + }, + }, + } + + + assert method in available_methods, f"method must be in {available_methods}" + + sweep_id = wandb.sweep(sweep_config, project=project) + "The sweep id to use for the sweep" + +def train(args: Sweep_Args): + + try: + module = importlib.import_module(args.fichier) + values = [] + steps = [] + for i in range(args.nb_of_attempts): + v, t = module.main(seed=i, sweep=True) + + values += v + steps += t + + + + values = np.array(values) + steps = np.array(steps).reshape(-1, 1) + + + # We use the quantile regression to get the median and the 95% confidence interval + + + from sklearn.ensemble import GradientBoostingRegressor + + gbm_median = GradientBoostingRegressor(loss="quantile", alpha=0.5, n_estimators=100) + gbm_median.fit(steps, values) + + gbm_upper = GradientBoostingRegressor(loss="quantile", alpha=0.975, n_estimators=100) + gbm_upper.fit(steps, values) + + gbm_lower = GradientBoostingRegressor(loss="quantile", alpha=0.025, n_estimators=100) + gbm_lower.fit(steps, values) + + + plot_steps = np.linspace(steps.min(), steps.max(), 200)[:, np.newaxis] + + y_pred_median = gbm_median.predict(plot_steps).ravel() + y_pred_upper = gbm_upper.predict(plot_steps).ravel() + y_pred_lower = gbm_lower.predict(plot_steps).ravel() + + + for t, min, median, max in list(zip(steps, y_pred_lower, y_pred_median, y_pred_upper)): + wandb.log({ + "episodic_return": median, + "episodic_return_upper": max, + "episodic_return_lower": min + }, step=t[0]) + + + except ModuleNotFoundError: + print(f"Erreur: le module '{args.fichier}' n'a pas été trouvé.") + except AttributeError: + print(f"Erreur: le module '{args.fichier}' n'a pas de fonction 'main'.") + except Exception as e: + print(f"Erreur: {e}") + + +def agent(index: int, args: Sweep_Args): + print(f"Agent {index} started.") + + wandb.agent(args.sweep_id, function=lambda: train(args), project=args.project, count=args.count) + + 
print(f"Agent {index} finished.") + + +if __name__ == "__main__": + + args = Sweep_Args() + processes = [] + for i in range(args.nb_of_parallel_jobs): + p = multiprocessing.Process(target=agent, args=(i, args)) + p.start() + processes.append(p) + + + for p in processes: + p.join() + + print("All processes have finished.") \ No newline at end of file