From c5ca042d8d53de3d3e370b36f5378add72540146 Mon Sep 17 00:00:00 2001
From: jsrimr
Date: Mon, 5 Jul 2021 19:17:30 +0900
Subject: [PATCH 1/2] implemented ape-x

---
 ape-x.py | 181 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 ape-x.py

diff --git a/ape-x.py b/ape-x.py
new file mode 100644
index 0000000..83978a1
--- /dev/null
+++ b/ape-x.py
@@ -0,0 +1,181 @@
+import collections
+import random
+
+import gym
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+# Hyperparameters
+learning_rate = 0.0005
+gamma = 0.98
+buffer_limit = 50000
+batch_size = 32
+
+
+class ReplayBuffer():
+    def __init__(self):
+        self.buffer = collections.deque(maxlen=buffer_limit)
+
+    def put(self, transition):
+        self.buffer.append(transition)
+
+    def sample(self, n):
+        return random.sample(self.buffer, n)
+
+    def size(self):
+        return len(self.buffer)
+
+
+class Qnet(nn.Module):
+    def __init__(self):
+        super(Qnet, self).__init__()
+        self.fc1 = nn.Linear(4, 256)
+        self.fc2 = nn.Linear(256, 2)
+
+    def forward(self, x):
+        x = F.relu(self.fc1(x))
+        x = self.fc2(x)
+        return x
+
+    def sample_action(self, obs, epsilon):
+        out = self.forward(obs)
+        coin = random.random()
+        if coin < epsilon:
+            return random.randint(0, 1)
+        else:
+            return out.argmax().item()
+
+
+def learner_process(model, target_model, exp_q):
+    learner = Learner(model, target_model, exp_q)
+    learner.run()
+
+
+class Learner:
+    def __init__(self, model, target_model, conn):
+        self.memory = ReplayBuffer()
+        self.q = model
+        self.q_target = target_model
+        self.optimizer = optim.Adam(self.q.parameters(), lr=learning_rate)
+        self.n_epochs = 0
+        self.conn = conn
+
+    def run(self):
+        while True:
+            if self.memory.size() > 2000:
+                self.train()
+                self.n_epochs += 1
+                if self.n_epochs % 10 == 0:
+                    self.q_target.load_state_dict(self.q.state_dict())
+
+            # drain the experiences the actors have pushed onto the queue
+            while not self.conn.empty():
+                try:
+                    experience = self.conn.get()
+                    self.memory.put(experience)
+                except Exception:
+                    print("memory load failed")
+
+    def train(self):
+        for i in range(3):
+            mini_batch = self.memory.sample(batch_size)
+            s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
+
+            for transition in mini_batch:
+                s, a, r, s_prime, done_mask = transition
+                s_lst.append(s)
+                a_lst.append([a])
+                r_lst.append([r])
+                s_prime_lst.append(s_prime)
+                done_mask_lst.append([done_mask])
+
+            s = torch.tensor(s_lst, dtype=torch.float)
+            a = torch.tensor(a_lst)
+            r = torch.tensor(r_lst)
+            s_prime = torch.tensor(s_prime_lst, dtype=torch.float)
+            done_mask = torch.tensor(done_mask_lst)
+
+            q_out = self.q(s)
+            q_a = q_out.gather(1, a)
+            max_q_prime = self.q_target(s_prime).max(1)[0].unsqueeze(1)
+            target = r + gamma * max_q_prime * done_mask
+            loss = F.smooth_l1_loss(q_a, target)
+
+            self.optimizer.zero_grad()
+            loss.backward()
+            self.optimizer.step()
+
+
+def actor_process(actor_id, n_actors, model, target_model, exp_q):
+    actor = Actor(actor_id, n_actors, model, target_model, exp_q)
+    actor.run()
+
+
+class Actor:
+    def __init__(self, actor_id, n_actors, model, target_model, conn):
+        self.env = gym.make('CartPole-v1')
+        self.state = self.env.reset()
+
+        self.actor_id = actor_id
+        # per-actor exploration rate; the Ape-X paper uses
+        # epsilon_i = 0.4 ** (1 + actor_id * 7 / (n_actors - 1))
+        self.epsilon = 0.1 + (actor_id / 7) / n_actors
+
+        self.memory = ReplayBuffer()
+        self.q = model
+        self.q_target = target_model
+        self.episode_reward = 0
+        self.n_episodes = 0
+        self.net_load_interval = 10
+
+        self.conn = conn
+
+    def run(self):
+        while True:
+            s = self.state
+            epsilon = max(0.01, self.epsilon - 0.01 * (self.n_episodes / 200))  # linearly anneal epsilon down to 1%
+            a = self.q.sample_action(torch.from_numpy(s).float(), epsilon)
+
+            s_prime, r, done, info = self.env.step(a)
+            done_mask = 0.0 if done else 1.0
+            self.conn.put((s, a, r / 100.0, s_prime, done_mask))
+            self.state = s_prime
+
+            self.episode_reward += r
+
+            if done:  # episode ends
+                self.state = self.env.reset()
+                self.n_episodes += 1
+
+                if self.n_episodes % 20 == 0:
+                    print('episodes:', self.n_episodes, 'actor_id:', self.actor_id, 'reward:', self.episode_reward)
+                self.episode_reward = 0
+
+
+def main():
+    model = Qnet()
+    target_model = Qnet()
+    target_model.load_state_dict(model.state_dict())
+    model.share_memory()
+    target_model.share_memory()
+
+    q = mp.Queue()
+
+    # learner process
+    processes = [mp.Process(
+        target=learner_process,
+        args=(model, target_model, q))]
+
+    # actor processes
+    n_actors = 2
+    for actor_id in range(n_actors):
+        processes.append(mp.Process(
+            target=actor_process,
+            args=(actor_id, n_actors, model, target_model, q)))
+
+    for p in processes:
+        p.start()
+
+    for p in processes:
+        p.join()
+
+
+if __name__ == '__main__':
+    main()
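The per-actor exploration rate above (`0.1 + (actor_id / 7) / n_actors`) is a simplified stand-in for the schedule quoted in the inline comment: in the Ape-X paper, actor i of N keeps a fixed epsilon_i = 0.4 ** (1 + 7 * i / (N - 1)), spreading the rates geometrically between 0.4 and 0.4 ** 8. A minimal sketch of that schedule (the helper name is illustrative, not part of the patch):

def apex_epsilons(n_actors, base_eps=0.4, alpha=7):
    # eps_i = base_eps ** (1 + i * alpha / (n_actors - 1)), as referenced in the comment above
    if n_actors == 1:
        return [base_eps]
    return [base_eps ** (1 + i * alpha / (n_actors - 1)) for i in range(n_actors)]

# apex_epsilons(2) -> [0.4, 0.4 ** 8 ~= 0.00066]
# apex_epsilons(8) -> [0.4, 0.16, 0.064, ..., 0.00066]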
From 12e6bb1d4412dd32f5384211793b2e09a9dfcfca Mon Sep 17 00:00:00 2001
From: jsrimr
Date: Fri, 9 Jul 2021 17:31:59 +0900
Subject: [PATCH 2/2] implemented r2d2: converges at around 3000 episodes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 r2d2.py | 298 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 298 insertions(+)
 create mode 100644 r2d2.py

diff --git a/r2d2.py b/r2d2.py
new file mode 100644
index 0000000..f24746d
--- /dev/null
+++ b/r2d2.py
@@ -0,0 +1,298 @@
+import random
+from collections import deque, namedtuple
+from multiprocessing.managers import SyncManager
+
+import gym
+import numpy as np
+import torch
+import torch.multiprocessing as mp
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+# Hyperparameters
+learning_rate = 0.0005
+gamma = 0.98
+buffer_limit = 100
+batch_size = 32
+cell_size = 16
+sequence_length = 10
+over_lapping_length = 5
+burn_in_length = 3
+
+Transition = namedtuple('Transition',
+                        ('state', 'next_state', 'action', 'reward', 'mask', 'rnn_state', 'target_rnn_state'))
+
+
+class Qnet(nn.Module):
+    def __init__(self, num_inputs=4, num_outputs=2):
+        super().__init__()
+        self.num_inputs = num_inputs
+        self.num_outputs = num_outputs
+
+        self.lstm = nn.LSTM(input_size=num_inputs, hidden_size=cell_size, batch_first=True)
+        self.fc1 = nn.Linear(cell_size, 32)
+        self.fc2 = nn.Linear(32, num_outputs)
+
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+
+    def forward(self, x, hidden=None):
+        if len(x.shape) == 1:  # a single observation -> [1, 1, num_inputs]
+            batch_size = 1
+            seq_len = 1
+            x = x.view(batch_size, seq_len, -1)
+        else:  # [batch_size, sequence_length, num_inputs]
+            batch_size = x.size()[0]
+            seq_len = x.size()[1]
+        out, hidden = self.lstm(x, hidden)
+
+        out = F.relu(self.fc1(out))
+        qvalue = self.fc2(out).view(batch_size, seq_len, self.num_outputs)
+
+        return qvalue, hidden
+
+
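+# LocalBuffer chops each episode into fixed-length sequences of `sequence_length` steps.
+# Consecutive sequences share their last `over_lapping_length` steps, so with
+# sequence_length = 10 and over_lapping_length = 5 an episode is stored as steps
+# 0-9, 5-14, 10-19, ... A sequence cut short by episode termination is zero-padded and
+# its true length is kept next to it so the learner can ignore the padding. Every
+# transition also carries the LSTM hidden states of the online and target networks at
+# acting time (the "stored state" strategy from the R2D2 paper).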
+class LocalBuffer(object):
+    def __init__(self):
+        self.local_memory = []
+        self.memory = []
+        self.over_lapping_from_prev = []
+
+    def push(self, state, next_state, action, reward, mask, rnn_state, target_rnn_state):
+        self.local_memory.append(
+            Transition(state, next_state, action, reward, mask,
+                       torch.stack(rnn_state).view(2, -1),
+                       torch.stack(target_rnn_state).view(2, -1)))
+        if (len(self.local_memory) + len(self.over_lapping_from_prev)) == sequence_length or mask == 0:
+            self.local_memory = self.over_lapping_from_prev + self.local_memory
+            length = len(self.local_memory)
+            # zero padding so that every stored sequence has the same length
+            while len(self.local_memory) < sequence_length:
+                self.local_memory.append(Transition(
+                    torch.zeros(4, dtype=torch.float),
+                    torch.zeros(4, dtype=torch.float), 0, 0, 0,
+                    torch.zeros([2, 1, cell_size]).view(2, -1),
+                    torch.zeros([2, 1, cell_size]).view(2, -1)))  # rnn state = [(hidden, cell), seq, dim]
+            self.memory.append([self.local_memory, length])  # length is the true (unpadded) length of the sequence
+            if mask == 0:
+                self.over_lapping_from_prev = []
+            else:
+                self.over_lapping_from_prev = self.local_memory[over_lapping_length:]
+            self.local_memory = []
+
+    def get(self):
+        episodes = self.memory
+        batch_state, batch_next_state, batch_action, batch_reward, batch_mask, batch_rnn_state, batch_target_rnn_state = [], [], [], [], [], [], []
+        lengths = []
+        for episode, length in episodes:
+            batch = Transition(*zip(*episode))
+
+            batch_state.append(torch.stack(list(batch.state)))
+            batch_next_state.append(torch.stack(list(batch.next_state)))
+            batch_action.append(torch.tensor(list(batch.action)))
+            batch_reward.append(torch.tensor(list(batch.reward)))
+            batch_mask.append(torch.tensor(list(batch.mask)))
+            batch_rnn_state.append(torch.stack(list(batch.rnn_state)))
+            batch_target_rnn_state.append(torch.stack(list(batch.target_rnn_state)))
+            lengths.append(length)
+
+        self.memory = []
+        return Transition(batch_state, batch_next_state, batch_action, batch_reward, batch_mask,
+                          batch_rnn_state, batch_target_rnn_state), lengths
+
+
+class Memory(object):
+    def __init__(self):
+        self.memory = deque(maxlen=buffer_limit)
+
+    def size(self):
+        return len(self.memory)
+
+    def put(self, batch, lengths):
+        for i in range(len(batch.state)):  # one entry per sequence, not per namedtuple field
+            self.memory.append([Transition(batch.state[i], batch.next_state[i], batch.action[i], batch.reward[i],
+                                           batch.mask[i], batch.rnn_state[i], batch.target_rnn_state[i]),
+                                lengths[i]])
+
+    def sample(self, batch_size):
+        indexes = np.random.choice(range(len(self.memory)), batch_size)
+        episodes = [self.memory[idx][0] for idx in indexes]
+        lengths = [self.memory[idx][1] for idx in indexes]
+
+        batch_state, batch_next_state, batch_action, batch_reward, batch_mask, batch_rnn_state, batch_target_rnn_state = [], [], [], [], [], [], []
+        for episode in episodes:
+            batch_state.append(episode.state)
+            batch_next_state.append(episode.next_state)
+            batch_action.append(episode.action)
+            batch_reward.append(episode.reward)
+            batch_mask.append(episode.mask)
+            batch_rnn_state.append(episode.rnn_state)
+            batch_target_rnn_state.append(episode.target_rnn_state)
+
+        return Transition(batch_state, batch_next_state, batch_action, batch_reward, batch_mask,
+                          batch_rnn_state, batch_target_rnn_state), indexes, lengths
+
+
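+# The learner samples a batch of padded sequences from the shared memory, feeds whole
+# sequences through the LSTM starting from the stored hidden states, and slices off the
+# first `burn_in_length` steps so they only warm up the recurrent state. The target is
+# Double-DQN style: the online network picks the argmax action for the next state and
+# the target network evaluates it. TD errors on zero-padded steps are zeroed out before
+# taking the mean squared loss.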
+def learner_process(model, target_model, exp_q, lock):
+    learner = Learner(model, target_model, exp_q, lock)
+    learner.run()
+
+
+class Learner:
+    def __init__(self, model, target_model, share_exp_mem, lock):
+        self.q = model
+        self.q_target = target_model
+        self.optimizer = optim.Adam(self.q.parameters(), lr=learning_rate)
+        self.share_exp_mem = share_exp_mem
+        self.lock = lock
+
+        self.n_epochs = 0
+
+    def run(self):
+        while True:
+            if self.share_exp_mem.size() > batch_size:
+                batch, indexes, lengths = self.share_exp_mem.sample(batch_size)
+                for _ in range(8):
+                    self.train(batch, lengths)
+                self.n_epochs += 1
+                if self.n_epochs % 5 == 0:
+                    self.q_target.load_state_dict(self.q.state_dict())
+
+    def train(self, batch, lengths):
+        def slice_burn_in(item):
+            # drop the first burn_in_length steps; they are only used to warm up the LSTM state
+            return item[:, burn_in_length:, :]
+
+        batch_size = torch.stack(batch.state).size()[0]
+        states = torch.stack(batch.state).view(batch_size, sequence_length, self.q.num_inputs)
+        next_states = torch.stack(batch.next_state).view(batch_size, sequence_length, self.q.num_inputs)
+        actions = torch.stack(batch.action).view(batch_size, sequence_length, -1).long()
+        rewards = torch.stack(batch.reward).view(batch_size, sequence_length, -1)
+        masks = torch.stack(batch.mask).view(batch_size, sequence_length, -1)
+        rnn_state = torch.stack(batch.rnn_state).view(batch_size, sequence_length, 2, -1)
+        target_rnn_state = torch.stack(batch.target_rnn_state).view(batch_size, sequence_length, 2, -1)
+
+        [h0, c0] = rnn_state[:, 0, :, :].transpose(0, 1)  # stored hidden state at the first step, for `states`
+        h0 = h0.unsqueeze(0).detach()
+        c0 = c0.unsqueeze(0).detach()
+
+        [h1, c1] = rnn_state[:, 1, :, :].transpose(0, 1)  # stored hidden state at the second step, for `next_states`
+        h1 = h1.unsqueeze(0).detach()
+        c1 = c1.unsqueeze(0).detach()
+
+        [target_h1, target_c1] = target_rnn_state[:, 1, :, :].transpose(0, 1)  # target-net hidden state at the second step
+        target_h1 = target_h1.unsqueeze(0).detach()
+        target_c1 = target_c1.unsqueeze(0).detach()
+
+        pred, _ = self.q(states, (h0, c0))
+        next_pred_online, _ = self.q(next_states, (h1, c1))
+        next_pred, _ = self.q_target(next_states, (target_h1, target_c1))
+
+        pred = slice_burn_in(pred)
+        next_pred = slice_burn_in(next_pred)
+        actions = slice_burn_in(actions)
+        rewards = slice_burn_in(rewards)
+        masks = slice_burn_in(masks)
+        next_pred_online = slice_burn_in(next_pred_online)
+
+        pred = pred.gather(2, actions)  # [batch_size, sequence_length - burn_in_length, 1]
+        _, next_pred_online_action = next_pred_online.max(2)
+        target = rewards + masks * gamma * next_pred.gather(2, next_pred_online_action.unsqueeze(2))
+        td_error = pred - target.detach()
+        for idx, length in enumerate(lengths):
+            td_error[idx, max(0, length - burn_in_length):, :] = 0  # mask out the zero-padded steps
+        loss = td_error.pow(2).mean()
+
+        self.optimizer.zero_grad()
+        loss.backward()
+        self.optimizer.step()
+
+
+def actor_process(actor_id, n_actors, model, target_model, exp_q, lock):
+    actor = Actor(actor_id, n_actors, model, target_model, exp_q, lock)
+    actor.run()
+
+
+class Actor:
+    def __init__(self, actor_id, n_actors, model, target_model, share_exp_mem, lock):
+        self.env = gym.make('CartPole-v1')
+        self.actor_id = actor_id
+        # per-actor exploration rate; the Ape-X paper uses
+        # epsilon_i = 0.4 ** (1 + actor_id * 7 / (n_actors - 1))
+        self.epsilon = 0.1 + (actor_id / 7) / n_actors
+
+        self.local_buffer = LocalBuffer()
+        self.q = model
+        self.q_target = target_model
+        self.overlap_length = 5
+
+        self.share_exp_mem = share_exp_mem
+        self.lock = lock
+
+    def run(self):
+        for e in range(30000):
+            done = False
+            score = 0
+            state = self.env.reset()
+            state = torch.tensor(state, dtype=torch.float)
+            target_hidden = hidden = (torch.zeros(1, 1, cell_size), torch.zeros(1, 1, cell_size))
+
+            while not done:
+                epsilon = max(0.01, self.epsilon - 0.01 * (e / 200))  # linearly anneal epsilon down to 1%
+                with torch.no_grad():
+                    q_value, new_hidden = self.q(state, hidden)
+                    _, target_new_hidden = self.q_target(state, target_hidden)
+                if random.random() < epsilon:
+                    action = random.randint(0, 1)
+                else:
+                    action = q_value.argmax().item()
+
+                next_state, reward, done, _ = self.env.step(action)
+                next_state = torch.tensor(next_state, dtype=torch.float)
+
+                mask = 0 if done else 1
+                # store the transition together with the hidden states that were used to act on `state`
+                self.local_buffer.push(state, next_state, action, reward, mask, hidden, target_hidden)
+                hidden = new_hidden
+                target_hidden = target_new_hidden
+
+                if len(self.local_buffer.memory) == batch_size:
+                    batch, lengths = self.local_buffer.get()
+
+                    self.lock.acquire()
+                    self.share_exp_mem.put(batch, lengths)
+                    self.lock.release()
+
+                score += reward
+                state = next_state
+
+            if e % 20 == 0:
+                print('episodes:', e, 'actor_id:', self.actor_id, 'reward:', score)
+
+
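+# Every process shares the same Qnet instances through shared memory, so the actors
+# always act with the learner's latest online parameters. Only the replay memory is
+# exchanged explicitly: it lives in a Manager server process and is accessed through a
+# proxy, with a lock guarding the actors' put() calls.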
+def main():
+    model = Qnet()
+    model.share_memory()
+
+    target_model = Qnet()
+    target_model.load_state_dict(model.state_dict())
+    target_model.share_memory()
+
+    # register the shared replay memory type before the manager is started,
+    # then create it as a proxy that every process can call into
+    SyncManager.register('Memory', Memory)
+    manager = mp.Manager()
+    experience_memory = manager.Memory()
+
+    lock = mp.Lock()
+
+    # learner process
+    processes = [mp.Process(
+        target=learner_process,
+        args=(model, target_model, experience_memory, lock))]
+
+    # actor processes
+    n_actors = 2
+    for actor_id in range(n_actors):
+        processes.append(mp.Process(
+            target=actor_process,
+            args=(actor_id, n_actors, model, target_model, experience_memory, lock)))
+
+    for p in processes:
+        p.start()
+
+    for p in processes:
+        p.join()
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
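For a quick shape check of the pieces above, the Qnet / LocalBuffer pair can be exercised outside the multiprocessing setup. A minimal sketch, assuming r2d2.py is on the import path and the old gym reset/step API (4-tuple returns) used in the patch:

import gym
import torch

from r2d2 import LocalBuffer, Qnet, cell_size, sequence_length

env = gym.make('CartPole-v1')
q = Qnet()
buf = LocalBuffer()

state = torch.tensor(env.reset(), dtype=torch.float)
hidden = (torch.zeros(1, 1, cell_size), torch.zeros(1, 1, cell_size))

for _ in range(sequence_length):
    with torch.no_grad():
        q_value, new_hidden = q(state, hidden)  # q_value: [1, 1, 2]
    action = q_value.argmax().item()
    next_state, reward, done, _ = env.step(action)
    next_state = torch.tensor(next_state, dtype=torch.float)
    # reuse the same hidden state for both slots, just for this shape check
    buf.push(state, next_state, action, reward, 0 if done else 1, hidden, hidden)
    hidden = new_hidden
    state = next_state
    if done:
        break

batch, lengths = buf.get()
print(len(batch.state), lengths)   # one padded sequence and its true length
print(batch.state[0].shape)        # torch.Size([10, 4])
print(batch.rnn_state[0].shape)    # torch.Size([10, 2, 16])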