-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathreplay_buffer.py
140 lines (118 loc) · 6.5 KB
/
replay_buffer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import numpy as np
import torch
import utils
class ReplayBuffer(object):
    """Fixed-capacity ring buffer that stores environment transitions.

    Transitions live in pre-allocated NumPy arrays. ``idx`` is the next slot
    to be written; once it wraps around to the start, ``full`` becomes True
    and the oldest entries are overwritten.
    """

    def __init__(self, obs_shape, action_shape, capacity, device, window=1,
                 store_image=False, image_size=300):
        """Allocate the storage arrays.

        Args:
            obs_shape: shape of one observation (tuple of ints).
            action_shape: shape of one action (tuple of ints).
            capacity: maximum number of transitions kept.
            device: passed to ``torch.as_tensor`` when sampling.
            window: number of transitions written by each ``add_batch`` call.
            store_image: when True, also allocate a uint8 image array of
                shape ``(capacity, image_size, image_size, 3)``.
            image_size: height and width of the stored images.
        """
        self.capacity = capacity
        self.device = device

        # the proprioceptive obs is stored as float32, pixels obs as uint8
        obs_dtype = np.float32 if len(obs_shape) == 1 else np.uint8

        self.obses = np.empty((capacity, *obs_shape), dtype=obs_dtype)
        self.next_obses = np.empty((capacity, *obs_shape), dtype=obs_dtype)
        self.actions = np.empty((capacity, *action_shape), dtype=np.float32)
        self.rewards = np.empty((capacity, 1), dtype=np.float32)
        self.not_dones = np.empty((capacity, 1), dtype=np.float32)
        self.not_dones_no_max = np.empty((capacity, 1), dtype=np.float32)
        self.window = window

        self.store_image = store_image
        if self.store_image:
            self.images = np.empty(
                (capacity, image_size, image_size, 3), dtype=np.uint8)

        self.idx = 0
        # Not read anywhere in this class; presumably a cursor used by
        # external save/checkpoint code -- TODO confirm against callers.
        self.last_save = 0
        self.full = False

    def __len__(self):
        """Return the number of valid transitions currently stored."""
        return self.capacity if self.full else self.idx

    def add(self, obs, action, reward, next_obs, done, done_no_max,
            image=None):
        """Write one transition at the cursor and advance it (with wrap).

        ``done`` and ``done_no_max`` are stored negated (``not done``), so
        the buffers hold 1.0 for "episode continues" and 0.0 otherwise.
        ``image`` is stored only when it is given and the buffer was created
        with ``store_image=True``.
        """
        np.copyto(self.obses[self.idx], obs)
        np.copyto(self.actions[self.idx], action)
        np.copyto(self.rewards[self.idx], reward)
        np.copyto(self.next_obses[self.idx], next_obs)
        np.copyto(self.not_dones[self.idx], not done)
        np.copyto(self.not_dones_no_max[self.idx], not done_no_max)
        if image is not None and self.store_image:
            np.copyto(self.images[self.idx], image)

        self.idx = (self.idx + 1) % self.capacity
        # Landing back on slot 0 means every slot has been written once.
        self.full = self.full or self.idx == 0

    def _copy_batch_rows(self, dst, src, obs, action, reward, next_obs,
                         done, done_no_max):
        """Copy rows ``src`` of the batch arrays into buffer rows ``dst``.

        ``dst`` and ``src`` are slices. Done flags are stored as the mask
        ``flag <= 0`` (i.e. "not done").
        """
        np.copyto(self.obses[dst], obs[src])
        np.copyto(self.actions[dst], action[src])
        np.copyto(self.rewards[dst], reward[src])
        np.copyto(self.next_obses[dst], next_obs[src])
        np.copyto(self.not_dones[dst], done[src] <= 0)
        np.copyto(self.not_dones_no_max[dst], done_no_max[src] <= 0)

    def add_batch(self, obs, action, reward, next_obs, done, done_no_max):
        """Write ``self.window`` transitions at once, wrapping if needed.

        The batch length is taken from ``self.window`` rather than from the
        arguments; each argument is assumed to hold that many rows (not
        checked here). Images are not stored by this method.
        """
        next_index = self.idx + self.window
        if next_index >= self.capacity:
            self.full = True
            # Number of rows that still fit before the end of the arrays.
            maximum_index = self.capacity - self.idx
            self._copy_batch_rows(
                slice(self.idx, self.capacity), slice(None, maximum_index),
                obs, action, reward, next_obs, done, done_no_max)
            # Whatever did not fit goes to the front of the arrays.
            remain = self.window - maximum_index
            if remain > 0:
                self._copy_batch_rows(
                    slice(0, remain), slice(maximum_index, None),
                    obs, action, reward, next_obs, done, done_no_max)
            self.idx = remain
        else:
            self._copy_batch_rows(
                slice(self.idx, next_index), slice(None),
                obs, action, reward, next_obs, done, done_no_max)
            self.idx = next_index

    def relabel_with_predictor(self, predictor):
        """Overwrite every stored reward with ``predictor.r_hat_batch``.

        Stored transitions are processed in chunks (200 rows, or 32 when
        images are stored). Without images the predictor input is the
        observation and action concatenated on the last axis; with images it
        is the image chunk transposed from (N, H, W, C) to (N, C, H, W) and
        scaled to float32 in [0, 1].

        All ``len(self)`` valid rows are relabeled. Using ``self.idx`` as the
        bound would skip rows ``idx..capacity-1`` once the buffer has
        wrapped, leaving stale rewards that ``sample`` can still return.

        Args:
            predictor: object exposing ``r_hat_batch(inputs)``; its result is
                assigned to a ``(n, 1)`` float32 slice of ``self.rewards``,
                so it is presumably array-like with a compatible shape --
                verify against the predictor implementation.
        """
        batch_size = 32 if self.store_image else 200
        num_stored = len(self)

        for start in range(0, num_stored, batch_size):
            stop = min(start + batch_size, num_stored)
            if self.store_image:
                inputs = np.transpose(self.images[start:stop], (0, 3, 1, 2))
                inputs = inputs.astype(np.float32) / 255.0
            else:
                inputs = np.concatenate(
                    [self.obses[start:stop], self.actions[start:stop]],
                    axis=-1)

            pred_reward = predictor.r_hat_batch(inputs)
            self.rewards[start:stop] = pred_reward
        torch.cuda.empty_cache()

    def _gather(self, idxs):
        """Return the transitions at ``idxs`` as tensors on ``self.device``.

        Observations and next observations are cast to float; the other
        arrays keep their stored float32 dtype.
        """
        obses = torch.as_tensor(self.obses[idxs], device=self.device).float()
        actions = torch.as_tensor(self.actions[idxs], device=self.device)
        rewards = torch.as_tensor(self.rewards[idxs], device=self.device)
        next_obses = torch.as_tensor(self.next_obses[idxs],
                                     device=self.device).float()
        not_dones = torch.as_tensor(self.not_dones[idxs], device=self.device)
        not_dones_no_max = torch.as_tensor(self.not_dones_no_max[idxs],
                                           device=self.device)
        return obses, actions, rewards, next_obses, not_dones, not_dones_no_max

    def sample(self, batch_size):
        """Sample ``batch_size`` stored transitions uniformly with replacement.

        Returns:
            Tuple ``(obses, actions, rewards, next_obses, not_dones,
            not_dones_no_max)`` of tensors on ``self.device``.
        """
        idxs = np.random.randint(0, len(self), size=batch_size)
        return self._gather(idxs)

    def sample_state_ent(self, batch_size):
        """Sample like ``sample`` and also return all stored observations.

        Returns:
            Tuple ``(obses, full_obs, actions, rewards, next_obses,
            not_dones, not_dones_no_max)``; ``full_obs`` holds every valid
            observation in the buffer as a tensor on ``self.device`` (in the
            stored dtype, without the float cast applied to ``obses``).
        """
        idxs = np.random.randint(0, len(self), size=batch_size)
        (obses, actions, rewards, next_obses,
         not_dones, not_dones_no_max) = self._gather(idxs)

        full_obs = self.obses if self.full else self.obses[: self.idx]
        full_obs = torch.as_tensor(full_obs, device=self.device)

        return (obses, full_obs, actions, rewards, next_obses,
                not_dones, not_dones_no_max)