train.py
import gym
import time
import numpy as np
import tensorflow as tf
tf.enable_eager_execution()
import ddpg
import os

# Toggle between training (True) and rendering the previously saved policy (False).
TRAIN_MODE = False
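# Note: the local `ddpg` module is expected to provide Actor, Critic, TargetActor,
# TargetCritic, ReplayBuffer and OrnsteinUhlenbeckActionNoise with the methods used
# below (load/save, hard_copy, update, train_step, actor_gradient, sample_batch);
# their exact signatures are assumed from how they are called in this script.
# With TRAIN_MODE = False the script only renders the saved policy; set it to True
# to collect experience, add exploration noise and update the networks.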
# Create the checkpoint directory if it does not already exist.
try:
    os.mkdir('./saved')
except OSError:
    print("Creation of the directory failed")
else:
    print("Successfully created the directory")

print(tf.__version__)
env = gym.make('Pendulum-v0')

# Online and target networks for DDPG.
critic = ddpg.Critic()
actor = ddpg.Actor()
target_critic = ddpg.TargetCritic()
target_actor = ddpg.TargetActor()
# Restore previously saved weights if available.
try:
    critic.load()
    actor.load()
except Exception as e:
    print(repr(e))

# Initialise the target networks with the online networks' weights.
target_actor.hard_copy(actor.model.trainable_variables)
target_critic.hard_copy(critic.model.trainable_variables)
# Ornstein-Uhlenbeck noise for exploration and the experience replay buffer.
ou = ddpg.OrnsteinUhlenbeckActionNoise(mu=np.zeros(1,))
buffer = ddpg.ReplayBuffer(100000)

# Per-episode statistics.
ep_ave_max_q_value = 0
total_reward = 0
def create_tensorboard():
    global_step = tf.train.get_or_create_global_step()
    logdir = "./logs/"
    writer = tf.contrib.summary.create_file_writer(logdir)
    writer.set_as_default()
    return global_step, writer


global_step, writer = create_tensorboard()
def train(action, reward, state, state2, done):
    global ep_ave_max_q_value

    buffer.add(state, action, reward, done, state2)

    batch_size = 64
    if buffer.size() > batch_size:
        s_batch, a_batch, r_batch, t_batch, s2_batch = buffer.sample_batch(batch_size)

        # Bellman targets: y_i = r_i for terminal transitions, otherwise
        # y_i = r_i + gamma * Q'(s_{i+1}, mu'(s_{i+1})) with gamma = 0.99.
        target_action2 = target_actor.model.predict(s2_batch)
        predicted_q_value = target_critic.model.predict([s2_batch, target_action2])

        yi = []
        for i in range(batch_size):
            if t_batch[i]:
                yi.append(r_batch[i])
            else:
                yi.append(r_batch[i] + 0.99 * predicted_q_value[i])

        # Update the critic towards the Bellman targets.
        predictions = critic.train_step(s_batch, a_batch, yi)
        ep_ave_max_q_value += np.amax(predictions)

        # Update the actor along the critic's action gradient (deterministic policy gradient).
        grad = critic.actor_gradient(s_batch, actor)
        actor.train_step(s_batch, grad)

        # Soft-update the target networks towards the online networks.
        target_actor.update(actor.model.trainable_variables)
        target_critic.update(critic.model.trainable_variables)
for episode in range(10000):
    global_step.assign_add(1)
    obs = env.reset()
    done = False
    j = 0
    ep_ave_max_q_value = 0
    total_reward = 0

    while not done:
        if not TRAIN_MODE:
            env.render()

        obs = obs.reshape((1, 3))
        noise = ou()
        action = actor.model.predict(obs)
        if TRAIN_MODE:
            # Add exploration noise only while training.
            action = action + noise

        obs2, reward, done, info = env.step(action)
        total_reward += reward

        if TRAIN_MODE:
            train(action, reward, obs, obs2.reshape((1, 3)), done)

        obs = obs2
        j += 1

    # Log per-episode statistics to TensorBoard.
    with writer.as_default(), tf.contrib.summary.always_record_summaries():
        tf.contrib.summary.scalar("average_max_q", ep_ave_max_q_value / float(j))
        tf.contrib.summary.scalar("reward", total_reward)

    if TRAIN_MODE:
        critic.save()
        actor.save()

    print('average_max_q: ', ep_ave_max_q_value / float(j), 'reward: ', total_reward, 'episode:', episode)

env.close()
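# Rough usage sketch (assumptions: gym with Pendulum-v0 and a TensorFlow 1.x install
# that still ships tf.contrib, since the script relies on eager execution and
# tf.contrib.summary):
#   python train.py                      # renders the saved policy (TRAIN_MODE = False)
#   set TRAIN_MODE = True and rerun      # trains; weights saved via critic.save()/actor.save(),
#                                        # presumably under ./saved
#   tensorboard --logdir ./logs          # inspect average_max_q and reward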