forked from valohai/qlearning-simple
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
54 lines (46 loc) · 2.12 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import random
import json
import argparse
import time
from drunkard import Drunkard
from accountant import Accountant
from gambler import Gambler
from deep_gambler import DeepGambler
from dungeon_simulator import DungeonSimulator
def main():
# parse arguments
parser = argparse.ArgumentParser()
parser.add_argument('--agent', type=str, default='GAMBLER', help='Which agent to use')
parser.add_argument('--learning_rate', type=float, default=0.1, help='How quickly the algorithm tries to learn')
parser.add_argument('--discount', type=float, default=0.95, help='Discount for estimated future action')
parser.add_argument('--iterations', type=int, default=2000, help='Iteration count')
FLAGS, unparsed = parser.parse_known_args()
# select agent
if FLAGS.agent == 'GAMBLER':
agent = Gambler(learning_rate=FLAGS.learning_rate, discount=FLAGS.discount, iterations=FLAGS.iterations)
elif FLAGS.agent == 'ACCOUNTANT':
agent = Accountant()
elif FLAGS.agent == 'DEEPGAMBLER':
agent = DeepGambler(learning_rate=FLAGS.learning_rate, discount=FLAGS.discount, iterations=FLAGS.iterations)
else:
agent = Drunkard()
# setup simulation
dungeon = DungeonSimulator()
dungeon.reset()
total_reward = 0 # Score keeping
last_total = 0
# main loop
for step in range(FLAGS.iterations):
old_state = dungeon.state # Store current state
action = agent.get_next_action(old_state) # Query agent for the next action
new_state, reward = dungeon.take_action(action) # Take action, get new state and reward
agent.update(old_state, new_state, action, reward) # Let the agent update internals
total_reward += reward # Keep score
if step % 250 == 0: # Print out metadata every 250th iteration
performance = (total_reward - last_total) / 250.0
print(json.dumps({'step': step, 'performance': performance, 'total_reward': total_reward}))
last_total = total_reward
time.sleep(0.0001) # Avoid spamming stdout too fast!
# print("Final Q-table", agent.q_table)
if __name__ == "__main__":
main()