qlearning.py
import numpy as np
br = 100  # big reward (entering the goal state)
sr = 0    # small reward (any other allowed transition)
# init reward table: R[state, action], -1 marks an impossible transition
R = np.array([[-1, -1, -1, -1, sr, -1],
              [-1, -1, -1, sr, -1, br],
              [-1, -1, -1, sr, -1, -1],
              [-1, sr, sr, -1, sr, -1],
              [sr, -1, -1, sr, -1, br],
              [-1, sr, -1, -1, sr, br]])
# alpha = 0.9  # learning rate; unused in the simplified update below (see the sketch after the loop)
gamma = 0.8  # discount factor
# init Q table with the same shape as R
Q = np.zeros(R.shape)
# pick a random starting state (randint's upper bound is exclusive, so use 6 to cover all 6 states)
state = np.random.randint(0, 6)
for _ in range(1000):
    # list all possible actions in this state (entries with non-negative reward)
    possible_actions = [a for a, x in enumerate(R[state, :]) if x >= 0]
    # choose one action at random from the list of possible actions
    action = np.random.choice(possible_actions)
    # the chosen action is our next state from now on
    next_state = action
    # update the Q matrix (simplified Q-learning update for a deterministic environment)
    Q[state, action] = R[state, action] + gamma * np.max(Q[next_state, :])
    # if R[state, action] == 100: break  # optionally end an episode at the goal
    # set the next state as the current state
    state = next_state
# print the Q matrix scaled to percent (and floored to integers)
print((Q / np.max(Q) * 100).astype(int))
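# The update above is the simplified Q-learning rule for a deterministic
# environment, equivalent to using a learning rate of 1. As a minimal sketch
# (an illustration, not part of the original script), this is where the
# commented-out alpha = 0.9 would enter: the standard temporal-difference
# update blends the old Q value with the new estimate. Only the update line
# inside the loop would change; a single demonstration step is shown here.
alpha = 0.9  # learning rate: how strongly new information overwrites the old estimate
Q[state, action] = Q[state, action] + alpha * (
    R[state, action] + gamma * np.max(Q[next_state, :]) - Q[state, action])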
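# Once Q has converged, the greedy policy can be read off row by row: from each
# state, repeatedly take the action with the highest Q value until the goal
# state (5 in the reward table above) is reached. greedy_path is a hypothetical
# helper added for illustration, not part of the original script.
def greedy_path(Q, start, goal=5, max_steps=10):
    # follow the highest-Q action from each state, guarding against loops
    path = [start]
    state = start
    while state != goal and len(path) <= max_steps:
        state = int(np.argmax(Q[state, :]))
        path.append(state)
    return path

for s in range(6):
    print(s, "->", greedy_path(Q, s))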