import numpy as np
import sys
from gym.envs.toy_text import discrete

UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3

class GridworldEnv(discrete.DiscreteEnv):
    """
    Grid World environment from Sutton's Reinforcement Learning book, chapter 4.

    You are an agent on an MxN grid and your goal is to reach the terminal
    state at the top-left or the bottom-right corner.

    For example, a 4x4 grid looks as follows:

        T o o o
        o x o o
        o o o o
        o o o T

    x is your position and T are the two terminal states.

    You can take actions in each direction (UP=0, RIGHT=1, DOWN=2, LEFT=3).
    Actions that would take you off the edge leave you in your current state.
    You receive a reward of -1 at each step until you reach a terminal state.
    """

    metadata = {'render.modes': ['human', 'ansi']}

    def __init__(self, shape=[4, 4]):
        if not isinstance(shape, (list, tuple)) or not len(shape) == 2:
            raise ValueError('shape argument must be a list/tuple of length 2')

        self.shape = shape

        nS = np.prod(shape)
        nA = 4

        MAX_Y = shape[0]
        MAX_X = shape[1]

        P = {}
        grid = np.arange(nS).reshape(shape)
        it = np.nditer(grid, flags=['multi_index'])

        while not it.finished:
            # Build the transition probability dictionary P:
            #     P = {s: {a: [(prob, next_state, reward, done)]}}
            # s:          current state ID (0 to nS - 1)
            # a:          action ID, 0-3 for UP, RIGHT, DOWN, LEFT
            # prob:       probability of this transition when taking action a in state s
            # next_state: state ID reached after executing action a
            # reward:     reward for the step (0.0 from a terminal state, otherwise -1.0)
            # done:       whether the next state is terminal
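            #
            # For illustration (assuming the default 4x4 shape): P[1][LEFT]
            # ends up as [(1.0, 0, -1.0, True)], i.e. stepping left from state 1
            # reaches terminal state 0 and collects a reward of -1.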
            s = it.iterindex
            y, x = it.multi_index

            P[s] = {a: [] for a in range(nA)}

            # The start (top-left) and end (bottom-right) states are terminal
            is_done = lambda s: s == 0 or s == (nS - 1)
            # is_done = lambda s: s == 40  # or s == 50
            reward = 0.0 if is_done(s) else -1.0

            # We're stuck in a terminal state
            if is_done(s):
                P[s][UP] = [(1.0, s, reward, True)]
                P[s][RIGHT] = [(1.0, s, reward, True)]
                P[s][DOWN] = [(1.0, s, reward, True)]
                P[s][LEFT] = [(1.0, s, reward, True)]
            # Not a terminal state
            else:
                # The extra checks on states 5/6 (UP) and 1/2 (DOWN) act as a
                # wall between rows 0 and 1 in the default 4x4 layout
                ns_up = s if (y == 0 or s == 5 or s == 6) else s - MAX_X
                ns_right = s if x == (MAX_X - 1) else s + 1
                ns_down = s if (y == (MAX_Y - 1) or s == 1 or s == 2) else s + MAX_X
                ns_left = s if x == 0 else s - 1
                P[s][UP] = [(1.0, ns_up, reward, is_done(ns_up))]
                P[s][RIGHT] = [(1.0, ns_right, reward, is_done(ns_right))]
                P[s][DOWN] = [(1.0, ns_down, reward, is_done(ns_down))]
                P[s][LEFT] = [(1.0, ns_left, reward, is_done(ns_left))]

            it.iternext()

        # Initial state distribution is uniform
        isd = np.ones(nS) / nS

        # We expose the model of the environment for educational purposes.
        # This should not be used in any model-free learning algorithm.
        self.P = P

        super(GridworldEnv, self).__init__(nS, nA, P, isd)

    # Note: re-enabling this render helper also requires `from io import StringIO`.
    # def _render(self, mode='human', close=False):
    #     if close:
    #         return
    #
    #     outfile = StringIO() if mode == 'ansi' else sys.stdout
    #
    #     grid = np.arange(self.nS).reshape(self.shape)
    #     it = np.nditer(grid, flags=['multi_index'])
    #     while not it.finished:
    #         s = it.iterindex
    #         y, x = it.multi_index
    #
    #         if self.s == s:
    #             output = " x "
    #         elif s == 0 or s == self.nS - 1:
    #             output = " T "
    #         else:
    #             output = " o "
    #
    #         if x == 0:
    #             output = output.lstrip()
    #         if x == self.shape[1] - 1:
    #             output = output.rstrip()
    #
    #         outfile.write(output)
    #
    #         if x == self.shape[1] - 1:
    #             outfile.write("\n")
    #
    #         it.iternext()
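

# Minimal usage sketch (not part of the class above): construct the environment
# and inspect the transition model it exposes. This assumes the old gym
# DiscreteEnv API that this module is written against, which stores nS, nA,
# and P as attributes on the environment.
if __name__ == "__main__":
    env = GridworldEnv()
    print("Number of states:", env.nS)
    print("Number of actions:", env.nA)
    # Each P[s][a] entry is a list of (probability, next_state, reward, done)
    print("Transitions from state 1:")
    for action, transitions in env.P[1].items():
        print(action, transitions)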