import numpy as np
import sys
from gym.envs.toy_text import discrete

UP = 0
RIGHT = 1
DOWN = 2
LEFT = 3

class GridworldEnv(discrete.DiscreteEnv):
    """
    Grid World environment from Sutton's Reinforcement Learning book, chapter 4.

    You are an agent on an MxN grid and your goal is to reach the terminal
    state at the top-left or the bottom-right corner.

    For example, a 4x4 grid looks as follows:

        T o o o
        o x o o
        o o o o
        o o o T

    x is your position and T are the two terminal states.

    You can take actions in each direction (UP=0, RIGHT=1, DOWN=2, LEFT=3).
    Actions that would take you off the edge leave you in your current state.
    You receive a reward of -1 at each step until you reach a terminal state.
    """

    metadata = {'render.modes': ['human', 'ansi']}

    def __init__(self, shape=[4, 4]):
        if not isinstance(shape, (list, tuple)) or not len(shape) == 2:
            raise ValueError('shape argument must be a list/tuple of length 2')

        self.shape = shape

        nS = np.prod(shape)
        nA = 4

        MAX_Y = shape[0]
        MAX_X = shape[1]

        P = {}
        grid = np.arange(nS).reshape(shape)
        it = np.nditer(grid, flags=['multi_index'])

        while not it.finished:
            # Build the transition probability dictionary P:
            #     P = {s: {a: [(prob, next_state, reward, done)]}}
            # s:          current state ID (0 to nS - 1)
            # a:          action ID, 0-3 for UP, RIGHT, DOWN, LEFT
            # prob:       probability of this transition when taking action a in state s
            # next_state: state ID reached after executing action a
            # reward:     reward for the step (0.0 from a terminal state, otherwise -1.0)
            # done:       whether the next state is terminal
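            #
            # For illustration (assuming the default 4x4 shape): P[1][LEFT]
            # ends up as [(1.0, 0, -1.0, True)], i.e. stepping left from state 1
            # reaches terminal state 0 and collects a reward of -1.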
            s = it.iterindex
            y, x = it.multi_index

            P[s] = {a: [] for a in range(nA)}

            # The start (top-left) and end (bottom-right) states are terminal
            is_done = lambda s: s == 0 or s == (nS - 1)
            # is_done = lambda s: s == 40  # or s == 50
            reward = 0.0 if is_done(s) else -1.0

            # We're stuck in a terminal state
            if is_done(s):
                P[s][UP] = [(1.0, s, reward, True)]
                P[s][RIGHT] = [(1.0, s, reward, True)]
                P[s][DOWN] = [(1.0, s, reward, True)]
                P[s][LEFT] = [(1.0, s, reward, True)]
            # Not a terminal state
            else:
                # The extra checks on states 5/6 (UP) and 1/2 (DOWN) act as a
                # wall between rows 0 and 1 in the default 4x4 layout
                ns_up = s if (y == 0 or s == 5 or s == 6) else s - MAX_X
                ns_right = s if x == (MAX_X - 1) else s + 1
                ns_down = s if (y == (MAX_Y - 1) or s == 1 or s == 2) else s + MAX_X
                ns_left = s if x == 0 else s - 1
                P[s][UP] = [(1.0, ns_up, reward, is_done(ns_up))]
                P[s][RIGHT] = [(1.0, ns_right, reward, is_done(ns_right))]
                P[s][DOWN] = [(1.0, ns_down, reward, is_done(ns_down))]
                P[s][LEFT] = [(1.0, ns_left, reward, is_done(ns_left))]

            it.iternext()

        # Initial state distribution is uniform
        isd = np.ones(nS) / nS

        # We expose the model of the environment for educational purposes.
        # This should not be used in any model-free learning algorithm.
        self.P = P

        super(GridworldEnv, self).__init__(nS, nA, P, isd)

    # Note: re-enabling this render helper also requires `from io import StringIO`.
    # def _render(self, mode='human', close=False):
    #     if close:
    #         return
    #
    #     outfile = StringIO() if mode == 'ansi' else sys.stdout
    #
    #     grid = np.arange(self.nS).reshape(self.shape)
    #     it = np.nditer(grid, flags=['multi_index'])
    #     while not it.finished:
    #         s = it.iterindex
    #         y, x = it.multi_index
    #
    #         if self.s == s:
    #             output = " x "
    #         elif s == 0 or s == self.nS - 1:
    #             output = " T "
    #         else:
    #             output = " o "
    #
    #         if x == 0:
    #             output = output.lstrip()
    #         if x == self.shape[1] - 1:
    #             output = output.rstrip()
    #
    #         outfile.write(output)
    #
    #         if x == self.shape[1] - 1:
    #             outfile.write("\n")
    #
    #         it.iternext()
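

# Minimal usage sketch (not part of the class above): construct the environment
# and inspect the transition model it exposes. This assumes the old gym
# DiscreteEnv API that this module is written against, which stores nS, nA,
# and P as attributes on the environment.
if __name__ == "__main__":
    env = GridworldEnv()
    print("Number of states:", env.nS)
    print("Number of actions:", env.nA)
    # Each P[s][a] entry is a list of (probability, next_state, reward, done)
    print("Transitions from state 1:")
    for action, transitions in env.P[1].items():
        print(action, transitions)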