R/environment_gridworld.R In reinforcelearn: Reinforcement Learning

Defines functions: visualizeGridworld, makeRewardMatrix, go, go.left, go.right, go.up, go.down, go.leftup, go.leftdown, go.rightup, go.rightdown, applyWind, getIntoBounds

#' Gridworld
#'
#' Creates gridworld environments.
#'
#' A gridworld is an episodic navigation task, the goal is to get from start state to goal state.
#'
#' Possible actions include going left, right, up or down. If `diagonal.moves = TRUE` diagonal
#' moves are also possible, leftup, leftdown, rightup and rightdown.
#'
#' When stepping into a cliff state you get a reward of `reward.cliff`,
#' usually a high negative reward and transition to a state specified by `cliff.transition.states`.
#'
#' In each column a deterministic wind specified via `wind` pushes you up a specific number of
#' grid cells (for the next action).
#'
#' A stochastic gridworld is a gridworld where with probability `stochasticity` the next state
#' is chosen at random from all neighbor states independent of the actual action.
#'
#' If an action would take you off the grid, the new state is the nearest cell inside the grid.
#' For each step you get a reward of `reward.step`, until you reach a goal state,
#' then the episode is done.
#'
#' States are enumerated row-wise and numeration starts with 0.
#' Here is an example 4x4 grid:
#' \tabular{rrrr}{
#'  0 \tab 1 \tab 2 \tab 3 \cr
#'  4 \tab 5 \tab 6 \tab 7 \cr
#'  8 \tab 9 \tab 10 \tab 11 \cr
#'  12 \tab 13 \tab 14 \tab 15 \cr
#' }
#' So a board position could look like this (G: goal state, x: current state, C: cliff state):
#' \tabular{rrrr}{
#'  G \tab o \tab o \tab o \cr
#'  o \tab o \tab o \tab o \cr
#'  o \tab x \tab o \tab o \cr
#'  o \tab o \tab o \tab C \cr
#' }
#'
#' @section Usage:
#' `makeEnvironment("gridworld", shape = NULL, goal.states = NULL, cliff.states = NULL,
#'   reward.step = -1, reward.cliff = -100, diagonal.moves = FALSE, wind = rep(0, shape[2]),
#'   cliff.transition.states = NULL, cliff.transition.done = FALSE, stochasticity = 0, ...)`
#'
#' @param shape \[`integer(2)`] \cr
#'   Shape of the gridworld (number of rows x number of columns).
#' @param goal.states \[`integer`] \cr
#'   Goal states in the gridworld.
#' @param cliff.states \[`integer`] \cr
#'   Cliff states in the gridworld.
#' @param reward.step \[`integer(1)`] \cr
#'   Reward for taking a step.
#' @param cliff.transition.states \[`integer`] \cr
#'   States to which the environment transitions if stepping into the cliff.
#'   If it is a vector, all states will have equal probability.
#'   Only used when `cliff.transition.done == FALSE`,
#'   else specify the `initial.state` argument.
#' @param reward.cliff \[`integer(1)`] \cr
#'   Reward for taking a step in the cliff state.
#' @param diagonal.moves \[`logical(1)`] \cr
#'   Should diagonal moves be allowed?
#' @param wind \[`integer`] \cr
#'   Strength of the upward wind in each cell.
#' @param cliff.transition.done \[`logical(1)`] \cr
#'   Should the episode end after stepping into the cliff?
#' @param stochasticity \[`numeric(1)`] \cr
#'   Probability of random transition to any of the neighboring states when taking any action.
#' @param ... \[`any`] \cr Arguments passed on to [makeEnvironment].
#'
#' @name Gridworld
#' @md
#' @inheritSection Environment Methods
#' @export
#'
#' @examples
#' # Gridworld Environment (Sutton & Barto Example 4.1)
#' env1 = makeEnvironment("gridworld", shape = c(4L, 4L), goal.states = 0L,
#'   initial.state = 15L)
#' env1$reset()
#' env1$visualize()
#' env1$step(0L)
#' env1$visualize()
#'
#' # Windy Gridworld (Sutton & Barto Example 6.5)
#' env2 = makeEnvironment("gridworld", shape = c(7, 10), goal.states = 37L,
#'   reward.step = -1, wind = c(0, 0, 0, 1, 1, 1, 2, 2, 1, 0),
#'   initial.state = 30L)
#'
#' # Cliff Walking (Sutton & Barto Example 6.6)
#' env3 = makeEnvironment("gridworld", shape = c(4, 12), goal.states = 47L,
#'   cliff.states = 37:46, reward.step = -1, reward.cliff = -100,
#'   cliff.transition.states = 36L, initial.state = 36L)
NULL

# Gridworld environment expressed as a Markov Decision Process: the
# constructor builds the full state-transition array and reward matrix from
# the grid layout and delegates to MdpEnvironment. See the roxygen block
# above for the user-facing documentation of all arguments.
Gridworld = R6::R6Class("Gridworld",
  inherit = MdpEnvironment,

  public = list(

    # Construct the gridworld MDP.
    # States are passed in 0-based (as documented) and converted to the
    # 1-based indexing used internally.
    initialize = function(shape = NULL, goal.states = NULL, cliff.states = NULL,
      reward.step = -1, reward.cliff = -100, diagonal.moves = FALSE, wind = rep(0, shape[2]),
      cliff.transition.states = NULL, cliff.transition.done = FALSE, stochasticity = 0, ...) {

      checkmate::assertIntegerish(shape, len = 2)
      if (prod(shape) <= 1) {
        stop("A gridworld with only one state is not allowed!")
      }
      checkmate::assertIntegerish(goal.states)
      # Convert 0-based user-facing state ids to 1-based internal ids.
      goal.states = goal.states + 1
      checkmate::assertIntegerish(cliff.states, null.ok = TRUE)
      if (!is.null(cliff.states)) {
        cliff.states = cliff.states + 1
      }
      checkmate::assertIntegerish(cliff.transition.states, null.ok = TRUE)
      if (!is.null(cliff.transition.states)) {
        cliff.transition.states = cliff.transition.states + 1
      }
      # fix: use scalar `||` consistently (original mixed `||` with `|`);
      # note any(NULL > x) is FALSE, so NULL cliff states are handled.
      if (any(goal.states > prod(shape)) || any(cliff.states > prod(shape)) ||
          any(cliff.transition.states > prod(shape))) {
        stop("All states must be inside the grid! States are numerated row-wise starting with 0, check Details!")
      }
      checkmate::assertIntegerish(wind, len = shape[2])
      checkmate::assertNumber(reward.step)
      checkmate::assertNumber(reward.cliff)
      checkmate::assertFlag(diagonal.moves)
      checkmate::assertFlag(cliff.transition.done)
      checkmate::assertNumber(stochasticity, lower = 0, upper = 1)

      n.states = prod(shape)
      states = seq_len(n.states)
      n.col = shape[2]
      if (diagonal.moves) {
        n.actions = 8
        action.names = c("left" = 0L, "right" = 1L, "up" = 2L, "down" = 3L,
          "leftup" = 4L, "leftdown" = 5L, "rightup" = 6L, "rightdown" = 7L)
      } else {
        n.actions = 4
        action.names = c("left" = 0L, "right" = 1L, "up" = 2L, "down" = 3L)
      }

      rewards = makeRewardMatrix(reward.step, reward.cliff, n.states, n.actions,
        cliff.states, goal.states)

      # Transitions are always built for all 8 moves and cut down to
      # n.actions at the very end.
      transitions = array(matrix(0, nrow = n.states, ncol = n.states),
        dim = c(n.states, n.states, 8))

      border.states = list(left = seq(1, n.states - n.col + 1, n.col),
        right = seq(n.col, n.states, n.col),
        up = seq(1, n.col),
        down = seq(n.states - n.col + 1, n.states))

      non.terminal.states = setdiff(states, c(goal.states, cliff.states))
      # S3 dispatch trick: each action name becomes the class of a one-element
      # object so that go() dispatches to go.left, go.right, ...
      actions = list("left", "right", "up", "down", "leftup",
        "leftdown", "rightup", "rightdown")
      actions = lapply(actions, function(x) {class(x) = x; x})

      m.cliff = NULL
      if (cliff.transition.done) {
        # Stepping into the cliff ends the episode: treat cliff states as
        # terminal (absorbing) like the goal states.
        goal.states = c(goal.states, cliff.states)
      } else {
        if (!is.null(cliff.states)) {
          if (!is.null(cliff.transition.states)) {
            # From each cliff state transition uniformly to one of the
            # cliff.transition.states (rows of (from, to, prob)).
            cliff.pairs = as.matrix(expand.grid(cliff.states, cliff.transition.states))
            cliff.prob = 1 / length(cliff.transition.states)
            m.cliff = cbind(cliff.pairs, cliff.prob)
          } else {
            non.terminal.states = c(non.terminal.states, cliff.states)
          }
        }
      }

      n.states = length(non.terminal.states)
      # One column per action, one row per non-terminal state: the state
      # reached by taking that action (before wind is applied).
      new.states = vapply(actions, go, states = non.terminal.states, border.states = border.states,
        n.col = n.col, FUN.VALUE = numeric(n.states))

      if (!is.matrix(new.states)) {
        # fix: with a single non-terminal state vapply() returns a plain
        # vector of length 8, which must become a 1 x 8 matrix (one row per
        # state, one column per action). The original used
        # as.matrix(new.states, nrow = 1), which ignores `nrow` and yields
        # an 8 x 1 column matrix, breaking the apply() below.
        new.states = matrix(new.states, nrow = 1)
      }

      # Stochastic part: with probability `stochasticity` transition to a
      # random neighbor, independent of the chosen action.
      m.stoch = matrix(0, nrow = n.states * 8, ncol = 3)
      m.stoch[, 1] = rep(non.terminal.states, 8)
      m.stoch[, 2] = c(new.states)
      m.stoch[, 3] = stochasticity / 8

      # Terminal (goal) states are absorbing with probability 1.
      m.goal = matrix(c(goal.states, goal.states, rep(1, length(goal.states))), ncol = 3)
      m = rbind(m.cliff, m.goal, m.stoch)
      # Replicate each (row, col, prob) triple for all 8 actions.
      m = m[rep(seq_len(nrow(m)), each = 8), ]
      m = cbind(m, action = rep(1:8, nrow(m) / 8))

      # Deterministic part: apply the upward wind of the departure column,
      # then clip states pushed off the top back into the grid.
      new.states = c(apply(new.states, 2, applyWind, states = non.terminal.states, n.col = n.col, wind = wind))
      new.states = getIntoBounds(new.states, n.col = n.col)

      m2 = matrix(c(rep(non.terminal.states, 8), new.states, rep(1 - stochasticity, length(new.states)),
        rep(1:8, each = length(non.terminal.states))), ncol = 4)
      m = rbind(m, m2)
      colnames(m) = c("row", "col", "prob", "action")

      # Sum probabilities of duplicate (row, col, action) entries, then write
      # them into the 3d transition array via matrix indexing.
      m = as.matrix(aggregate(prob ~ row + col + action, data = as.data.frame(m), FUN = "sum"))
      transitions[m[, c("row", "col", "action")]] = m[, "prob"]
      transitions = transitions[, , seq_len(n.actions)]

      # fix: restored `$` (the scraped source contained `\$`, which is an
      # invalid escape in R and would not parse).
      visualize = function(env) {
        message(cat(visualizeGridworld(shape, env$state), "\n"))
      }
      # fixme: make action.names and visualize overwriteable
      super$initialize(transitions = transitions, rewards = rewards,
        visualize = visualize, action.names = action.names, ...)
    }
  )
)

# Render the gridworld as a printable string: one "-" per cell, cells
# separated by spaces, rows separated by "\n", with the cell of
# `current.state` (0-based, row-wise numeration) marked as "o".
#
# shape:         integer(2), rows x columns of the grid.
# current.state: integer(1), 0-based index of the state to highlight.
# Returns a single character string.
visualizeGridworld = function(shape, current.state) {
  one.row = paste(rep("-", shape[2]), collapse = " ")
  grid.vis = paste("", one.row, collapse = "")
  for (i in seq_len(shape[1] - 1L)) {
    grid.vis = paste(grid.vis, "\n", one.row)
  }

  n = current.state + 1
  # Character position of the n-th "-" in the rendered string.
  str.pos = gregexpr("-", grid.vis)[[1]][n]
  # fix: the original built a regex containing the invalid escape `\$`
  # (a parse error in R). A direct in-place character replacement does the
  # same job without any regex.
  substr(grid.vis, str.pos, str.pos) = "o"
  grid.vis
}

# Build the per-state, per-action reward matrix: the step cost everywhere,
# the cliff penalty in cliff states, and zero reward in goal states
# (goal overrides cliff if a state is both).
makeRewardMatrix = function(reward.step, reward.cliff, n.states, n.actions,
  cliff.states, goal.states) {
  reward.matrix = matrix(rep(reward.step, n.states * n.actions),
    nrow = n.states, ncol = n.actions)
  if (!is.null(cliff.states)) {
    reward.matrix[cliff.states, ] = reward.cliff
  }
  reward.matrix[goal.states, ] = 0
  reward.matrix
}

# S3 generic for grid moves. Dispatches on the class of `x` (one of "left",
# "right", "up", "down", "leftup", "leftdown", "rightup", "rightdown" — set
# by the Gridworld constructor) to the matching go.<direction> method.
# Extra arguments (`states`, `border.states`, `n.col`) are forwarded via `...`.
go = function(x, ...) {
UseMethod("go", x)
}

#' @export
go.left = function(x, states, border.states, ...) {
  # Shift every state one cell to the left; states already on the left
  # border of the grid stay where they are.
  blocked = states %in% border.states[["left"]]
  result = states
  result[!blocked] = states[!blocked] - 1
  result
}

#' @export
go.right = function(x, states, border.states, ...) {
  # Shift every state one cell to the right; states already on the right
  # border of the grid stay where they are.
  blocked = states %in% border.states[["right"]]
  result = states
  result[!blocked] = states[!blocked] + 1
  result
}

#' @export
go.up = function(x, states, border.states, n.col) {
  # Move up one row, i.e. subtract one row width (n.col indices); states on
  # the top row stay where they are.
  blocked = states %in% border.states[["up"]]
  result = states
  result[!blocked] = states[!blocked] - n.col
  result
}

#' @export
go.down = function(x, states, border.states, n.col) {
  # Move down one row, i.e. add one row width (n.col indices); states on
  # the bottom row stay where they are.
  blocked = states %in% border.states[["down"]]
  result = states
  result[!blocked] = states[!blocked] + n.col
  result
}

#' @export
go.leftup = function(x, states, border.states, n.col) {
  # Diagonal move: first go up, then left; each leg respects its own border.
  moved.up = go.up(x, states, border.states, n.col)
  go.left(x, moved.up, border.states)
}

#' @export
go.leftdown = function(x, states, border.states, n.col) {
  # Diagonal move: first go down, then left; each leg respects its own border.
  moved.down = go.down(x, states, border.states, n.col)
  go.left(x, moved.down, border.states)
}

#' @export
go.rightup = function(x, states, border.states, n.col) {
  # Diagonal move: first go up, then right; each leg respects its own border.
  moved.up = go.up(x, states, border.states, n.col)
  go.right(x, moved.up, border.states)
}

#' @export
go.rightdown = function(x, states, border.states, n.col) {
  # Diagonal move: first go down, then right; each leg respects its own border.
  moved.down = go.down(x, states, border.states, n.col)
  go.right(x, moved.down, border.states)
}

applyWind = function(states, new.states, wind, n.col) {
  # Wind pushes the agent upward from its departure column: subtract
  # wind[column] rows (one row = n.col state indices) from the already-moved
  # states. The 1-based column index of a 1-based, row-wise state id s is
  # ((s - 1) %% n.col) + 1.
  col.idx = ((states - 1) %% n.col) + 1
  new.states - wind[col.idx] * n.col
}

getIntoBounds = function(new.states, n.col) {
  # States pushed above the top edge of the grid (index <= 0, e.g. by wind)
  # are moved back down one row (= n.col indices) at a time until every
  # state lies inside the grid again.
  above.grid = new.states <= 0
  while (any(above.grid)) {
    new.states[above.grid] = new.states[above.grid] + n.col
    above.grid = new.states <= 0
  }
  new.states
}

#' Cliff Walking
#'
#' Gridworld environment for reinforcement learning from Sutton & Barto (2017).
#' Grid of shape 4x12 with a goal state in the bottom right of the grid.
#' Episodes start in the lower left state. Possible actions include going left, right, up and down.
#' Some states in the lower part of the grid are a cliff,
#' so taking a step into this cliff will yield a high negative reward of -100 and move the agent
#' back to the starting state.
#' Otherwise rewards are -1; for the goal state the reward is 0.
#'
#' This is the gridworld (goal state denoted G, cliff states denoted C, start state denoted S):
#' \tabular{rrrrrrrrrrrr}{
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  S \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab G \cr
#' }
#'
#' @section Usage:
#' `makeEnvironment("cliff.walking", ...)`
#'
#' @param ... \[`any`] \cr Arguments passed on to [makeEnvironment].
#'
#' @md
#'
#' @name CliffWalking
#' @aliases cliff.walking
#'
#' @references Sutton and Barto (Book draft 2017): Reinforcement Learning: An Introduction Example 6.6
#' @export
#'
#' @inheritSection Environment Methods
#' @examples
#' env = makeEnvironment("cliff.walking")
NULL

# Cliff Walking environment (Sutton & Barto, Example 6.6): a fixed 4x12
# gridworld with the start state in the lower left, the goal in the lower
# right, and a cliff along the bottom row between them. Thin wrapper that
# pins the Gridworld constructor arguments; extra arguments in `...` are
# forwarded to Gridworld/makeEnvironment.
CliffWalking = R6::R6Class("CliffWalking",
  inherit = Gridworld,
  public = list(
    initialize = function(...) {
      # fix: restored `$` (the scraped source contained the invalid escape
      # `\$`, which would not parse).
      super$initialize(shape = c(4, 12), goal.states = 47,
        cliff.states = 37:46, reward.step = -1, reward.cliff = -100,
        cliff.transition.states = 36, initial.state = 36, ...)
    }
  )
)

#' Windy Gridworld
#'
#' Windy Gridworld problem for reinforcement learning. Actions include
#' going left, right, up and down. In each column the wind pushes you up a
#' specific number of steps (for the next action). If an action would
#' take you off the grid, you remain in the previous state. For each step you
#' get a reward of -1, until you reach a terminal state.
#'
#' This is the gridworld (goal state denoted G, start state denoted S).
#' The last row specifies the upward wind in each column.
#' \tabular{rrrrrrrrrr}{
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  S \tab . \tab . \tab . \tab . \tab . \tab . \tab G \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  0 \tab 0 \tab 0 \tab 1 \tab 1 \tab 1 \tab 2 \tab 2 \tab 1 \tab 0 \cr
#' }
#'
#' @section Usage:
#' `makeEnvironment("windy.gridworld", ...)`
#'
#' @param ... \[`any`] \cr Arguments passed on to [makeEnvironment].
#'
#' @md
#'
#' @name WindyGridworld
#' @aliases windy.gridworld
#'
#' @references Sutton and Barto (Book draft 2017): Reinforcement Learning: An Introduction Example 6.5
#' @export
#'
#' @inheritSection Environment Methods
#' @examples
#' env = makeEnvironment("windy.gridworld")
NULL

# Windy Gridworld environment (Sutton & Barto, Example 6.5): a fixed 7x10
# gridworld with a deterministic upward wind per column. Thin wrapper that
# pins the Gridworld constructor arguments; extra arguments in `...` are
# forwarded to Gridworld/makeEnvironment.
WindyGridworld = R6::R6Class("WindyGridworld",
  inherit = Gridworld,
  public = list(
    initialize = function(...) {
      # fix: restored `$` (the scraped source contained the invalid escape
      # `\$`, which would not parse).
      super$initialize(shape = c(7, 10), goal.states = 37,
        reward.step = -1, wind = c(0, 0, 0, 1, 1, 1, 2, 2, 1, 0),
        initial.state = 30, ...)
    }
  )
)

# fixme: add character actions (e.g. "left")

Try the reinforcelearn package in your browser

Any scripts or data that you put into this service are public.

reinforcelearn documentation built on May 2, 2019, 9:20 a.m.