# R/environment_gridworld.R

#' Gridworld
#'
#' Creates gridworld environments.
#'
#' A gridworld is an episodic navigation task; the goal is to get from a start state to a goal state.
#'
#' Possible actions are going left, right, up or down. If `diagonal.moves = TRUE`, the diagonal
#' moves leftup, leftdown, rightup and rightdown are also possible.
#'
#' When stepping into a cliff state you get a reward of `reward.cliff`
#' (usually a large negative reward) and transition to a state specified by `cliff.transition.states`.
#'
#' In each column a deterministic wind, specified via `wind`, pushes you up the given number of
#' grid cells (applied to the next action).
#'
#' A stochastic gridworld is a gridworld where, with probability `stochasticity`, the next state
#' is chosen at random from all neighboring states, independent of the actual action.
#'
#' If an action would take you off the grid, the new state is the nearest cell inside the grid.
#' For each step you get a reward of `reward.step` until you reach a goal state,
#' where the episode ends.
#'
#' States are numbered row-wise, starting with 0.
#' Here is an example 4x4 grid:
#' \tabular{rrrr}{
#'  0 \tab 1 \tab 2 \tab 3 \cr
#'  4 \tab 5 \tab 6 \tab 7 \cr
#'  8 \tab 9 \tab 10 \tab 11 \cr
#'  12 \tab 13 \tab 14 \tab 15 \cr
#' }
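#'
#' For illustration, the mapping between a 0-based state index and (row, column)
#' coordinates can be sketched like this (`state.to.coords` is a hypothetical
#' helper, not part of the package API):
#' \preformatted{
#' state.to.coords = function(state, n.col) {
#'   c(row = state %/% n.col + 1L, col = state %% n.col + 1L)
#' }
#' state.to.coords(9L, n.col = 4L) # row 3, column 2 in the 4x4 grid above
#' }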
#' So a board position could look like this (G: goal state, x: current state, C: cliff state):
#' \tabular{rrrr}{
#'  G \tab o \tab o \tab o \cr
#'  o \tab o \tab o \tab o \cr
#'  o \tab x \tab o \tab o \cr
#'  o \tab o \tab o \tab C \cr
#' }
#'
#' @section Usage:
#' `makeEnvironment("gridworld", shape = NULL, goal.states = NULL, cliff.states = NULL,
#'   reward.step = -1, reward.cliff = -100, diagonal.moves = FALSE, wind = rep(0, shape[2]),
#'   cliff.transition.states = NULL, cliff.transition.done = FALSE, stochasticity = 0, ...)`
#'
#' @param shape \[`integer(2)`] \cr
#'   Shape of the gridworld (number of rows x number of columns).
#' @param goal.states \[`integer`] \cr
#'   Goal states in the gridworld.
#' @param cliff.states \[`integer`] \cr
#'   Cliff states in the gridworld.
#' @param reward.step \[`integer(1)`] \cr
#'   Reward for taking a step.
#' @param cliff.transition.states \[`integer`] \cr
#'   States to which the environment transitions if stepping into the cliff.
#'   If it is a vector, all states will have equal probability.
#'   Only used when `cliff.transition.done == FALSE`,
#'   else specify the `initial.state` argument.
#' @param reward.cliff \[`integer(1)`] \cr
#'   Reward for taking a step in the cliff state.
#' @param diagonal.moves \[`logical(1)`] \cr
#'   Should diagonal moves be allowed?
#' @param wind \[`integer`] \cr
#'   Strength of the upward wind in each cell.
#' @param cliff.transition.done \[`logical(1)`] \cr
#'   Should the episode end after stepping into the cliff?
#' @param stochasticity \[`numeric(1)`] \cr
#'   Probability of random transition to any of the neighboring states when taking any action.
#' @param ... \[`any`] \cr Arguments passed on to [makeEnvironment].
#'
#' @name Gridworld
#' @md
#' @inheritSection Environment Methods
#' @export
#'
#' @examples
#' # Gridworld Environment (Sutton & Barto Example 4.1)
#' env1 = makeEnvironment("gridworld", shape = c(4L, 4L), goal.states = 0L,
#'   initial.state = 15L)
#' env1$reset()
#' env1$visualize()
#' env1$step(0L)
#' env1$visualize()
#'
#' # Windy Gridworld (Sutton & Barto Example 6.5)
#' env2 = makeEnvironment("gridworld", shape = c(7, 10), goal.states = 37L,
#'   reward.step = -1, wind = c(0, 0, 0, 1, 1, 1, 2, 2, 1, 0),
#'   initial.state = 30L)
#'
#' # Cliff Walking (Sutton & Barto Example 6.6)
#' env3 = makeEnvironment("gridworld", shape = c(4, 12), goal.states = 47L,
#'   cliff.states = 37:46, reward.step = -1, reward.cliff = -100,
#'   cliff.transition.states = 36L, initial.state = 36L)
NULL

Gridworld = R6::R6Class("Gridworld",
  inherit = MdpEnvironment,

  public = list(

    initialize = function(shape = NULL, goal.states = NULL, cliff.states = NULL,
      reward.step = -1, reward.cliff = -100, diagonal.moves = FALSE, wind = rep(0, shape[2]),
      cliff.transition.states = NULL, cliff.transition.done = FALSE, stochasticity = 0, ...) {

      checkmate::assertIntegerish(shape, len = 2)
      if (prod(shape) <= 1) {
        stop("A gridworld with only one state is not allowed!")
      }
      checkmate::assertIntegerish(goal.states)
      goal.states = goal.states + 1
      checkmate::assertIntegerish(cliff.states, null.ok = TRUE)
      if (!is.null(cliff.states)) {
        cliff.states = cliff.states + 1
      }
      checkmate::assertIntegerish(cliff.transition.states, null.ok = TRUE)
      if (!is.null(cliff.transition.states)) {
        cliff.transition.states = cliff.transition.states + 1
      }
      if (any(goal.states > prod(shape)) || any(cliff.states > prod(shape)) ||
          any(cliff.transition.states > prod(shape))) {
        stop("All states must be inside the grid! States are numbered row-wise starting with 0, check Details!")
      }
      checkmate::assertIntegerish(wind, len = shape[2])
      checkmate::assertNumber(reward.step)
      checkmate::assertNumber(reward.cliff)
      checkmate::assertFlag(diagonal.moves)
      checkmate::assertFlag(cliff.transition.done)
      checkmate::assertNumber(stochasticity, lower = 0, upper = 1)

      n.states = prod(shape)
      states = seq_len(n.states)
      n.col = shape[2]
      if (diagonal.moves) {
        n.actions = 8
        action.names = c("left" = 0L, "right" = 1L, "up" = 2L, "down" = 3L,
          "leftup" = 4L, "leftdown" = 5L, "rightup" = 6L, "rightdown" = 7L)
      } else {
        n.actions = 4
        action.names = c("left" = 0L, "right" = 1L, "up" = 2L, "down" = 3L)
      }

      rewards = makeRewardMatrix(reward.step, reward.cliff, n.states, n.actions,
        cliff.states, goal.states)

      transitions = array(0, dim = c(n.states, n.states, 8))

      border.states = list(left = seq(1, n.states - n.col + 1, n.col),
        right = seq(n.col, n.states, n.col),
        up = seq(1, n.col),
        down = seq(n.states - n.col + 1, n.states))

      non.terminal.states = setdiff(states, c(goal.states, cliff.states))
      actions = list("left", "right", "up", "down", "leftup",
        "leftdown", "rightup", "rightdown")
      # Give each action name its own S3 class so that go() dispatches to
      # the matching method (go.left, go.rightup, ...).
      actions = lapply(actions, function(x) {class(x) = x; x})

      m.cliff = NULL
      # If the episode ends in the cliff, treat cliff states like goal states;
      # otherwise build explicit cliff -> cliff.transition.states transitions.
      if (cliff.transition.done) {
        goal.states = c(goal.states, cliff.states)
      } else {
        if (!is.null(cliff.states)) {
          if (!is.null(cliff.transition.states)) {
            cliff.pairs = as.matrix(expand.grid(cliff.states, cliff.transition.states))
            cliff.prob = 1 / length(cliff.transition.states)
            m.cliff = cbind(cliff.pairs, cliff.prob)
          } else {
            non.terminal.states = c(non.terminal.states, cliff.states)
          }
        }
      }

      # From here on, n.states counts only the non-terminal states.
      n.states = length(non.terminal.states)
      new.states = vapply(actions, go, states = non.terminal.states, border.states = border.states,
        n.col = n.col, FUN.VALUE = numeric(n.states))

      if (!is.matrix(new.states)) {
        # With a single non-terminal state, vapply returns a vector; reshape it
        # into a 1 x 8 matrix (as.matrix would produce a column matrix instead).
        new.states = matrix(new.states, nrow = 1)
      }

      # Stochastic part: with probability stochasticity the successor state is
      # drawn uniformly from the 8 neighboring moves, whatever the action.
      m.stoch = matrix(0, nrow = n.states * 8, ncol = 3)
      m.stoch[, 1] = rep(non.terminal.states, 8)
      m.stoch[, 2] = c(new.states)
      m.stoch[, 3] = stochasticity / 8

      m.goal = matrix(c(goal.states, goal.states, rep(1, length(goal.states))), ncol = 3)
      # Replicate cliff, goal and stochastic transitions across all 8 actions,
      # since they apply regardless of which action is taken.
      m = rbind(m.cliff, m.goal, m.stoch)
      m = m[rep(seq_len(nrow(m)), each = 8), ]
      m = cbind(m, action = rep(1:8, nrow(m) / 8))

      # Apply the column-wise upward wind, then pull states that were pushed
      # over the top edge back onto the grid.
      new.states = c(apply(new.states, 2, applyWind, states = non.terminal.states, n.col = n.col, wind = wind))
      new.states = getIntoBounds(new.states, n.col = n.col)

      # Deterministic part: with probability 1 - stochasticity the intended
      # (wind-adjusted) move is taken.
      m2 = matrix(c(rep(non.terminal.states, 8), new.states, rep(1 - stochasticity, length(new.states)),
        rep(1:8, each = length(non.terminal.states))), ncol = 4)
      m = rbind(m, m2)
      colnames(m) = c("row", "col", "prob", "action")

      m = as.matrix(aggregate(prob ~ row + col + action, data = as.data.frame(m), FUN = "sum"))
      transitions[m[, c("row", "col", "action")]] = m[, "prob"]
      transitions = transitions[, , seq_len(n.actions)]

      visualize = function(env) {
        cat(visualizeGridworld(shape, env$state), "\n")
      }
      # fixme: make action.names and visualize overwriteable
      super$initialize(transitions = transitions, rewards = rewards,
        visualize = visualize, action.names = action.names, ...)
    }
  )
)
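
# A commented sketch of a stochastic variant, using only arguments documented
# above: with probability 0.2 the successor state is drawn uniformly from the
# eight neighboring moves, regardless of the chosen action.
# env = makeEnvironment("gridworld", shape = c(4L, 4L), goal.states = 0L,
#   initial.state = 15L, stochasticity = 0.2)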

visualizeGridworld = function(shape, current.state) {
  # Build a text grid of "-" cells, one line per grid row.
  one.row = paste(rep("-", shape[2]), collapse = " ")
  grid.vis = paste("", one.row, collapse = "")
  for (i in seq_len(shape[1] - 1L)) {
    grid.vis = paste(grid.vis, "\n", one.row)
  }

  n = current.state + 1
  # Find the position of the nth "-" (the agent's cell) ...
  str.pos = gregexpr("-", grid.vis)[[1]][n]
  # ... and replace it with "o" to mark the current state.
  grid.vis = sub(paste0("^(.{", str.pos - 1, "})(.)(.*$)"),
    "\\1o\\3", grid.vis)
  grid.vis
}
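
# A quick demo of the renderer (a sketch; the agent is marked "o" and states
# are numbered row-wise from 0, so state 5 is row 2, column 2 of a 3x4 grid):
# cat(visualizeGridworld(shape = c(3L, 4L), current.state = 5L))
# produces, up to whitespace:
#  - - - -
#  - o - -
#  - - - -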

makeRewardMatrix = function(reward.step, reward.cliff, n.states, n.actions,
  cliff.states, goal.states) {
  rewards = matrix(reward.step, nrow = n.states, ncol = n.actions)
  rewards[cliff.states, ] = reward.cliff
  rewards[goal.states, ] = 0
  rewards
}
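
# For example, with 4 states and 4 actions, cliff state 2 and goal state 4
# (1-based indices, as used internally), every entry is reward.step = -1
# except row 2 (all -100) and row 4 (all 0):
# makeRewardMatrix(-1, -100, n.states = 4L, n.actions = 4L,
#   cliff.states = 2L, goal.states = 4L)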

go = function(x, ...) {
  UseMethod("go", x)
}

#' @export
go.left = function(x, states, border.states, ...) {
  ifelse(states %in% border.states[["left"]], states, states - 1)
}

#' @export
go.right = function(x, states, border.states, ...) {
  ifelse(states %in% border.states[["right"]], states, states + 1)
}

#' @export
go.up = function(x, states, border.states, n.col) {
  ifelse(states %in% border.states[["up"]], states, states - n.col)
}

#' @export
go.down = function(x, states, border.states, n.col) {
  ifelse(states %in% border.states[["down"]], states, states + n.col)
}

#' @export
go.leftup = function(x, states, border.states, n.col) {
  go.left(x, go.up(x, states, border.states, n.col), border.states)
}

#' @export
go.leftdown = function(x, states, border.states, n.col) {
  go.left(x, go.down(x, states, border.states, n.col), border.states)
}

#' @export
go.rightup = function(x, states, border.states, n.col) {
  go.right(x, go.up(x, states, border.states, n.col), border.states)
}

#' @export
go.rightdown = function(x, states, border.states, n.col) {
  go.right(x, go.down(x, states, border.states, n.col), border.states)
}
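
# How the dispatch works: each action is a character string whose class is set
# to its own value (see Gridworld$initialize above), so go() dispatches to the
# matching method. A minimal sketch on a 3x4 grid with 1-based states:
# a = "left"; class(a) = "left"
# borders = list(left = c(1, 5, 9), right = c(4, 8, 12), up = 1:4, down = 9:12)
# go(a, states = c(2, 5), border.states = borders, n.col = 4)
# # c(1, 5): state 2 moves left to 1; state 5 is on the left border and stays.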

applyWind = function(states, new.states, wind, n.col) {
  # Column of the origin state (1-based); states %% n.col is 0 in the last column.
  column = states %% n.col
  column[column == 0] = n.col
  # The wind pushes the successor state up by wind[column] rows.
  new.states - wind[column] * n.col
}
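
# Example with the Windy Gridworld wind (n.col = 10): a move right from
# state 37 (column 7, wind strength 2) to state 38 is pushed up two rows:
# applyWind(states = 37, new.states = 38,
#   wind = c(0, 0, 0, 1, 1, 1, 2, 2, 1, 0), n.col = 10)
# # 18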

getIntoBounds = function(new.states, n.col) {
  # The wind only pushes upwards, so states can only leave the grid over the
  # top edge; move them down one row at a time until they are back inside.
  while (any(new.states <= 0)) {
    new.states[new.states <= 0] = new.states[new.states <= 0] + n.col
  }
  new.states
}
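
# For example, a state pushed above the top edge is moved back down:
# getIntoBounds(c(-3, 5), n.col = 4)
# # c(1, 5): -3 lands on state 1 (row 1 of the same column); 5 is unchanged.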

#' Cliff Walking
#'
#' Gridworld environment for reinforcement learning from Sutton & Barto (2017).
#' Grid of shape 4x12 with a goal state in the bottom right of the grid.
#' Episodes start in the lower left state. Possible actions include going left, right, up and down.
#' Some states in the lower part of the grid form a cliff:
#' stepping into the cliff yields a large negative reward of -100 and moves the agent
#' back to the starting state.
#' Otherwise rewards are -1, and 0 in the goal state.
#'
#' This is the gridworld (goal state denoted G, cliff states denoted C, start state denoted S):
#' \tabular{rrrrrrrrrrrr}{
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  S \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab C \tab G \cr
#' }
#'
#' @section Usage:
#' `makeEnvironment("cliff.walking", ...)`
#'
#' @param ... \[`any`] \cr Arguments passed on to [makeEnvironment].
#'
#' @md
#'
#' @name CliffWalking
#' @aliases cliff.walking
#'
#' @references Sutton and Barto (Book draft 2017): Reinforcement Learning: An Introduction Example 6.6
#' @export
#'
#' @inheritSection Environment Methods
#' @examples
#' env = makeEnvironment("cliff.walking")
NULL

CliffWalking = R6::R6Class("CliffWalking",
  inherit = Gridworld,
  public = list(
    initialize = function(...) {
      super$initialize(shape = c(4, 12), goal.states = 47,
        cliff.states = 37:46, reward.step = -1, reward.cliff = -100,
        cliff.transition.states = 36, initial.state = 36, ...)
    }
  )
)

#' Windy Gridworld
#'
#' Windy Gridworld problem for reinforcement learning. Actions include
#' going left, right, up and down. In each column the wind pushes you up a
#' specific number of steps (for the next action). If an action would
#' take you off the grid, you remain in the previous state. For each step you
#' get a reward of -1, until you reach a terminal state.
#'
#' This is the gridworld (goal state denoted G, start state denoted S).
#' The last row specifies the upward wind in each column.
#' \tabular{rrrrrrrrrr}{
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  S \tab . \tab . \tab . \tab . \tab . \tab . \tab G \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \tab . \cr
#'  0 \tab 0 \tab 0 \tab 1 \tab 1 \tab 1 \tab 2 \tab 2 \tab 1 \tab 0 \cr
#' }
#'
#' @section Usage:
#' `makeEnvironment("windy.gridworld", ...)`
#'
#' @param ... \[`any`] \cr Arguments passed on to [makeEnvironment].
#'
#' @md
#'
#' @name WindyGridworld
#' @aliases windy.gridworld
#'
#' @references Sutton and Barto (Book draft 2017): Reinforcement Learning: An Introduction Example 6.5
#' @export
#'
#' @inheritSection Environment Methods
#' @examples
#' env = makeEnvironment("windy.gridworld")
NULL

WindyGridworld = R6::R6Class("WindyGridworld",
  inherit = Gridworld,
  public = list(
    initialize = function(...) {
      super$initialize(shape = c(7, 10), goal.states = 37,
        reward.step = -1, wind = c(0, 0, 0, 1, 1, 1, 2, 2, 1, 0),
        initial.state = 30, ...)
    }
  )
)

# fixme: add character actions (e.g. "left")