# R/MDP.R

#' Define an MDP Problem
#'
#' Defines all the elements of an MDP problem.
#' 
#' MDPs are similar to POMDPs, but their states are completely observable, so
#' no observations are needed. The model is defined similarly to [POMDP]
#' models, except that observations are not specified and the `'observations'`
#' column in the reward specification is always `'*'`.
#' 
#' `MDP2POMDP()` reformulates an MDP as a POMDP with one observation per state
#' that reveals the current state. This is achieved by defining identity
#' observation probability matrices.
#'
#' More details on specifying the model components can be found in the documentation
#' for [POMDP].
#'
#' @family MDP
#'
#' @include POMDP.R
#' @param states a character vector specifying the names of the states.
#' @param actions a character vector specifying the names of the available
#' actions.
#' @param transition_prob Specifies the transition probabilities between
#' states.
#' @param reward Specifies the rewards dependent on the action and the start
#' and end states (the observation column is always `'*'` for MDPs).
#' @param discount numeric; discount rate between 0 and 1.
#' @param horizon numeric; number of epochs. `Inf` specifies an infinite
#' horizon.
#' @param start Specifies in which state the MDP starts.
#' @param name a string to identify the MDP problem.
#' @param x a `MDP` object.
#' 
#' @return The function returns an object of class MDP, which is a list with
#'   the model specification. [solve_MDP()] reads the object and adds a list
#'   element called `'solution'`.
#' @author Michael Hahsler
#' @examples
#' # Michael's Sleepy Tiger Problem is like the POMDP Tiger problem, but
#' # has completely observable states because the tiger is sleeping in front
#' # of the door. This makes the problem an MDP.
#'
#' STiger <- MDP(
#'   name = "Michael's Sleepy Tiger Problem",
#'   discount = .9,
#'
#'   states = c("tiger-left" , "tiger-right"),
#'   actions = c("open-left", "open-right", "do-nothing"),
#'   start = "uniform",
#'
#'   # opening a door resets the problem
#'   transition_prob = list(
#'     "open-left" =  "uniform",
#'     "open-right" = "uniform",
#'     "do-nothing" = "identity"),
#'
#'   # the reward helper R_() expects: action, start.state, end.state, observation, value
#'   reward = rbind(
#'     R_("open-left",  "tiger-left",  v = -100),
#'     R_("open-left",  "tiger-right", v =   10),
#'     R_("open-right", "tiger-left",  v =   10),
#'     R_("open-right", "tiger-right", v = -100),
#'     R_("do-nothing",                v =    0)
#'   )
#' )
#'
#' STiger
#'
#' sol <- solve_MDP(STiger, eps = 1e-7)
#' sol
#' 
#' policy(sol)
#' plot_value_function(sol)
#' 
#' # convert the MDP into a POMDP and solve
#' STiger_POMDP <- MDP2POMDP(STiger)
#' sol2 <- solve_POMDP(STiger_POMDP)
#' sol2 
#' 
#' policy(sol2)
#' plot_value_function(sol2)
#' @export
MDP <- function(states,
  actions,
  transition_prob,
  reward,
  discount = .9,
  horizon = Inf,
  start = "uniform",
  name = NA) {
  
  ### unsolved MDP model
  x <- list(
      name = name,
      discount = discount,
      horizon = horizon,
      states = states,
      actions = actions,
      transition_prob = transition_prob,
      reward = reward,
      start = start
  )
  
  class(x) <- c("MDP", "list")
  check_and_fix_MDP(x)
}
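
## A hedged sketch of the explicit matrix form that `transition_prob` also
## accepts (instead of the "uniform"/"identity" keywords used in the roxygen
## example above): one row-stochastic matrix per action, with rows as start
## states and columns as end states. State and action names follow the STiger
## example; the snippet is illustrative only and therefore left commented out.
# tiger_states <- c("tiger-left", "tiger-right")
# tp_matrices <- list(
#   "open-left"  = matrix(0.5, 2, 2, dimnames = list(tiger_states, tiger_states)),
#   "open-right" = matrix(0.5, 2, 2, dimnames = list(tiger_states, tiger_states)),
#   "do-nothing" = matrix(c(1, 0, 0, 1), 2, 2,
#     dimnames = list(tiger_states, tiger_states))
# )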


#' @export
print.MDP <- function(x, ...) {
  writeLines(paste(paste(class(x), collapse = ", "),
    "-",
    x$name))
  
  if (!is.null(x$discount))
    writeLines(sprintf("  Discount factor: %s",
      paste(x$discount, collapse = "+")))
  
  if (!is.null(x$horizon))
    writeLines(sprintf("  Horizon: %s epochs",
      paste(x$horizon, collapse = " + ")))
  
  writeLines(sprintf("  Size: %d states / %d actions\n",
    length(x$states), length(x$actions)))
  
  if (is_solved_MDP(x))
    writeLines(c(
      "  Solved:",
      sprintf("    Solution converged: %s",
        x$solution$converged)
      )
    )
  
  writeLines(strwrap(
    paste("List components:", paste(sQuote(names(
      x
    )), collapse = ", "), "\n"),
    indent = 2,
    exdent = 4
  ))
}
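
## Sample output for the unsolved STiger model from the examples above
## (sketch; the component list is abbreviated, and a solved model would
## additionally print the "Solved:" block):
# STiger
# #> MDP, list - Michael's Sleepy Tiger Problem
# #>   Discount factor: 0.9
# #>   Horizon: Inf epochs
# #>   Size: 2 states / 3 actions
# #>
# #>   List components: 'name', 'discount', 'horizon', 'states', ...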

#' @rdname MDP
#' @export
MDP2POMDP <- function(x) {
  if (!inherits(x, "MDP"))
    stop("'x' needs to be of class 'MDP'.")
  
  # add one observation per state and an identity observation probability
  # matrix for all actions ('*')
  # (note: pomdp-solve does not support "identity" for observation_probs)
  x$observations <- x$states
  ident_matrix <- diag(length(x$states))
  dimnames(ident_matrix) <- list(x$states, x$observations)
  
  x$observation_prob <- sapply(x$actions, FUN = function(a) ident_matrix, simplify = FALSE)
  class(x) <- c("MDP", "POMDP", "list")
  x
}
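
## For a 2-state model such as STiger, the conversion yields one identity
## observation matrix per action, i.e., each observation reveals the true
## state with probability 1 (sketch; output shown approximately):
# STiger_POMDP <- MDP2POMDP(STiger)
# STiger_POMDP$observation_prob[["do-nothing"]]
# #>             tiger-left tiger-right
# #> tiger-left           1           0
# #> tiger-right          0           1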

#' @rdname MDP
#' @param stop logical; stop with an error.
#' @export
is_solved_MDP <- function(x, stop = FALSE) {
  if (!inherits(x, "MDP"))
    stop("x needs to be a MDP object!")
  solved <- !is.null(x$solution)
  if (stop && !solved)
    stop("x needs to be a solved MDP. Use solve_MDP() first.")
  
  solved
}
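
## Usage sketch (assumes the STiger model from the examples above): the
## function works as a predicate or, with `stop = TRUE`, as a guard clause
## inside other functions.
# is_solved_MDP(STiger)               # FALSE for an unsolved model
# is_solved_MDP(solve_MDP(STiger))    # TRUE after solving
# is_solved_MDP(STiger, stop = TRUE)  # stops with an informative error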

## this is .get_pg_index for MDPs
.get_pol_index <- function(model, epoch) {
  
  epoch <- as.integer(epoch)
  if (epoch < 1L) stop("Epoch has to be >= 1")
  
  ### converged infinite-horizon MDPs store a single policy; the epoch is ignored.
  if (length(model$solution$policy) == 1L) return(1L)
  
  ### regular epoch for finite/infinite horizon case
  if (epoch > length(model$solution$policy)) 
    stop("MDP model has only a policy up to epoch ", length(model$solution$policy))
      
  return(epoch)
}
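
## Mapping sketch (the solved-model objects below are hypothetical): a
## converged infinite-horizon solution stores a single policy, so every epoch
## maps to index 1; a finite-horizon solution with, e.g., 3 policies maps
## epochs 1..3 directly and stops for larger epochs.
# .get_pol_index(sol_converged, epoch = 10)  # returns 1L
# .get_pol_index(sol_3_epochs, epoch = 2)    # returns 2L
# .get_pol_index(sol_3_epochs, epoch = 5)    # error: policy only up to epoch 3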
