MABsim: Run (contextual) Multi-Armed Bandit Simulations

Documented in mab_sim

#' (Contextual) MAB Simulation
#'
#' @param nsteps The number of steps (decisions) in the simulation.
#' @param reward_funs A list of functions givin the expected reward of each earm (given context x).
#' @param sd A numeric vector giving the standard deviation of each lever.
#' @param J The number of basis functions to be used in the model.
#' @param alpha A scaling parameter for the prior variances. The prior variance of the
#' i'th model coefficient is given by i^-alpha (the intercept is the 0'th coefficient, and
#' has a prior variance of 1).
#' @param a The shape parameter for the inverse gamma distribution.
#' @param b The scale parameter for the inverse gamma distribution.
#' @param repartition Set to FALSE if you don't want to partition the context space.
#'
#' @export
#'
#' @importFrom magrittr %<>%
mab_sim <- function(nsteps,reward_funs,sd,J,alpha = 1,a = 3,b = 1, repartition = TRUE){

  # First we set up arm objects (i.e. prior distributions)
  num_arms <- length(reward_funs)
  arms <- list()
  for (i in 1:num_arms) arms[[i]] <- new_arm(reward_funs[[i]],sd[i],J,alpha,a,b)

  basis <- legendre_basis(J)
  x <- c(); action <- c(); r <- c(); t_regret <- c()
  regret_sum <- 0

  # Partitioning parameters:
  pow <- rep(6,num_arms); pulls <- rep(0, num_arms); d <- rep(0,num_arms)

  for (t in 1:nsteps){ # First we test the posterior converges correctly.
    x <- c( x, stats::runif(1,-1,1) )
    subset_index <- rep(0,num_arms); for(i in 1:num_arms) subset_index[i] <- allocate(x[t], arms[[i]]@partition )
    x_map <- rep(0,num_arms)
    for(i in 1:num_arms){
      part <- arms[[i]]@partition
      index <- subset_index[i]
      x_map[i] <- remap(x[t],part[index,])
    }

    # Simulate expected reward and choose arm.
    exp_r_sim <- rep(0,num_arms)
    for(i in 1:num_arms){
      index <- subset_index[i]
      dis <- arms[[i]]@distributions[[index]]
      beta_sim <-  sim(dis)
      phi <- expand(x_map[i],basis)
      exp_r_sim[i] <- t(phi)%*%beta_sim
    }
    action <- c( action, which.max(exp_r_sim) )
    pulls[ action[t] ] <- pulls[ action[t] ] + 1
    r <- c( r, stats::rnorm(1, mean = reward_funs[[ action[t] ]]( x[t] ), sd = sd[ action[t] ]) ) # Observe reward.

    # Compute regret.
    exp_r_true <- rep(0,num_arms)
    for(i in 1:num_arms) exp_r_true[i] <- reward_funs[[i]](x[t])
    regret_sum <- regret_sum + max(exp_r_true) - exp_r_true[ action[t] ]
    t_regret <- c(t_regret,regret_sum)

    # Update Posterior
    prior <- dist(arms[[ action[t] ]], subset_index[ action[t] ] )
    posterior <- update_dist(x_map[ action[t] ],r[t],prior,basis)
    dist(arms[[ action[t] ]], subset_index[ action[t] ] ) <- posterior

    # Repartition when necessary.
    if(repartition){
      if(pulls[ action[t] ] == 2^pow[ action[t] ] ){
        pow[action[t]] <- pow[action[t]] + 2; d[action[t]] <- d[action[t]] + 1
        x1 <- x[action == action[t]] # Context observations associated with arm action[t].
        r1 <- r[action == action[t]]
        part <- arms[[ action[t] ]]@partition
        re_part <- partition(x1,d[ action[t] ])
        subset_index <- allocate(x1,re_part)
        data <- data.frame(x = x1, r = r1, subset = subset_index)
        k <- nrow(re_part)
        new_distributions <- list()
        for(i in 1:k){ # Re-train posterior from scratch for each subset.
          subset_data <- data[data$subset == i,]
          subset_data$x <- remap(subset_data$x,re_part[i,])
          prior <- new_prior(J,alpha,a,b)
          posterior <- update_dist(subset_data$x,subset_data$r,prior,basis)
          new_distributions[[i]] <- posterior
        }
        arms[[ action[t] ]]@distributions <- new_distributions
        arms[[ action[t] ]]@partition <- re_part
      }
    }
  }
  history <- data.frame(x = x, a = action, r = r, t_regret = t_regret)
  return(list(arms = arms, history = history))
}