#' (Contextual) MAB Simulation
#'
#' @param nsteps The number of steps (decisions) in the simulation.
#' @param reward_funs A list of functions givin the expected reward of each earm (given context x).
#' @param sd A numeric vector giving the standard deviation of each lever.
#' @param J The number of basis functions to be used in the model.
#' @param alpha A scaling parameter for the prior variances. The prior variance of the
#' i'th model coefficient is given by i^-alpha (the intercept is the 0'th coefficient, and
#' has a prior variance of 1).
#' @param a The shape parameter for the inverse gamma distribution.
#' @param b The scale parameter for the inverse gamma distribution.
#' @param repartition Set to FALSE if you don't want to partition the context space.
#'
#' @export
#'
#' @importFrom magrittr %<>%
mab_sim <- function(nsteps,reward_funs,sd,J,alpha = 1,a = 3,b = 1, repartition = TRUE){
# First we set up arm objects (i.e. prior distributions)
num_arms <- length(reward_funs)
arms <- list()
for (i in 1:num_arms) arms[[i]] <- new_arm(reward_funs[[i]],sd[i],J,alpha,a,b)
basis <- legendre_basis(J)
x <- c(); action <- c(); r <- c(); t_regret <- c()
regret_sum <- 0
# Partitioning parameters:
pow <- rep(6,num_arms); pulls <- rep(0, num_arms); d <- rep(0,num_arms)
for (t in 1:nsteps){ # First we test the posterior converges correctly.
x <- c( x, stats::runif(1,-1,1) )
subset_index <- rep(0,num_arms); for(i in 1:num_arms) subset_index[i] <- allocate(x[t], arms[[i]]@partition )
x_map <- rep(0,num_arms)
for(i in 1:num_arms){
part <- arms[[i]]@partition
index <- subset_index[i]
x_map[i] <- remap(x[t],part[index,])
}
# Simulate expected reward and choose arm.
exp_r_sim <- rep(0,num_arms)
for(i in 1:num_arms){
index <- subset_index[i]
dis <- arms[[i]]@distributions[[index]]
beta_sim <- sim(dis)
phi <- expand(x_map[i],basis)
exp_r_sim[i] <- t(phi)%*%beta_sim
}
action <- c( action, which.max(exp_r_sim) )
pulls[ action[t] ] <- pulls[ action[t] ] + 1
r <- c( r, stats::rnorm(1, mean = reward_funs[[ action[t] ]]( x[t] ), sd = sd[ action[t] ]) ) # Observe reward.
# Compute regret.
exp_r_true <- rep(0,num_arms)
for(i in 1:num_arms) exp_r_true[i] <- reward_funs[[i]](x[t])
regret_sum <- regret_sum + max(exp_r_true) - exp_r_true[ action[t] ]
t_regret <- c(t_regret,regret_sum)
# Update Posterior
prior <- dist(arms[[ action[t] ]], subset_index[ action[t] ] )
posterior <- update_dist(x_map[ action[t] ],r[t],prior,basis)
dist(arms[[ action[t] ]], subset_index[ action[t] ] ) <- posterior
# Repartition when necessary.
if(repartition){
if(pulls[ action[t] ] == 2^pow[ action[t] ] ){
pow[action[t]] <- pow[action[t]] + 2; d[action[t]] <- d[action[t]] + 1
x1 <- x[action == action[t]] # Context observations associated with arm action[t].
r1 <- r[action == action[t]]
part <- arms[[ action[t] ]]@partition
re_part <- partition(x1,d[ action[t] ])
subset_index <- allocate(x1,re_part)
data <- data.frame(x = x1, r = r1, subset = subset_index)
k <- nrow(re_part)
new_distributions <- list()
for(i in 1:k){ # Re-train posterior from scratch for each subset.
subset_data <- data[data$subset == i,]
subset_data$x <- remap(subset_data$x,re_part[i,])
prior <- new_prior(J,alpha,a,b)
posterior <- update_dist(subset_data$x,subset_data$r,prior,basis)
new_distributions[[i]] <- posterior
}
arms[[ action[t] ]]@distributions <- new_distributions
arms[[ action[t] ]]@partition <- re_part
}
}
}
history <- data.frame(x = x, a = action, r = r, t_regret = t_regret)
return(list(arms = arms, history = history))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.