contextual: Simulation and Analysis of Contextual Multi-Armed Bandit Policies

UnpooledUCBPolicy <- R6::R6Class(
  portable = FALSE,
  class = FALSE,
  inherit = Policy,
  public = list(
    class_name = "UnpooledUCBPolicy",
    n_subjects = NULL,
    initialize = function(n_subjects = 1, name = "UnpooledUCBPolicy") {
      self$n_subjects <- n_subjects
    },
    set_parameters = function(context_params) {
      self$theta <- list("n_total" = rep(0,self$n_subjects),
                         n = rep(list(as.list(rep(0, context_params$k))), self$n_subjects),
                         p = rep(list(as.list(rep(0, context_params$k))), self$n_subjects))
    },
    get_action = function(t, context) {
      user <- context$user_context
      if (self$theta$n_total[[user]] < context$k) {
        for (arm in 1:context$k) {
          if (self$theta$n[[user]][[arm]] == 0) {
            action$choice <- arm
            return(action)
          }
        }
      }
      expected_rewards <- rep(0.0, context$k)
      for (arm in 1:context$k) {
        expected_rewards[arm] <- self$theta$p[[user]][[arm]] + sqrt((2*self$theta$p[[user]][[arm]]*log(1/sqrt(1/self$theta$n_total[[user]])))/self$theta$n[[user]][[arm]]) + (2*log(1/sqrt(1/self$theta$n_total[[user]])))
      }
      action$choice  <- which_max_tied(expected_rewards)
      action
    },
    set_reward = function(t, context, action, reward) {
      arm    <- action$choice
      user   <- context$user_context
      reward <- reward$reward
      inc(self$theta$n_total[[user]])  <- 1
      inc(self$theta$n[[user]][[arm]]) <- 1
      inc(self$theta$p[[user]][[arm]]) <- (reward - self$theta$p[[user]][[arm]]) / self$theta$n[[user]][[arm]]
      self$theta
    }
  )
)

PooledUCBPolicy <- R6::R6Class(
  portable = FALSE,
  class = FALSE,
  inherit = Policy,
  public = list(
    class_name = "PooledUCBPolicy",
    n_subjects = NULL,
    initialize = function() {
      super$initialize()
    },
    set_parameters = function(context_params) {
      self$theta_to_arms <- list("P" = 0, "N" = 0)
      self$theta <- list("N_total" = 0)
    },
    get_action = function(t, context) {
      if (self$theta$N_total < context$k) {
        for (arm in 1:context$k) {
          if (self$theta$N[[arm]] == 0) {
            action$choice <- arm
            return(action)
          }
        }
      }
      expected_rewards <- rep(0.0, context$k)
      for (arm in 1:context$k) {
        expected_rewards[[arm]] <- self$theta$P[[arm]] + sqrt((2*self$theta$P[[arm]]*log(1/sqrt(1/self$theta$N_total)))/self$theta$N[[arm]]) + (2*log(1/sqrt(1/self$theta$N_total)))
      }
      action$choice  <- which_max_tied(expected_rewards)
      action
    },
    set_reward = function(t, context, action, reward) {
      arm <- action$choice
      reward <- reward$reward
      inc(self$theta$N_total)  <- 1
      inc(self$theta$N[[arm]])   <- 1
      inc(self$theta$P[[arm]])   <- (reward - self$theta$P[[arm]]) / self$theta$N[[arm]]
      self$theta
    }
  )
)


PartiallyPooledUCBPolicy <- R6::R6Class(
  portable = FALSE,
  class = FALSE,
  inherit = Policy,
  public = list(
    class_name = "PartiallyPooledUCBPolicy",
    n_subjects = NULL,
    initialize = function(n_subjects = 1) {
      super$initialize()
      self$n_subjects <- n_subjects
    },
    set_parameters = function(context_params) {
      self$theta <- list("N_total" = 0, "n_total" = rep(0,self$n_subjects),
                         n = rep(list(as.list(rep(0, context_params$k))), self$n_subjects),
                         p = rep(list(as.list(rep(0, context_params$k))), self$n_subjects))
      self$theta_to_arms <- list("P" = 0, "N" = 0)
    },
    get_action = function(t, context) {
      user <- context$user_context
      if (self$theta$n_total[[user]] < context$k) {
        for (arm in 1:context$k) {
          if (self$theta$n[[user]][[arm]] == 0) {
            action$choice <- arm
            return(action)
          }
        }
      }
      expected_rewards <- rep(0.0, context$k)
      #beta = 1/sqrt(self$theta$n_total[[user]])
      beta = 2/(2+self$theta$n_total[[user]])
      for (arm in 1:context$k) {
        p_mean <- self$theta$P[[arm]] + sqrt((2*self$theta$P[[arm]]*log(1/sqrt(1/self$theta$N_total)))/self$theta$N[[arm]]) + (2*log(1/sqrt(1/self$theta$N_total)))
        p_choice <- self$theta$p[[user]][[arm]] + sqrt((2*self$theta$p[[user]][[arm]]*log(1/sqrt(1/self$theta$n_total[[user]])))/self$theta$n[[user]][[arm]]) + (2*log(1/sqrt(1/self$theta$n_total[[user]])))
        p_hat = (beta * p_mean + (1-beta) * p_choice)
        expected_rewards[arm] = p_hat
      }
      action$choice  <- which_max_tied(expected_rewards)
      action
    },
    set_reward = function(t, context, action, reward) {
      arm                              <- action$choice
      user                             <- context$user_context
      reward                           <- reward$reward
      inc(self$theta$n_total[[user]])  <- 1
      inc(self$theta$n[[user]][[arm]]) <- 1
      inc(self$theta$p[[user]][[arm]]) <- (reward - self$theta$p[[user]][[arm]]) / self$theta$n[[user]][[arm]]
      inc(self$theta$N_total)          <- 1
      inc(self$theta$N[[arm]])         <- 1
      inc(self$theta$P[[arm]])         <- (reward - self$theta$P[[arm]]) / self$theta$N[[arm]]
      self$theta
    }
  )
)


PartiallyBBPooledUCBPolicy <- R6::R6Class(
  portable = FALSE,
  class = FALSE,
  inherit = Policy,
  public = list(
    class_name = "PartiallyPooledBBUCBPolicy",
    n_subjects = NULL,
    initialize = function(n_subjects = 1) {
      super$initialize()
      self$n_subjects <- n_subjects
    },
    set_parameters = function(context_params) {
      self$theta <- list("N_total" = 0, "n_total" = rep(0,self$n_subjects),
                         n  = rep(list(as.list(rep(1, context_params$k))),self$n_subjects),
                         p  = rep(list(as.list(rep(0, context_params$k))),self$n_subjects),
                         ss = rep(list(as.list(rep(0, context_params$k))),self$n_subjects),
                         c  = rep(list(as.list(rep(0, context_params$k))),self$n_subjects))
      self$theta_to_arms <- list("N"  = 1, "P" = 0, "SS" = 0, "C" = 0)
    },
    get_action = function(t, context) {
      user <- context$user_context
      if (self$theta$N_total < context$k) {
        for (arm in 1:context$k) {
          if (self$theta$N[[arm]]-1 == 0) {
            action$choice <- arm
            return(action)
          }
        }
      }
      if (self$theta$n_total[[user]] < context$k) {
        for (arm in 1:context$k) {
          if (self$theta$n[[user]][[arm]]-1 == 0) {
            action$choice <- arm
            return(action)
          }
        }
      }

      expected_rewards <- rep(0.0, context$k)

      ns     <- self$n_subjects
      P      <- unlist(self$theta$P)
      N      <- unlist(self$theta$N)
      C      <- unlist(self$theta$C)
      SS     <- unlist(self$theta$SS)
      p      <- unlist(self$theta$p[[user]])
      n      <- unlist(self$theta$n[[user]])

      sigmasq <- (ns * SS) / ((ns - 1) * N)
      M <- pmax((P * (1 - P) - sigmasq) / (sigmasq - ((P * (1 - P)) / ns) * C), 0)
      betas <- M / (M + n)
      betas[(M==0 | sigmasq==0)] <- 1

      p_mean   <- P + sqrt((2*P*log(1/sqrt(1/self$theta$N_total)))/N) + (2*log(1/sqrt(1/self$theta$N_total)))
      p_choice <- p + sqrt((2*p*log(1/sqrt(1/self$theta$n_total[[user]])))/n) + (2*log(1/sqrt(1/self$theta$n_total[[user]])))

      p_hat  <- betas * P + (1 - betas) * p

      action$choice <- which_max_tied(p_hat)

      action
    },
    set_reward = function(t, context, action, reward) {
      arm    <- action$choice
      user   <- context$user_context
      reward <- reward$reward

      inc(self$theta$N_total)           <- 1
      inc(self$theta$n_total[[user]])   <- 1

      inc(self$theta$n[[user]][[arm]])  <- 1
      inc(self$theta$p[[user]][[arm]])  <- (reward - self$theta$p[[user]][[arm]]) / self$theta$n[[user]][[arm]]

      inc(self$theta$N[[arm]])          <- 1
      inc(self$theta$P[[arm]])          <- (reward - self$theta$P[[arm]]) / self$theta$N[[arm]]
      self$theta$SS[[arm]]              <- self$theta$SS[[arm]]  - self$theta$ss[[user]][[arm]] + self$theta$n[[user]][[arm]] *
        (self$theta$p[[user]][[arm]] - self$theta$P[[arm]]) ^ 2
      self$theta$C[[arm]]               <- self$theta$C[[arm]] - self$theta$c[[user]][[arm]] +
        1/self$theta$n[[user]][[arm]]
      self$theta$c[[user]][[arm]]       <- 1 / self$theta$n[[user]][[arm]]
      self$theta$ss[[user]][[arm]]      <- self$theta$n[[user]][[arm]] * (self$theta$p[[user]][[arm]] - self$theta$P[[arm]]) ^ 2
      self$theta
    }
  )
)

Nth-iteration-labs/contextual documentation built on July 28, 2020, 1:13 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

Nth-iteration-labs/contextual
Simulation and Analysis of Contextual Multi-Armed Bandit Policies

demo/replication_kruijswijk_2018/policy_pooled_ucb.R
In Nth-iteration-labs/contextual: Simulation and Analysis of Contextual Multi-Armed Bandit Policies

R Package Documentation

Browse R Packages

We want your feedback!

Nth-iteration-labs/contextual Simulation and Analysis of Contextual Multi-Armed Bandit Policies

demo/replication_kruijswijk_2018/policy_pooled_ucb.R In Nth-iteration-labs/contextual: Simulation and Analysis of Contextual Multi-Armed Bandit Policies

R Package Documentation

Browse R Packages

We want your feedback!

Nth-iteration-labs/contextual
Simulation and Analysis of Contextual Multi-Armed Bandit Policies

demo/replication_kruijswijk_2018/policy_pooled_ucb.R
In Nth-iteration-labs/contextual: Simulation and Analysis of Contextual Multi-Armed Bandit Policies