demo/replication_li_2010/demo_yahoo_classes/yahoo_policy_epsilon_greedy_seg.R
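
# YahooEpsilonGreedySegPolicy: the epsilon-greedy "seg" (per user segment)
# baseline from the replication of Li et al. (2010), "A Contextual-Bandit
# Approach to Personalized News Article Recommendation". For every arm the
# policy keeps a count and a running mean reward per user cluster; with
# probability 1 - epsilon it plays the arm with the highest mean for the
# current user's cluster, otherwise it chooses an arm uniformly at random.
# The theta, theta_to_arms and action fields used below are inherited from
# the contextual package's Policy superclass.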

YahooEpsilonGreedySegPolicy          <- R6::R6Class(
  portable = FALSE,
  class = FALSE,
  inherit = Policy,
  public = list(
    epsilon = NULL,    # probability of choosing a random arm (exploration rate)
    cluster = NULL,    # cluster (segment) of the current user, set in get_action()
    class_name = "YahooEpsilonGreedySegPolicy",
    initialize = function(epsilon = 0.1) {
      super$initialize()
      self$epsilon                <- epsilon
    },
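    # theta_to_arms is a per-arm parameter template that the Policy superclass
    # replicates into self$theta for every arm: a count 'n' and a running mean
    # reward 'mean', each with one entry per user cluster (5 clusters).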
    set_parameters = function(context_params) {
      self$theta_to_arms          <- list('n' = rep(0,5), 'mean' = rep(0,5))
    },
    get_action = function(t, context) {
      local_arms                  <- context$arms
      # determine the user's cluster (segment): the user feature on which the
      # user scores highest, ignoring the last user feature; computed on every
      # step so that set_reward() updates the current user's cluster even when
      # the action below is exploratory
      self$cluster                <- which.max(head(context$X[context$unique,1],-1))
      if (runif(1) > self$epsilon) {
        # exploit with probability 1 - epsilon: play the arm with the highest
        # estimated mean reward for this cluster
        expected_rewards          <- rep(0.0, length(local_arms))
        for (arm in seq_along(local_arms)) {
          expected_rewards[arm]   <- self$theta$mean[[local_arms[arm]]][self$cluster]
        }
        action$choice             <- local_arms[which_max_tied(expected_rewards)]
      } else {
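        # explore with probability epsilon: pick one of the currently available
        # arms uniformly at random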
        action$choice             <- sample(local_arms, 1)
      }
      action
    },
    set_reward = function(t, context, action, reward) {
      arm                                       <- action$choice
      reward                                    <- reward$reward
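      # incremental update of the per-cluster running mean for the chosen arm:
      # new_mean = old_mean + (reward - old_mean) / n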
      self$theta$n[[arm]][self$cluster]         <- self$theta$n[[arm]][self$cluster] + 1
      self$theta$mean[[arm]][self$cluster]      <- self$theta$mean[[arm]][self$cluster] +
                                                   (reward - self$theta$mean[[arm]][self$cluster]) /
                                                   self$theta$n[[arm]][self$cluster]
      self$theta
    }
  )
)
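
# ----------------------------------------------------------------------------
# Illustration only (wrapped in if (FALSE) so it never runs when this file is
# sourced): a minimal sketch of how the policy is constructed, plus a base-R
# demonstration of the running-mean update that set_reward() performs. In the
# demo itself the policy is run through contextual's Agent/Simulator against
# the Yahoo! data bandit defined elsewhere in this demo; that setup is not
# reproduced here.
# ----------------------------------------------------------------------------
if (FALSE) {

  # construct the policy with the default exploration rate
  policy <- YahooEpsilonGreedySegPolicy$new(epsilon = 0.1)

  # the running mean kept per arm and per cluster in set_reward(), shown here
  # for a single arm with 5 user clusters and binary click rewards
  n        <- rep(0, 5)
  ctr_mean <- rep(0, 5)
  update   <- function(cluster, reward) {
    n[cluster]        <<- n[cluster] + 1
    ctr_mean[cluster] <<- ctr_mean[cluster] + (reward - ctr_mean[cluster]) / n[cluster]
  }
  for (r in c(1, 0, 0, 1)) update(cluster = 2, reward = r)
  ctr_mean[2]  # 0.5, the same as mean(c(1, 0, 0, 1))
}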