R/bandit_offline_replay_evaluator.R

#' @export
#' @import Formula
OfflineReplayEvaluatorBandit <- R6::R6Class(
  inherit = OfflineBootstrappedReplayBandit,
  class = FALSE,
  public = list(
    class_name = "OfflineReplayEvaluatorBandit",
    initialize   = function(formula,
                            data, k = NULL, d = NULL,
                            unique = NULL, shared = NULL,
                            randomize = TRUE, replacement = FALSE,
                            jitter = FALSE) {

        # Delegates to OfflineBootstrappedReplayBandit with arm_multiply = FALSE,
        # so the logged data is replayed as-is rather than expanded per arm.
        super$initialize(formula,
                         data, k, d,
                         unique, shared = shared,
                         randomize, replacement,
                         jitter, arm_multiply = FALSE)
    }
  )
)

#' Bandit: Offline Replay
#'
#' Bandit used to evaluate policies on offline (logged) data through replay.
#'
#' The key assumption of the method is that the original logging policy chose
#' arms i.i.d. uniformly at random.
#'
#' Take care: if the original logging policy does not change over trials, data may be
#' used more efficiently via propensity scoring (Langford et al., 2008; Strehl et al., 2011)
#' and related techniques like doubly robust estimation (Dudik et al., 2011).
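#'
#' In outline, the replay method of Li et al. (2011) can be sketched as follows. This is
#' schematic pseudocode rather than the literal implementation of this class, and
#' \code{logged_data} and \code{policy} are placeholders:
#'
#' \preformatted{
#'   matched <- 0; total_reward <- 0
#'   for (t in 1:nrow(logged_data)) {
#'     # x: logged context, a: logged arm, r: logged reward at row t
#'     if (policy, given x, chooses a) {        # the event "matches" the policy
#'       matched      <- matched + 1
#'       total_reward <- total_reward + r       # reveal the reward to the policy
#'       # ... and update the policy with (x, a, r)
#'     }                                        # otherwise the event is skipped
#'   }
#'   estimated_policy_value <- total_reward / matched
#' }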
#'
#' @name OfflineReplayEvaluatorBandit
#'
#' @section Usage:
#' \preformatted{
#'   bandit <- OfflineReplayEvaluatorBandit(formula,
#'                                          data, k = NULL, d = NULL,
#'                                          unique = NULL, shared = NULL,
#'                                          randomize = TRUE, replacement = FALSE,
#'                                          jitter = FALSE)
#' }
#'
#' @section Arguments:
#'
#' \describe{
#'   \item{\code{formula}}{
#'     formula (required). Format: \code{y.context ~ z.choice | x1.context + x2.context + ...}
#'     By default, an intercept is added to the context model. Exclude the intercept by adding "0" or "-1" to
#'     the list of contextual features, as in: \code{y.context ~ z.choice | x1.context + x2.context - 1}
#'   }
#'   \item{\code{data}}{
#'     data.table or data.frame; offline data source (required)
#'   }
#'   \item{\code{k}}{
#'     integer; number of arms (optional). Optionally used to reformat the formula-defined x.context vector
#'     as a \code{k x d} matrix. When making use of such matrix-formatted contexts, you need to define custom
#'     intercept(s) when and where needed in the data.table or data.frame.
#'   }
#'   \item{\code{d}}{
#'     integer; number of contextual features (optional). Optionally used to reformat the formula-defined
#'     x.context vector as a \code{k x d} matrix. When making use of such matrix-formatted contexts, you need
#'     to define custom intercept(s) when and where needed in the data.table or data.frame.
#'   }
#'   \item{\code{randomize}}{
#'     logical; randomize rows of data stream per simulation (optional, default: TRUE)
#'   }
#'   \item{\code{replacement}}{
#'     logical; sample with replacement (optional, default: FALSE)
#'   }
#'   \item{\code{jitter}}{
#'     logical; add jitter to contextual features (optional, default: FALSE)
#'   }
#'   \item{\code{unique}}{
#'     integer vector; index of disjoint features (optional)
#'   }
#'   \item{\code{shared}}{
#'     integer vector; index of shared features (optional)
#'   }
#'
#' }
#'
#' @section Methods:
#'
#' \describe{
#'
#'   \item{\code{new(formula, data, k = NULL, d = NULL, unique = NULL, shared = NULL, randomize = TRUE,
#'                   replacement = FALSE, jitter = FALSE)}}{ generates
#'    and initializes a new \code{OfflineReplayEvaluatorBandit} instance. }
#'
#'   \item{\code{get_context(t)}}{
#'      argument:
#'      \itemize{
#'          \item \code{t}: integer, time step \code{t}.
#'      }
#'      returns a named \code{list}
#'      containing the current \code{d x k} dimensional matrix \code{context$X},
#'      the number of arms \code{context$k} and the number of features \code{context$d}.
#'  }
#'
#'   \item{\code{get_reward(t, context, action)}}{
#'      arguments:
#'      \itemize{
#'          \item \code{t}: integer, time step \code{t}.
#'          \item \code{context}: list, containing the current \code{context$X} (d x k context matrix),
#'          \code{context$k} (number of arms) and \code{context$d} (number of context features)
#'          (as set by \code{bandit}).
#'          \item \code{action}:  list, containing \code{action$choice} (as set by \code{policy}).
#'      }
#'      returns a named \code{list} containing \code{reward$reward} and, where computable,
#'         \code{reward$optimal} (used by "oracle" policies and to calculate regret).
#'         See the usage sketch following this list for how these methods are called in sequence.
#'  }
#'
#'   \item{\code{post_initialization()}}{
#'      Shuffles the offline data.table before the start of each individual simulation
#'      when \code{self$randomize} is \code{TRUE} (the default).
#'   }
#' }
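#'
#' As a simplified usage sketch (the actual \code{Simulator} adds further bookkeeping),
#' a policy typically interacts with this bandit per time step as follows:
#'
#' \preformatted{
#'   for (t in 1:horizon) {
#'     context <- bandit$get_context(t)
#'     action  <- policy$get_action(t, context)
#'     reward  <- bandit$get_reward(t, context, action)
#'     # For this offline bandit, reward is NULL when the logged arm at t
#'     # differs from action$choice; such steps are simply discarded.
#'     if (!is.null(reward)) policy$set_reward(t, context, action, reward)
#'   }
#' }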
#'
#' @references
#'
#' Li, Lihong, Chu, Wei, Langford, John, and Wang, Xuanhui. Unbiased offline evaluation of
#' contextual-bandit-based news article recommendation algorithms. In King, Irwin, Nejdl, Wolfgang, and Li,
#' Hang (eds.), Proc. Web Search and Data Mining (WSDM), pp. 297–306. ACM, 2011. ISBN 978-1-4503-0493-1.
#'
#' @seealso
#'
#' Core contextual classes: \code{\link{Bandit}}, \code{\link{Policy}}, \code{\link{Simulator}},
#' \code{\link{Agent}}, \code{\link{History}}, \code{\link{Plot}}
#'
#' Bandit subclass examples: \code{\link{BasicBernoulliBandit}}, \code{\link{ContextualLogitBandit}},
#' \code{\link{OfflineReplayEvaluatorBandit}}
#'
#' Policy subclass examples: \code{\link{EpsilonGreedyPolicy}}, \code{\link{ContextualLinTSPolicy}}
#'
#' @examples
#' \dontrun{
#'
#' library(contextual)
#' library(data.table)
#'
#' url  <- "http://d1ie9wlkzugsxr.cloudfront.net/data_irecsys_CARSKit/Movie_DePaulMovie/ratings.csv"
#' data <- fread(url, stringsAsFactors=TRUE)
#'
#' # Convert data
#'
#' data        <- contextual::one_hot(data, cols = c("Time","Location","Companion"),
#'                                          sparsifyNAs = TRUE)
#' data[, itemid := as.numeric(itemid)]
#' data[, rating := ifelse(rating <= 3, 0, 1)]
#'
#' # Set simulation parameters.
#' simulations <- 10  # each simulation replays a reshuffled copy of the logged data
#' horizon     <- nrow(data)
#'
#' # Initiate the Replay bandit
#' log_S       <- data
#' formula     <- formula("rating ~ itemid | Time_Weekday + Time_Weekend + Location_Cinema +
#'                        Location_Home + Companion_Alone + Companion_Family + Companion_Partner")
#' bandit      <- OfflineReplayEvaluatorBandit$new(formula = formula, data = log_S)
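#' # During replay, only rows where the policy's chosen arm equals the logged
#' # itemid reveal their reward; all other rows are skipped.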
#'
#' # Define agents.
#' agents      <-
#'   list(Agent$new(RandomPolicy$new(), bandit, "Random"),
#'        Agent$new(EpsilonGreedyPolicy$new(0.03), bandit, "EGreedy 0.03"),
#'        Agent$new(ThompsonSamplingPolicy$new(), bandit, "ThompsonSampling"),
#'        Agent$new(LinUCBDisjointOptimizedPolicy$new(0.37), bandit, "LinUCB 0.37"))
#'
#' # Initialize the simulation.
#' simulation  <-
#'   Simulator$new(
#'     agents           = agents,
#'     simulations      = simulations,
#'     horizon          = horizon
#'   )
#'
#' # Run the simulation.
#' # Takes about 5 minutes: the offline bandit loops over
#' # horizon x simulations rows (times the number of agents).
#'
#' sim  <- simulation$run()
#'
#' # plot the results
#' plot(sim, type = "cumulative", regret = FALSE, rate = TRUE,
#'      legend_position = "topleft", ylim=c(0.48,0.87))
#'
#' }
NULL
