#' SMG.Lag
#'
#' The OnlineSuperLearner uses various \code{SummaryMeasureGenerator} instances
#' to generate new variables that can be used to represent the relevant history
#' of a variable. The \code{SMG.Lag} class allows a user to specify lagged
#' variables. That is, in the prediction of a variable at time $t$, one can
#' include variables from past measurements, if there is an lagged or cross
#' lagged effect.
#'
#' @docType class
#' @importFrom R6 R6Class
#' @section Methods:
#' \describe{
#' \item{\code{initialize(lags, colnames.to.lag) }}{
#' Initializes a new \code{SMG.Lag} object.
#'
#' @param lags integer the number of lags the \code{SMG.Lag} object should
#' create (if there is a lag_1 variable, this number should be 1).
#'
#' @param colnames.to.lag vector a vector containg the names of the
#' variables to lag.
#' }
#'
#' \item{\code{lagged_colnames(colnames = NULL) }}{
#' This function returns a list of names that represent the lagged names of
#' the \code{colnames} provided to it. It uses the \code{lags} variable as
#' specified in the initialization step. One can choose to provide colnames
#' to this function to get the result for a specific set of colnames. If no
#' colnames are provided, the function uses the \code{colnames.to.lag}
#' names as specified on initialization.
#'
#' @param colnames vector (default = NULL) the vector of names for which to
#' genearte a lagged version.
#'
#' @return vector a vector of colnames with appended '_lag_x' names.
#' }
#'
#' \item{\code{update(data.current) }}{
#' For online learning we need to be able to create new data blocks on the
#' fly (as not all data is available beforehand. This function updates a
#' set of variables to contain the lagged variables for the next block.
#'
#' @param data.current data.table the current data / the last data used for
#' training, from which a new summary measure needs to be generated.
#'
#' @return data.table a new block for the next training iteration.
#' }
#'
#' \item{\code{process(data.current) }}{
#' Converts lagged columns for all provided data. It goes through the data
#' and generates the lagged variables accordingly.
#'
#' @param data.current data.table the currently available data.
#'
#' @return data.table with the new summary measure columns.
#' }
#'
#' \item{\code{minimalObservations}}{
#' Active method. The minimal number of measurements needed for this SMG to
#' be able to generate a new block. This is the number of lags + 1.
#'
#' @return integer the number of lags + 1
#' }
#'
#' \item{\code{exposedVariables}}{
#' Active method. Returns a list of variables returned by this SMG.
#'
#' @return vector a vector of strings conaining all lagged columnames.
#' }
#' }
#' @export
SMG.Lag <- R6Class("SMG.Lag",
inherit = SMG.Base,
public =
list(
initialize = function(lags, colnames.to.lag){
private$lags <- lags
private$lags.vector <- seq(1,lags)
private$colnames <- colnames.to.lag
private$colnames.lagged <- self$lagged_colnames()
},
lagged_colnames = function(colnames = NULL ) {
if (is.null(colnames)) colnames <- private$colnames
## This will create a list like this:
## A_lag_1 A_lag_2 W_lag_1 W_lag_2 Y_lag_1 Y_lag_2
## If we want to have a different ordering, note that the update function also
## needs to be updated!
unlist(lapply(colnames, function(col){
paste(col, 'lag', seq(private$lags), sep = "_")
})
)
},
## An update means getting the next set of variables, based on the
## current ones. This results in a vector with the new values
update = function(data.current) {
if (nrow(data.current) > 1) {
throw('Provided number of rows to lag / update function needs to be one')
}
## The data we have is 1 row, with all lags available. We only focus on the lags
## Retrieve the columns we are interested in
subset_to_lag <- data.current[, c(private$colnames, private$colnames.lagged), with=FALSE]
data.new <- lapply(private$colnames, function(cn) {
## Only use the current columnname + its lagged ones (this should be enough)
used_colnames <- c(cn, self$lagged_colnames(cn))
current <- subset(subset_to_lag, select = used_colnames)
## shift the names (essentially changing x -> x_lag_1, and x_lag_1 to x_lag_2)
current.names <- shift(names(current), type = 'lead')
## Rename the other columns to REMOVE to remove them all
current.names[is.na(current.names)] <- 'REMOVE'
names(current) <- current.names
while('REMOVE' %in% names(current)){
current[,REMOVE:=NULL]
}
current
}) %>% unlist
return(data.new)
},
process = function(data.current) {
self$check_enough_available(data.current)
# TODO: We could probably do this in a smarter way, i.e., creating an initial frame first
# and than prepend new columns, and remove the last column.
data.current[, (private$colnames.lagged) := shift(.SD, private$lags.vector, NA),
.SDcols = private$colnames]
# This will return the lagged dataset, and NOT only the complete cases.
data.current[, private$colnames.lagged, with=FALSE]
}
),
active =
list(
minimalObservations = function() {
private$lags + 1
},
exposedVariables = function() {
private$colnames.lagged
}
),
private =
list(
minimal.measurements = NULL,
lags.vector = NULL,
lags = NULL,
colnames.lagged = NULL,
colnames = NULL
)
)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.