#' @title
#' getLSTMmodel
#' @description
#' Constructs a custom \code{\link{mxLSTM}} model for use in the caret
#' \code{\link[caret]{train}} logic. It behaves slightly different than the usual
#' caret models as retrieved by \code{\link[caret]{getModelInfo}}. See details.
#' @details
#' \strong{model setup} \cr {The model is an LSTM recurrent neural network model with rmsprop optimizer.\cr }
#'
#' \strong{Purpose}\cr
#' The purpose of the custom model is the following:
#' \describe{
#' \item{Allow multiple y}{Allow a regression within caret that predicts multiple y in one model.}
#' \item{Scaling of y}{Allow for scaling of y. Possible options are \code{c('scale', 'center', 'minMax')}}
#' \item{scale x variables again}{If e.g. a PCA is conducted in the preprocessing, the resulting inputs can be scaled again
#' by preProcessing options \code{c('scaleAgain', 'centerAgain')}}
#' }
#'
#' \strong{Usage}\cr
#' The model differs from 'usual' caret models in its usage.
#' Differences when using it in \code{\link[caret]{train}}:
#' \describe{
#' \item{Different formula for model specification}{Usually, the formula would be for example \code{y1+y2+y3 ~ x1+x2+x3}.
#' Caret does not allow this specification, therefore a hack is used:
#' \itemize{
#' \item {construct a column \code{dummy = y1}}
#' \item {Specify the formula as \code{dummy~x1+x2+x3+y1+y2+y3}.}
#' \item {Determine x and y variables with the arguments \code{xVariables = c('x1', 'x2', 'x3')}
#' and \code{yVariables = c('y1','y2','y3')}}}}
#' \item{Different pre-processing arguments}{\itemize{
#' \item{Don't us the caret \code{preProcess} argument. Use \code{preProcessX} and
#' \code{preProcessY} instead}
#' \item{Don't specify \code{preProcOptions} in the \code{\link[caret]{trainControl}} call.
#' Specify them in the call to \code{\link[caret]{train}}.
#' They will be valid for preProcessX only since y pre-processing does not require further arguments.
#' preProcessX can be anything that caret allows plus \code{c('scaleAgain', 'centerAgain')}
#' for scaling as a last preProcessing step. \code{preProcessY} can include
#' \code{c('scale', 'center', 'minMax')}.}}}
#'
#' \item{Additional mandatory arguments to fit function} {For transforming the input to the LSTM, the following additional arguments
#' must be specified to the train function:
#' \itemize{
#' \item{\code{seqLength}: The sequence length (number of rows in input)}
#' \item{\code{seqFreq}: frequency of sequence starts (in rows).
#' If smaller than seqLength, sequences are overlapping}
#' }}
#' }
#'
#' \strong{Additional argumets to fit function:}
#' \itemize{
#' \item{testData:}{ dataset for evaluating performance after each epoch}
#' \item{initialModel:} { can be specified if the aim is to continue
#' training on an existing model. \cr
#' Has to be the output of a call to \code{\link{fitLSTMmodel}}. \cr
#' ATTENTION! Be sure, to specify the same xVariables, yVariables, hidden layers, and
#' preProcessing steps as in the original training.\cr
#' }
#' \item{seed:} {Optional random seed that is set before model training for reproducibility.}
#' }
#'
#' \strong{Additional argumets to predict function:}
#' \itemize{
#' \item{fullSequence:}{ Boolean. If FALSE, only the last output of a sequence is returned.
#' If TRUE, the output for the whole sequence is returned.}
#' }
#'
#'
#' \item{Different prediction function}{For predicting from the model as returned by caret's \code{\link[caret]{train}},
#' you have to use the \code{\link{predictAll}} function. This will call the internal
#' predict function of \code{getLSTMmodel} returning predictions for all y-variables.}
#'
#' \strong{tuning parameters\cr}
#' \itemize{
#' \item{num.epoch:}{ number of training epochs}
#' \item{batch.size:}{ batch size}
#' \item{layer1}{ number of hidden units in LSTM layer 1.}
#' \item{layer2}{ number of hidden units in LSTM layer 2.}
#' \item{layer3}{ number of hidden units in LSTM layer 3.}
#' \item{dropout1}{ dropout probability for LSTM layer 1}
#' \item{dropout2}{ dropout probability for LSTM layer 2}
#' \item{dropout3}{ dropout probability for LSTM layer 3}
#' \item{activation}{ Activation function for the update layers in the LSTM cells. "relu" or "tanh"}
#' \item{shuffle}{ Boolean. Should the training batches be randomly reordered?
#' Each sequence of course stays in its native order}
#' \item{weight_decay:} { defaults to 0}
#' \item{learningrate_momentum:} { gamma1. See API description of mx.opt.rmsprop}
#' \item{momentum:} { gamma2. See API description of mx.opt.rmsprop.}
#' \item{clip_gradients:} { See API description of mx.opt.rmsprop}
#'
#' }
#'
#' \strong{Other specific features}
#' \itemize{
#' \item{\strong{plot training history}} {It is possible to plot the training history
#' of an mxLSTM model with \code{\link{plot_trainHistory}}}
#' \item{\strong{restore checkpoint from specified epoch}}{ It is possible to restore
#' the model weights after a given epoch with the function \code{\link{restoreLstmCheckpoint}}.}
#'
#' }
#' @return A list of functions similar to the output of caret's \code{\link[caret]{getModelInfo}}:\cr
#' @seealso \code{\link{saveCaretLstmModel}}, \code{\link{loadCaretLstmModel}},
#' \code{\link{plot_trainHistory}}, \code{\link{fitLSTMmodel}},
#' \code{\link{predictLSTMmodel}}, \code{\link{getPreProcessor}},
#' \code{\link{predictPreProcessor}}, \code{\link{invertPreProcessor}}
#'
#' @examples
#'\dontrun{
#' library(mxLSTM)
#' library(data.table)
#' library(caret)
#' ###########################################################
#' ## perform a regression with nxLSTM
#' ## on dummy data
#'
#' ## simple data: one numeric output as a function of two numeric inputs.
#' ## including lag values
#' ## with some noise.
#' set.seed(200)
#' mx.set.seed(200)
#' nObs <- 20000
#' dat <- data.table(x = runif(n = nObs, min = 1000, max = 2000),
#' y = runif(n = nObs, min = -10, max = 10))
#' ## create target
#' dat[, target := 0.5 * x + 0.7 * lag(y, 3) - 0.2 * lag(x, 5)]
#' dat[, target2 := 0.1 * x + 0.3 * lag(y, 1) - 0.4 * lag(x, 2)]
#' dat[, target := target + rnorm(nObs, 0, 10)]
#' dat[, target2 := target2 + rnorm(nObs, 0, 10)]
#'
#' ## convert to nxLSTM input
#' dat <- transformLSTMinput(dat = dat, targetColumn = c("target", "target2"), seq.length = 5)
#'
#' ## convert to caret input
#' dat <- lstmInput2caret(dat)
#'
#' ## split into train and test set
#' trainIdx <- sample(seq_len(nrow(dat)), as.integer(nrow(dat) / 3))
#' evalIdx <- sample(seq_len(nrow(dat))[-trainIdx], as.integer(nrow(dat) / 3))
#' testIdx <- seq_len(nrow(dat))[-c(trainIdx, evalIdx)]
#' datTrain <- dat[trainIdx,]
#' datEval <- dat[evalIdx,]
#' datTest <- dat[testIdx,]
#'
#' ## define caret trainControl
#' thisTrainControl <- trainControl(method = "cv",
#' number = 2,
#' verboseIter = TRUE)
#'
#'
#' ## do the training
#'
#' ## grid for defining the parameters of the mxNet model
#' lstmGrid <- expand.grid(layer1 = 64, layer2 = 0, layer3 = 0,
#' weight.decay = 0, dropout1 = 0, dropout2 = 0, dropout3 = 0,
#' learningrate.momentum = 0.95,
#' momentum = 0.1, num.epoch = 50,
#' batch.size = 128, activation = "relu", shuffle = TRUE, stringsAsFactors = FALSE)
#'
#' ## construct formula with all variables on rigth-hand-side
#' form <- formula(paste0("dummy~", paste0(setdiff(names(datTrain), "dummy"), collapse = "+")))
#'
#' caret_lstm <- train(form = form,
#' data = datTrain,
#' testData = datEval,
#' method = getLSTMmodel(), ## get our custom model
#' xVariables = c("x", "y"), ## define predictors
#' yVariables = c("target", "target2"), ## define outcomes
#' preProcessX = c("pca", "scaleAgain", "centerAgain"),
#' preProcessY = c("scale", "center"), ## in case of multiple y, this makes sense imho
#' debugModel = FALSE,
#' trControl = thisTrainControl,
#' tuneGrid = lstmGrid,
#' learning.rate = c("1" = 0.02, "40" = 0.0002), ## adaptive learningrate that changes at epoch 40
#' optimizeFullSequence = FALSE
#' )
#'
#' ## get nice output of training history
#' plot_trainHistory(caret_lstm$finalModel)
#'
#' ## get predictions for the datasets
#' predTrain <- predictAll(caret_lstm, newdata = datTrain, fullSequence = FALSE)
#' predEval <- predictAll(caret_lstm, newdata = datEval, fullSequence = FALSE)
#' predTest <- predictAll(caret_lstm, newdata = datTest, fullSequence = FALSE)
#'
#' ## get nice goodness of fit plots.
#' plot_goodnessOfFit(predicted = predTrain$target, observed = datTrain$target_seq5Y)
#' plot_goodnessOfFit(predicted = predTrain$target2, observed = datTrain$target2_seq5Y)
#' plot_goodnessOfFit(predicted = predTest$target, observed = datTest$target_seq5Y)
#' plot_goodnessOfFit(predicted = predTest$target2, observed = datTest$target2_seq5Y)
#'
#' ## save the model
#' saveCaretLstmModel(caret_lstm, "testModel")
#' }
#' @export getLSTMmodel
getLSTMmodel <- function(){
lstmModel <- list()
lstmModel$label <- "MXnet Long-short-term memory recurrent neural network with rmsProp optimizer"
lstmModel$library <- "mxnet"
lstmModel$type <- "Regression"
lstmModel$tags <- "LSTM recurrent neural network"
## prob is not required for regression
lstmModel <- c(lstmModel, prob = NA)
lstmModel$parameters <-
data.frame(parameter = "layer1", class = "numeric", label = "Hidden nodes in LSTM layer 1")
lstmModel$parameters <-
rbind(lstmModel$parameters,
data.frame(parameter = "layer2", class = "numeric", label = "Hidden nodes in LSTM layer 2"))
lstmModel$parameters <-
rbind(lstmModel$parameters,
data.frame(parameter = "layer3", class = "numeric", label = "Hidden nodes in LSTM layer 3"))
lstmModel$parameters <-
rbind(lstmModel$parameters,
data.frame(parameter = "num.epoch", class = "numeric", label = "number of passes over the full training set"))
lstmModel$parameters <-
rbind(lstmModel$parameters,
data.frame(parameter = "activation", class = "character", label = "activation function for update layers"))
lstmModel$parameters <-
rbind(lstmModel$parameters,
data.frame(parameter = "batch.size", class = "numeric", label = "batch size for training"))
lstmModel$parameters <-
rbind(lstmModel$parameters,
data.frame(parameter = "dropout1", class = "numeric", label = "Dropout probability for inputs to LSTM layer 1"))
lstmModel$parameters <-
rbind(lstmModel$parameters,
data.frame(parameter = "dropout2", class = "numeric", label = "Dropout probability for inputs to LSTM layer 2"))
lstmModel$parameters <-
rbind(lstmModel$parameters,
data.frame(parameter = "dropout3", class = "numeric", label = "Dropout probability for inputs to LSTM layer 3"))
lstmModel$parameters <-
rbind(lstmModel$parameters,
data.frame(parameter = "shuffle", class = "logical", label = "switch for activating reshuffling of training events"))
lstmModel$parameters <-
rbind(lstmModel$parameters,
data.frame(parameter = "weight.decay", class = "numeric", label = "Weight decay"))
lstmModel$parameters <-
rbind(lstmModel$parameters,
data.frame(parameter = "learningrate.momentum", class = "numeric", label = "Factor for adjusting moving average for gradient^2 (gamma1)"))
lstmModel$parameters <-
rbind(lstmModel$parameters,
data.frame(parameter = "momentum", class = "numeric", label = "Momentum (gamma2)"))
lstmModel$grid <-
function(x, y, len = NULL, search = "grid"){
if(search != "grid") stop("Only grid search implemented so far for mxLSTM")
out <- expand.grid(seq.length = 12, layer1 = ((1:len) * 2) - 1, layer2 = 0, layer3 = 0,
num.epoch = 10, activation = "relu", batch.size = 128,
dropout1 = 0, dropout2 = 0, dropout3 = 0, shuffle = TRUE,
weight.decay = 0, learningrate.momentum = 0.9,
momentum = 0.9)
}
## add scaling of predictors and target to the fit function
lstmModel$fit <- function(x, y, wts, param, lev, last, classProbs, xVariables, yVariables,
preProcessX = NULL,
preProcessY = NULL,
preProcOptions = list(thresh = 0.95, ICAcomp = 3, k = 5, freqCut = 95/5,
uniqueCut = 10, cutoff = 0.9),
debugModel = FALSE,
testData = NULL, ## data frame with the same format as x for validation
initialModel = NULL,
seed = NULL,
...){
## set global option for debug so that also the predict function will be in debug mode.
if(debugModel) options(mxLSTM.debug = TRUE)
if(getOption("mxLSTM.debug")){
cat("###############################################################\n")
cat("#################Preprocessing x and y #################\n")
cat("###############################################################\n")
cat("###########input data##############\n")
print(summary(x))
cat("###########input xVariables##############\n")
print(xVariables)
cat("###########input yVariables##############\n")
print(yVariables)
cat("###########preProcessing options##############\n")
print(preProcOptions)
}
## do the preProcessing
## transform the data to pre-processable data.frame
dat <- caret2lstmInput1(x)
if(getOption("mxLSTM.debug")){
cat("#################input after caret2lstmInput1 #################\n")
print(summary(dat))
}
if(any(!xVariables %in% names(dat))) stop("xVariables contains variables that are not in x or on
the right hand side of the formula")
if(any(!yVariables %in% names(dat))) stop("yVariables contains columns that are not in x or that also appear on the right hand side of the formula.
Make sure that the left hand side of the formula
contains a copy of y that is called 'dummy'.
The right hand side of the formula should contain the real y variables")
seq.length <- max(dat$sequenceId)
## split data into id, x and y
datX <- data.frame(dat[, xVariables, with = FALSE])
datY <- data.frame(dat[, yVariables, with = FALSE])
## preProcess x and y
preProcessX <- getPreProcessor(dat = datX,
preProcess = preProcessX,
preProcOptions = preProcOptions)
datX <- predictPreProcessor(preProcess = preProcessX, dat = datX)
preProcessY <- getPreProcessor(dat = datY,
preProcess = preProcessY,
preProcOptions = preProcOptions)
datY <- predictPreProcessor(preProcess = preProcessY, dat = datY)
dat <- transformLSTMinput(cbind(datX, datY), targetColumn = yVariables, seq.length = seq.length)
if(getOption("mxLSTM.debug")){
cat("#################x and y after preProcessing #################\n")
print(summary(dat$x))
print(summary(dat$y))
print("xVariables:")
print(dimnames(dat$x)[[1]])
print("yVariables:")
print(unique(dimnames(dat$y)[[2]]))
}
## same for test data if applicable
if(!is.null(testData)){
testData <- caret2lstmInput1(testData)
testX <- data.frame(testData[, xVariables, with = FALSE])
testY <- data.frame(testData[, yVariables, with = FALSE])
testX <- predictPreProcessor(preProcessX, testX)
testY <- predictPreProcessor(preProcessY, testY)
testData <- transformLSTMinput(cbind(testX, testY), targetColumn = yVariables, seq.length = seq.length)
} else {
testData <- list(x = NULL,
y = NULL)
}
if(getOption("mxLSTM.debug")){
cat("#################input data right before model fit #################\n")
cat("dat$x:\n")
print(dim(dat$x))
print(summary(dat$x))
cat("dat$y:\n")
print(dim(dat$y))
print(summary(dat$y))
}
## fit the model
model <- fitLSTMmodel(x = dat$x, y = dat$y, test.x = testData$x, test.y = testData$y,
initialModel = initialModel, seed = seed, param = param, ...)
## add the preProcessing information, so that it can be used in predict.
model$preProcessX <- preProcessX
model$preProcessY <- preProcessY
model$xVariables <- xVariables
model$yVariables <- yVariables
## if it's the last training, reset debug option
if(last == TRUE) options(mxLSTM.debug = FALSE)
return(model)
}
## add scaling to the predict function
lstmModel$predict <- function(modelFit, newdata, submodels = NULL, allY = FALSE, debugModel = FALSE, fullSequence = FALSE){
## set debugging option.
if(debugModel) options(mxLSTM.debug = TRUE)
## get x and y variable names
xVariables <- modelFit$xVariables
names(xVariables) <- xVariables
yVariables <- modelFit$yVariables
names(yVariables) <- yVariables
if(getOption("mxLSTM.debug")){
cat("###############################################################\n")
cat("#################Starting prediction #################\n")
cat("###############################################################\n")
cat("#################newdata before transformation#################\n")
print(summary(newdata))
cat("#################xVariables#################\n")
print(xVariables)
cat("#################yVariables#################\n")
print(yVariables)
cat("#################Preprocessing#################\n")
cat("x: ")
print(modelFit$preProcessX)
cat("y: ")
print(modelFit$preProcessY)
}
newdata <- as.data.frame(newdata)
## transform newdata
newdata <- as.data.frame(caret2lstmInput1(newdata))
if(getOption("mxLSTM.debug")){
cat("###########newdata after caret2lstmInput1##############\n")
print(summary(newdata))
}
seq.length <- max(newdata$sequenceId)
newdataX <- newdata[, xVariables, drop = FALSE]
newdataY <- newdata[, yVariables, drop = FALSE]
## apply the scaling to xVariables
newdataX <- predictPreProcessor(modelFit$preProcessX, newdataX)
if(getOption("mxLSTM.debug")){
cat("###########newdata after predictPreprocessor##############\n")
print(summary(cbind(newdataX, newdataY)))
print(yVariables)
print(seq.length)
}
newdata <- transformLSTMinput(dat = cbind(newdataX, newdataY),
targetColumn = yVariables,
seq.length = seq.length)
if(getOption("mxLSTM.debug")){
cat("###########newdata right before prediction##############\n")
cat("newdata$x:\n")
print(dim(newdata$x))
print(summary(newdata$x))
}
## now do the real prediction. Since we have one row per event in the input,
## we take the alst sequence element for each event as prediction
out <- predictLSTMmodel(model = modelFit, dat = newdata$x, fullSequence = fullSequence)
if(getOption("mxLSTM.debug")){
cat("###########predicted y before retransformation##############\n")
print(summary(out))
print(modelFit$yVariables)
}
out$rowIndex <- NULL
names(out) <- yVariables
## invert transformation on y.
out <- invertPreProcessor(preProcessor = modelFit$preProcessY, dat = out)
if(!allY) {
## select first yVariable if required
out <- as.numeric(out[,1])
}
if(getOption("mxLSTM.debug")){
cat("###########predicted y after retransformation##############\n")
print(summary(out))
}
## reset global debugging option
options(mxLSTM.debug = FALSE)
return(out)
}
return(lstmModel)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.