R/boost.R

Defines functions update.rtMod.boost as.boost expand.boost predict.boost print.boost boost

Documented in as.boost boost expand.boost predict.boost print.boost update.rtMod.boost

# boost.R
# ::rtemis::
# 2018 Efstathios D. Gennatas egenn.github.io
# TODO: add error vector (1 * n.iter)
# TODO: make learning.rate input into max.iter vector

#' Boost an \pkg{rtemis} learner for regression
#'
#' Train an ensemble using boosting of any learner
#'
#' If \code{learning.rate} is set to 0, a nullmod will be created
#' @inheritParams s.GLM
#' @param x.valid Data.frame; optional: Validation data
#' @param y.valid Float, vector; optional: Validation outcome
#' @param mod Character: Algorithm to train base learners, for options, see \link{modSelect}. Default = "cart"
#' @param resid Float, vector, length = length(y): Residuals to work on. Do not change unless you know
#' what you're doing. Default = NULL, for regular boosting
#' @param boost.obj [Internal use]
#' @param mod.params Named list of arguments for \code{mod}
#' @param case.p Float (0, 1]: Train each iteration using this perceent of cases. Default = 1, i.e. use all cases
#' @param learning.rate Float (0, 1] Learning rate for the additive steps
#' @param init Float: Initial value for prediction. Default = mean(y)
#' @param tolerance Float: If training error <= this value, training stops
#' @param tolerance.valid Float: If validation error <= this value, training stops
#' @param max.iter Integer: Maximum number of iterations (additive steps) to perform. Default = 10
#' @param trace Integer: If > 0, print diagnostic info to console
#' @param print.progress.every Integer: Print progress over this many iterations
#' @param base.verbose Logical: \code{verbose} argument passed to learner
#' @param print.error.plot String or Integer: "final" plots a training and validation (if available) error curve at the
#' end of training. If integer, plot training and validation error curve every this many iterations
#' during training. "none" for no plot.
#' @param print.base.plot Logical: Passed to \code{print.plot} argument of base learner, i.e. if TRUE, print error plot
#' for each base learner
#' @param plot.type Character: "l" or "p". Plot using lines or points. Default = "l"
#' @param prefix Internal
#' @param ... Additional parameters to be passed to learner define by \code{mod}
#' @author Efstathios D. Gennatas
#' @export


boost <- function(x, y = NULL,
                  x.valid = NULL, y.valid = NULL,
                  x.test = NULL, y.test = NULL,
                  mod = 'cart',
                  resid = NULL,
                  boost.obj = NULL,
                  mod.params = list(),
                  case.p = 1,
                  weights = NULL,
                  learning.rate = .1,
                  tolerance = 0,
                  tolerance.valid = .00001,
                  max.iter = 10,
                  init = NULL,
                  x.name = NULL,
                  y.name = NULL,
                  question = NULL,
                  base.verbose = FALSE,
                  verbose = TRUE,
                  trace = 0,
                  print.progress.every = 5,
                  print.error.plot = "final",
                  prefix = NULL,
                  plot.theme = getOption("rt.fit.theme", "lightgrid"),
                  plot.fitted = NULL,
                  plot.predicted = NULL,
                  print.plot = FALSE,
                  print.base.plot = FALSE,
                  plot.type = 'l',
                  outdir = NULL, ...) {

  # [ INTRO ] ====
  if (missing(x)) {
    print(args(boost))
    return(invisible(9))
  }
  if (!is.null(outdir)) outdir <- normalizePath(outdir, mustWork = FALSE)
  logFile <- if (!is.null(outdir)) {
    paste0(outdir, "/", sys.calls()[[1]][[1]], ".", format(Sys.time(), "%Y%m%d.%H%M%S"), ".log")
  } else {
    NULL
  }
  start.time <- intro(verbose = verbose, logFile = logFile)
  mod.name <- "BOOST"

  # [ ARGUMENTS ] ====
  if (is.null(x.name)) x.name <- getName(x, "x")
  if (is.null(y.name)) y.name <- getName(y, "y")
  if (!verbose) print.plot <- FALSE
  verbose <- verbose | !is.null(logFile)
  # if (save.mod & is.null(outdir)) outdir <- paste0("./s.", mod.name)
  if (!is.null(outdir)) outdir <- paste0(normalizePath(outdir, mustWork = FALSE), "/")
  extra.args <- list(...)
  mod.params <- c(mod.params, extra.args)

  # [ DATA ] ====
  dt <- dataPrepare(x, y,
                    x.test, y.test,
                    x.valid, y.valid,
                    verbose = verbose)
  x <- dt$x
  y <- dt$y
  x.test <- dt$x.test
  y.test <- dt$y.test
  x.valid <- dt$x.valid
  y.valid <- dt$y.valid
  xnames <- dt$xnames
  type <- dt$type
  if (verbose) dataSummary(x, y, x.test, y.test, type)
  if (print.plot) {
    if (is.null(plot.fitted)) plot.fitted <- if (is.null(y.test)) TRUE else FALSE
    if (is.null(plot.predicted)) plot.predicted <- if (!is.null(y.test)) TRUE else FALSE
  } else {
    plot.fitted <- plot.predicted <- FALSE
  }
  if (is.null(init)) init <- mean(y)

  # [ BOOST ] ====
  mod <- learner.short <- toupper(mod)
  learner <- modSelect(mod)
  learner.name <- modSelect(mod, desc = TRUE)
  learner.short <- toupper(mod)

  if (verbose) parameterSummary(mod,
                                mod.params,
                                init,
                                max.iter,
                                learning.rate,
                                tolerance,
                                tolerance.valid)
  if (trace > 0) msg("Initial MSE =", mse(y, init))

  # '- New series ====
  # init learning.rate vector
  if (is.null(boost.obj)) {
    mods <- list()
    Fval <- penult.fitted <- init
    .learning.rate <- numeric()

    error <- vector("numeric")
    error[[1]] <- mse(y, Fval) # will be overwritten, needed for while statement

    if (!is.null(x.valid)) {
      error.valid <- vector("numeric")
      Fvalid <- init
    } else {
      error.valid <- predicted.valid <- Fvalid <- NULL
    }
    i <- 1
    if (verbose) msg("[ Boosting ", learner.name, "... ]", sep = "")
  } else {
    .learning.rate <- boost.obj$mod$learning.rate
    # '- Expand series ====
    mods <- boost.obj$mod$mods
    Fval <- penult.fitted <- boost.obj$fitted
    error <- boost.obj$mod$error
    if (!is.null(x.valid)) {
      error.valid <- boost.obj$mod$error.valid
      Fvalid <- boost.obj$mod$predicted.valid
    } else {
      error.valid <- predicted.valid <- Fvalid <- NULL
    }
    max.iter <- max.iter + length(mods)
    i <- length(mods) + 1
    if (trace > 0) msg("i =", i)
    if (verbose) msg("[ Expanding boosted ", learner.name, "... ]", sep = "")
  }

  if (is.null(resid)) resid <- y - Fval

  # Print error during training
  if (max.iter == 1 && is.null(boost.obj)) {
    print.progress.index <- FALSE
    print.error.plot <- "none"
  } else if (print.progress.every < max.iter) {
    print.progress.index <- seq(print.progress.every, max.iter, print.progress.every)
  } else {
    print.progress.index <- max.iter
  }

  # Print error plot
  if (max.iter > 1 && is.numeric(print.error.plot)) {
    if (print.error.plot < max.iter) {
      print.error.plot.index <- seq(print.error.plot, max.iter, print.error.plot)
    } else {
      print.error.plot.index <- max.iter
    }
    print.error.plot <- "iter"
  }

  # '- Iterate learner ====
  while (i <= max.iter) {
    .learning.rate[i] <- learning.rate
    if (trace > 0) msg("learning.rate is", .learning.rate[i])
    if (trace > 0) msg("i =", i)
    if (case.p < 1) {
      n.cases <- NROW(x)
      index <- sample(n.cases, case.p * n.cases)
      x1 <- x[index, , drop = FALSE]
      # y1 <- y[index]
      resid1 <- resid[index]
    } else {
      x1 <- x
      resid1 <- resid
    }

    mod.args <- c(list(x = x1, y = resid1,
                       x.test = x.valid, y.test = y.valid,
                       verbose = base.verbose, print.plot = print.base.plot),
                  mod.params)

    # '- Train base learner ====
    if (.learning.rate[i] != 0) {
      mods[[i]] <- do.call(learner, args = mod.args)
    } else {
      mods[[i]] <- list(fitted = Fval)
      class(mods[[i]]) <- c("nullmod", "list")
    }

    names(mods)[i] <- paste0(learner.short, ".", i)
    if (case.p < 1) {
      mods[[i]]$fitted <- fitted <- predict(mods[[i]], x)
    } else {
      fitted <- mods[[i]]$fitted
    }

    Fval <- Fval + .learning.rate[i] * fitted
    if (i == max.iter - 1) penult.fitted <- Fval
    resid <- y - Fval
    error[[i]] <- mse(y, Fval)
    if (!is.null(x.valid)) {
      predicted.valid <- mods[[i]]$predicted
      Fvalid <- Fvalid + .learning.rate[i] * predicted.valid
      error.valid[[i]] <- mse(y.valid, Fvalid)
      if (verbose && i %in% print.progress.index) if (verbose) msg("Iteration #", i, ": Training MSE = ",
                                                                   ddSci(error[[i]]),
                                                                   "; Validation MSE = ", ddSci(error.valid[[i]]),
                                                                   sep = "")
    } else {
      if (verbose && i %in% print.progress.index) {
        msg("Iteration #", i, ": Training MSE = ", ddSci(error[[i]]), sep = "")
      }
    }
    if (print.error.plot == "iter" && i %in% print.error.plot.index) {
      if (is.null(x.valid)) {
        mplot3.xy(seq(error), error, type = plot.type,
                  xlab = "Iteration", ylab = "MSE",
                  main = paste0(prefix, learner.short, " Boosting"), zero.lines = FALSE,
                  theme = plot.theme)
      } else {
        mplot3.xy(seq(error), list(training = error, validation = error.valid), type = plot.type,
                  xlab = "Iteration", ylab = "MSE", group.adj = .95,
                  main = paste0(prefix, learner.short, " Boosting"), zero.lines = FALSE,
                  theme = plot.theme)
      }

    }
    if (error[[i]] <= tolerance) {
      if (verbose) msg("Reached training error tolerance = ", tolerance, "; breaking", sep = "")
      break
    }
    if (!is.null(x.valid) && error.valid[[i]] <= tolerance.valid) {
      if (verbose) msg("Reached validationn error tolerance, breaking")
      break
    }
    i <- i + 1
  }
  if (verbose && i > max.iter) msg("Reached max iterations")

  if (print.error.plot == "final") {
    if (is.null(x.valid)) {
      mplot3.xy(seq(error), error, type = plot.type,
                xlab = "Iteration", ylab = "MSE",
                main = paste0(prefix, learner.short, " Boosting"), zero.lines = FALSE,
                theme = plot.theme)
    } else {
      mplot3.xy(seq(error), list(training = error, validation = error.valid), type = plot.type,
                xlab = "Iteration", ylab = "MSE", group.adj = .95,
                main = paste0(prefix, learner.short, " Boosting"), zero.lines = FALSE,
                theme = plot.theme)
    }
  }

  # '- boost object ====
  obj <- list(call = NULL,
              mod.name = mod.name,
              learning.rate = .learning.rate,
              init = init,
              penult.fitted = penult.fitted,
              fitted = Fval,
              last.fitted = fitted,
              predicted.valid = Fvalid,
              error = error,
              error.valid = error.valid,
              mods = mods)
  class(obj) <- c("boost", "list")

  # [ FITTED ] ====
  error.train <- modError(y, obj$fitted)
  if (verbose) errorSummary(error.train)

  # [ PREDICTED ] ====
  predicted <- error.test <- NULL
  if (!is.null(x.test)) {
    predicted <- predict(obj, x.test)
    if (!is.null(y.test)) {
      error.test <- modError(y.test, predicted)
      if (verbose) errorSummary(error.test)
    }
  }

  # [ OUTRO ] ====
  parameters <- list(mod = learner.short,
                     mod.params = mod.params,
                     init = init,
                     n.its = length(error),
                     learning.rate = learning.rate,
                     tolerance = tolerance,
                     tolerance.valid = tolerance.valid,
                     max.iter = max.iter)
  extra <- list(error.valid = error.valid)
  rt <- rtModSet(rtclass = "R6",
                 mod = obj,
                 mod.name = mod.name,
                 type = type,
                 parameters = parameters,
                 call = NULL,
                 y.train = y,
                 y.test = y.test,
                 x.name = x.name,
                 y.name = y.name,
                 xnames = xnames,
                 fitted = obj$fitted,
                 se.fit = NULL,
                 error.train = error.train,
                 predicted = predicted,
                 se.prediction = NULL,
                 error.test = error.test,
                 varimp = NULL,
                 question = question,
                 extra = extra)

  if (!is.null(outdir)) {
    filename.train <- paste0(outdir, mod.name, "_", mod, "_Fitted.vs.True.pdf")
    if (!is.null(y.test)) {
      filename.test <- paste0(outdir, mod.name, "_", mod, "_Predicted.vs.True.pdf")
    }
  } else {
    filename.train <- filename.test <- NULL
  }

  if (print.plot | !is.null(outdir)) {
    if (plot.fitted | !is.null(outdir)) plot(rt, estimate = "fitted", theme = plot.theme,
                                             print.plot = plot.fitted, filename = filename.train)
    if (plot.predicted | !is.null(outdir)) plot(rt, estimate = "predicted", theme = plot.theme,
                                                print.plot = plot.predicted, filename = filename.test)
  }
  if (!is.null(outdir)) rtSave(rt, outdir, verbose = verbose)
  outro(start.time, verbose = verbose, sinkOff = ifelse(is.null(logFile), FALSE, TRUE))
  rt

} # rtemis::boost


#' Print method for \link{boost} object
#'
#' @method print boost
#' @author Efstathios D. Gennatas
#' @export

print.boost <- function(x, ...) {

  mod.name <- x$mod$mods[[1]]$mod.name
  n.iter <- length(x$mods)
  cat("\n  A boosted", mod.name, "model with", n.iter, "iterations\n")
  cat("  and a learning rate of", x$learning.rate[1], "\n\n")
  # printls(x$mod[[1]]$parameters) # check printls can handle functions

} # rtemis::print.boost


#' Predict method for \code{boost} object
#'
#' @method predict boost
#' @author Efstathios D. Gennatas
#' @export

predict.boost <- function(object,
                          newdata = NULL,
                          n.feat = NCOL(newdata),
                          n.iter = NULL,
                          as.matrix = FALSE,
                          verbose = FALSE,
                          n.cores = rtCores, ...) {

  if (inherits(object, "rtMod") && inherits(object$mod, "boost")) {
    object <- object$mod
    if (verbose) msg("Found rtemis boost object")
  } else if (inherits(object, "boost")) {
    msg("Found boost object")
  } else {
    stop("Please provide boost object")
  }

  if (is.null(newdata)) return(object$fitted)

  if (!is.null(newdata)) {
    if (!is.data.frame(newdata)) {
      .colnames <- if (!is.null(colnames(newdata))) colnames(newdata) else paste0("V", 1:NCOL(newdata))
      newdata <- as.data.frame(newdata)
      colnames(newdata) <- .colnames
      newdata <- newdata[, seq(n.feat), drop = FALSE]
    }
  }

  if (is.null(n.iter)) n.iter <- length(object$mods)

  if (!as.matrix) {
    predicted <- rowSums(cbind(rep(object$init, NROW(newdata)),
                               pbapply::pbsapply(seq(n.iter), function(i)
                                 predict(object$mods[[i]], newdata) * object$learning.rate[i],
                                 cl = n.cores)))
  } else {
    predicted.n <- pbapply::pbsapply(seq(n.iter), function(i)
      predict(object$mods[[i]], newdata) * object$learning.rate[i],
      cl = n.cores)

    predicted <- matrix(nrow = NROW(newdata), ncol = n.iter)
    predicted[, 1] <- object$init + predicted.n[, 1]
    for (i in seq(n.iter)[-1]) {
      predicted[, i] <- predicted[, i - 1] + predicted.n[, i]
    }
  }

  predicted

} # rtemis::predict.boost


#' Expand boosting series
#'
#' Expand a \link{boost} object by adding more iterations
#'
#' @inheritParams boost
#' @param object \link{boost} object
#' @author Efstathios D. Gennatas
#' @export

expand.boost <- function(object,
                         x, y = NULL,
                         x.valid = NULL, y.valid = NULL,
                         x.test = NULL, y.test = NULL,
                         mod = NULL,
                         resid = NULL,
                         mod.params = NULL,
                         max.iter = 10,
                         learning.rate = NULL,
                         case.p = 1,
                         prefix = NULL,
                         verbose = TRUE,
                         trace = 0,
                         print.error.plot = "final",
                         print.plot = FALSE) {

  if (is.null(y)) y <- object$y.train
  if (is.null(mod)) mod <- object$parameters$mod
  if (is.null(mod.params)) mod.params <- object$parameters$mod.params
  if (is.null(learning.rate)) learning.rate <- object$parameters$learning.rate
  tolerance <- object$parameters$tolerance
  boost(x = x, y = y,
        x.valid = x.valid, y.valid = y.valid,
        x.test = x.test, y.test = y.test,
        resid = resid,
        mod = mod,
        boost.obj = object,
        mod.params = mod.params,
        case.p = case.p,
        learning.rate = learning.rate,
        max.iter = max.iter,
        tolerance = tolerance,
        prefix = prefix,
        verbose = verbose,
        trace = trace,
        print.error.plot = print.error.plot,
        print.plot = print.plot)

} # rtemis::expand.boost


#' \code{as.boost} Place model in \link{boost} structure
#'
#' @param object rtMod model
#' @param x Data.frame, optional: if provided, use to calculate fitted values of new boost object
#' @param y Float, vector: Outcome
#' @param x.valid Data.frame; optional: Validation data
#' @param y.valid Float, vector; optional: Validation outcome
#' @param learning.rate Float: Learning rate for new boost object. Default = 1
#' @param init Float: Initial value for new boost object. Default = 0
#' @param apply.lr Logical: Only considered is \code{x = NULL}. If TRUE, new boost object's fitted values will
#' be object$fitted * learning.rate, otherwise object$fitted
#' @param tolerance Float: error tolerance for new boost object. See \link{boost}
#' @param tolerance.valid Float: error tolerance for validation set. See \link{boost}
#' @author Efstathios D. Gennatas
#' @export
# TODO: add x = NULL, if not NULL calculate fitted values

as.boost <- function(object,
                     x = NULL,
                     y = NULL,
                     x.valid = NULL,
                     y.valid = NULL,
                     learning.rate = 1,
                     init = 0,
                     apply.lr = TRUE,
                     tolerance = .00001,
                     tolerance.valid = .00001) {

  if (!inherits(object, "rtMod")) {
    stop("Please provide rtMod object")
  }
  mods <- list()
  mods[[1]] <- object
  if (!is.null(x)) {
    fitted <- init + predict(object, x)
  } else {
    fitted <- object$fitted
  }
  if (apply.lr) fitted <- fitted * learning.rate
  error <- if (!is.null(y)) mse(y, fitted) else NULL

  if (!is.null(x.valid)) {
    predicted.valid <- init + predict(object, x.valid)
    if (apply.lr) predicted.valid <- predicted.valid * learning.rate
    if (!is.null(y.valid)) {
      error.valid <- mse(y.valid, predicted.valid)
    } else {
      error.valid <- NULL
    }
  } else {
    predicted.valid <- error.valid <- NULL
  }

  obj <- list(call = "external",
              mod.name = "BOOST",
              learning.rate = learning.rate,
              init = init,
              fitted = fitted,
              predicted.valid = predicted.valid,
              error = error,
              error.valid = error.valid,
              mods = mods)
  class(obj) <- c("boost", "list")

  # [ OUTRO ] ====
  parameters <- list(mod = object$mod.name,
                     mod.params = object$parameters,
                     init = init,
                     n.its = 1,
                     learning.rate = learning.rate,
                     tolerance = tolerance,
                     tolerance.valid = tolerance.valid,
                     max.iter = 1)
  extra <- list(error.valid = NULL)
  rt <- rtModSet(rtclass = "R6",
                 mod = obj,
                 mod.name = "BOOST",
                 type = object$type,
                 parameters = parameters,
                 call = NULL,
                 y.train = object$y,
                 y.test = object$y.test,
                 x.name = object$x.name,
                 y.name = object$y.name,
                 xnames = object$xnames,
                 fitted = fitted,
                 se.fit = NULL,
                 error.train = object$error.train,
                 predicted = object$predicted,
                 se.prediction = NULL,
                 error.test = object$error.test,
                 varimp = NULL,
                 question = object$question,
                 extra = extra)
  rt

} # rtemis::as.boost

#' \pkg{rtemis} internals: Update \code{rtMod} \link{boost} object's fitted values in-place
#'
#' Calculate new fitted values for a \link{boost} object.
#' Advanced use only: run with new \code{x} or after updating learning.rate in object
#'
#' @param object \link{boost} object
#' @param x Data frame: Features
#' @param last.step.only Logical: If TRUE, \code{x} must be provided and only the last meta model will be updated
#' using this \code{x}
#' @return \link{boost} object
#' @author Efstathios D. Gennatas
#' @return Nothing; updates \code{object} in-place
#' @export
# TODO: add support for x.test

update.rtMod.boost <- function(object, x = NULL,
                               x.test = NULL,
                               trace = 0,
                               last.step.only = FALSE,
                               n.cores = rtCores, ...) {

  if (trace > 0) fitted.orig <- object$fitted
  # Create n.iter x n.cases fitted values; one row per iteration
  if (is.null(x)) {
    fitted <- t(vapply(object$mod$mods, function(i) i$fitted, vector("numeric", length(object$fitted))))
  } else {
    if (!last.step.only) {
      fitted <- t(as.data.frame(pbapply::pblapply(object$mod$mods, function(i) predict(i, x), cl = n.cores)))
    } else {
      u <- length(object$mod$mods)
      object$mod$mods[[u]]$fitted <- predict(object$mod$mods[[u]], x)
      fitted <- t(vapply(object$mod$mods, function(i) i$fitted, vector("numeric", length(object$fitted))))
    }
  }

  # if (!is.null(x.valid)) {
  #   # currently predicted.valid not saved in object
  #     predicted.valid <- t(as.data.frame(pbapply::pblapply(object$mod$mods,
  #                                                          function(i) predict(i, x.valid), cl = n.cores)))
  # }

  # Multiply each row by its corresponding learning.rate, and sum all n.case-length vectors to get fitted value
  object$fitted <- object$mod$init + matrixStats::colSums2(fitted * object$mod$learning.rate)
  object$error.train <- modError(object$y.train, object$fitted)
  # object$mod$error.valid <- modError(object$mod$y.valid, predicted.valid)
  if (trace > 0) {
    mse.orig <- mse(object$y.train, fitted.orig)
    mse.new <- mse(object$y.train, fitted)
    msg0("old MSE = ", ddSci(mse.orig), "; new MSE = ", ddSci(mse.new))
    # if (mse.new > mse.orig) warning("Whatever you did, it didn't really help:\nnew MSE is higher than original")
  }

} # rtemis::update.rtMod.boost
egenn/rtemis documentation built on March 25, 2020, 3:28 p.m.