cvreg: Cross Validation and Robust Estimation Utilities

Documented in cv_pense_mstep

#' Cross Validate Penalized Elastic Net S-Estimator (PENSEM)
#'
#' @param formula a model formula
#' @param data a training data set
#' @param cv.method preferably one of "boot632" (the default), "cv", or "repeatedcv".
#' @param nfolds the number of bootstrap or cross-validation folds to use. defaults to 5.
#' @param folds a vector of pre-set cross-validation or bootstrap folds from caret::createResample or
#' caret::createFolds.
#' @param nrep the number of repetitions for cv.method = "repeatedcv". defaults to 4.
#' @param tunlen the number of values for the unknown hyperparameter to test. defaults to 10.
#' @param crit the criterion by which to evaluate the model performance. must be one of "MAE" (the default)
#' or "MSE".
#' @param select the selection rule to use. Should be one of "best" or "oneSE" (the default).
#' @param max.c the largest value of the constant for calculating lambda. defaults to 8, but
#' may be adjusted. for example, if the error metric becomes constant after a certain
#' value of C, it may be advisable to lower max.c to a smaller value to obtain
#' a more fine-grained grid over the plausible values.
#'
#' @return
#' a train object
#' @export
#'
cv_pense_mstep = function(formula, data, fit, realpha = FALSE, cv.method = "boot632", nfolds = 5, nrep = 4, folds = NULL, select = "oneSE", tunlen = 10, crit = "MAE", max.c = 2){

  if (!is.null(folds)) {
    nfolds = NULL
  }

  PENSEM <- list(type = "Regression",
                library = "pense",
                loop = NULL)
  PENSEM$max.c <- max.c

  if (!realpha) {

  y <- model.frame(formula, scale(data))[,1]
  x <- model.matrix(formula, scale(data))[,-1]

  penseopts = pense_options(delta = 0.30,
                            maxit = 2500,
                            mscale_maxit = 1000,
                            eps = 1e-04,
                            mscale_eps = 1e-05)

  PENSEM$modelfit <- pense::pense(x = scale(x),
                             y = scale(y),
                             lambda = fit$finalModel$lambda,
                             alpha = fit$finalModel$alpha,
                             options = penseopts,
                             standardize = FALSE)

  PENSEM$parameters <- data.frame(parameter = c("lambda"),
                                 class = rep("numeric", 1),
                                 label = c("lambda"))


  penseGrid <- function(x, y, fit = PENSEM$modelfit, max.c = PENSEM$max.c, len = NULL, search = "grid") {

    ## use grid search:
    if(search == "grid"){
      search = "grid"
    } else {
      search = "grid"
    }


    f = function(alpha){
      sqrt((2 - alpha)^3 / (2 - (1 - alpha)))
    }

    D = nrow(x)
    N = length(y)

    lambda <- as.numeric(fit$scale) * f(fit$alpha) * sqrt(log(D) / N) * seq(0.5, max.c, length.out = len)

    grid <- expand.grid(lambda = lambda)
    out <- grid

    return(out)
  }

  PENSEM$grid <- penseGrid

  penseFit <- function(x, y, fit = PENSEM$modelfit, param, ...) {

    pense::pensem(x = fit,
                  x_train = x,
                  y_train = y,
                  alpha = fit$alpha,
                  lambda = param$lambda,
                  lambda_s = fit$lambda,
                  standardize = FALSE
    )

  }

  PENSEM$fit <- penseFit
  PENSEM$prob <- penseFit

  pensePred <- function(modelFit, newdata, preProc = NULL, submodels = NULL){
    pense:::predict.pense(modelFit, newdata, exact = TRUE)
  }

  PENSEM$predict <- pensePred

  postRobResamp = function(pred, obs) {

    isNA <- is.na(pred)
    pred <- pred[!isNA]
    obs <- obs[!isNA]
    if (!is.factor(obs) && is.numeric(obs)) {
      if (length(obs) + length(pred) == 0) {
        out <- rep(NA, 2)
      }
      else {

        huber.mean <-  function (y) {
          init.robmu = MASS::hubers(y, k = 2.241403, initmu = median(y), s = sd(y))$mu
          MASS::hubers(y, k = 1.281552, initmu = init.robmu)$mu
        }

        robmse <- huber.mean((pred - obs)^2)
        robmae <- huber.mean(abs(pred - obs))
        out <- c(robmse, robmae)
      }
      names(out) <- c("MSE", "MAE")
    }
    else {
      if (length(obs) + length(pred) == 0) {
        out <- rep(NA, 2)
      }
      else {
        pred <- factor(pred, levels = levels(obs))
        requireNamespaceQuietStop("e1071")
        out <- unlist(e1071::classAgreement(table(obs, pred)))[c("diag",
                                                                 "kappa")]
      }
      names(out) <- c("Accuracy", "Kappa")
    }
    if (any(is.nan(out)))
      out[is.nan(out)] <- NA
    out
  }

  robustSummary = function (data, lev = NULL, model = NULL)
  {
    if (is.character(data$obs))
      data$obs <- factor(data$obs, levels = lev)
    postRobResamp(data[, "pred"], data[, "obs"])
  }


  if (cv.method == "repeatedcv") {
    fitControl <- trainControl(method = cv.method,
                               number = nfolds,
                               repeats = nrep,
                               index = folds,
                               selectionFunction = select,
                               savePredictions = "all",
                               summaryFunction = robustSummary,
                               search = "grid")
  } else {

    fitControl <- trainControl(method = cv.method,
                               number = nfolds,
                               index = folds,
                               selectionFunction = select,
                               savePredictions = "all",
                               summaryFunction = robustSummary,
                               search = "grid")
  }


  fitted.models <- train(formula, data,
                         method = PENSEM,
                         metric = crit,
                         tuneLength = tunlen,
                         maximize = FALSE,
                         preProcess = c("center", "scale"),
                         trControl = fitControl)

    return(fitted.models)
  } else if (realpha) {

    y <- model.frame(formula, scale(data))[,1]
    x <- model.matrix(formula, scale(data))[,-1]

    penseopts = pense_options(delta = 0.30,
                              maxit = 2500,
                              mscale_maxit = 1000,
                              eps = 1e-04,
                              mscale_eps = 1e-05)

    PENSEM$modelfit <- pense::pense(x = scale(x),
                                    y = scale(y),
                                    lambda = fit$finalModel$lambda,
                                    alpha = fit$finalModel$alpha,
                                    options = penseopts,
                                    standardize = FALSE)

    PENSEM$parameters <- data.frame(parameter = c("C", "alpha", "base.lambda"),
                                    class = rep("numeric", 3),
                                    label = c("C", "alpha", "base.lambda"))


    penseGrid <- function(x, y, fit = PENSEM$modelfit, max.c = PENSEM$max.c, len = NULL, search = "grid") {

      ## use grid search:

      if(search == "grid"){
        search = "grid"
      } else {
        search = "grid"
      }

      D = nrow(x)
      N = length(y)

      base.lambda <- as.numeric(fit$scale) * sqrt(log(D) / N)

      alphas <- seq(0, 1, length.out = 6)
      C <- seq(0.25, max.c, length.out = len)
      grid <- expand.grid(C = C, alpha = alphas)
      grid <- cbind.data.frame(grid, base.lambda = rep(base.lambda, nrow(grid)))
      out <- grid
      return(out)
    }

    PENSEM$grid <- penseGrid

    penseFit <- function(x, y, fit = PENSEM$modelfit, param, ...) {

      f = function(alpha){
        sqrt((2 - alpha)^3 / (2 - (1 - alpha)))
      }

      pense::pensem(x = fit,
                    x_train = x,
                    y_train = y,
                    alpha = param$alpha,
                    lambda = param$C * f(param$alpha) * param$base.lambda,
                    lambda_s = fit$lambda,
                    standardize = FALSE
      )

    }

    PENSEM$fit <- penseFit
    PENSEM$prob <- penseFit

    pensePred <- function(modelFit, newdata, preProc = NULL, submodels = NULL){
      pense:::predict.pense(modelFit, newdata, exact = TRUE)
    }

    PENSEM$predict <- pensePred

    postRobResamp = function(pred, obs) {

      isNA <- is.na(pred)
      pred <- pred[!isNA]
      obs <- obs[!isNA]
      if (!is.factor(obs) && is.numeric(obs)) {
        if (length(obs) + length(pred) == 0) {
          out <- rep(NA, 2)
        }
        else {

          huber.mean <-  function (y) {
            init.robmu = MASS::hubers(y, k = 2.241403, initmu = median(y), s = sd(y))$mu
            MASS::hubers(y, k = 1.281552, initmu = init.robmu)$mu
          }

          robmse <- huber.mean((pred - obs)^2)
          robmae <- huber.mean(abs(pred - obs))
          out <- c(robmse, robmae)
        }
        names(out) <- c("MSE", "MAE")
      }
      else {
        if (length(obs) + length(pred) == 0) {
          out <- rep(NA, 2)
        }
        else {
          pred <- factor(pred, levels = levels(obs))
          requireNamespaceQuietStop("e1071")
          out <- unlist(e1071::classAgreement(table(obs, pred)))[c("diag",
                                                                   "kappa")]
        }
        names(out) <- c("Accuracy", "Kappa")
      }
      if (any(is.nan(out)))
        out[is.nan(out)] <- NA
      out
    }

    robustSummary = function (data, lev = NULL, model = NULL)
    {
      if (is.character(data$obs))
        data$obs <- factor(data$obs, levels = lev)
      postRobResamp(data[, "pred"], data[, "obs"])
    }


    if (cv.method == "repeatedcv") {
      fitControl <- trainControl(method = cv.method,
                                 number = nfolds,
                                 repeats = nrep,
                                 index = folds,
                                 selectionFunction = select,
                                 savePredictions = "all",
                                 summaryFunction = robustSummary,
                                 search = "grid")
    } else {

      fitControl <- trainControl(method = cv.method,
                                 number = nfolds,
                                 index = folds,
                                 savePredictions = "all",
                                 selectionFunction = select,
                                 summaryFunction = robustSummary,
                                 search = "grid")
    }


    fitted.models <- train(formula, data,
                           method = PENSEM,
                           metric = crit,
                           tuneLength = tunlen,
                           maximize = FALSE,
                           preProcess = c("center", "scale"),
                           trControl = fitControl)

    return(fitted.models)
  }
}