# cvreg: Cross Validation and Robust Estimation Utilities

# Documented in cv_lqlasso

#' Cross Validate Power Exponential Likelihood LASSO (Lq norm LASSO)
#'
#' Tunes an Lq-norm LASSO fitted with \code{flare::slim(method = "lq")} by
#' passing a custom model specification to caret's \code{train}. Predictors are
#' centered and scaled before fitting.
#'
#' @details
#' The tuning grid crosses \code{tunlen} equally spaced values of the constant
#' \code{C} in \code{[1, max.c]} with \code{q} in \code{c(1, 1.4, 1.6, 2)}.
#' The penalty used for each grid row is
#' \code{lambda = C * f(q) * base.lambda}, where
#' \code{base.lambda = s * sqrt(log(d) / n)}, \code{s} is the MAD of the
#' residuals of a preliminary linear fit, \code{d} is the number of predictor
#' columns (taken as at least 2 so the penalty is never zero) and \code{n} is
#' the number of observations. \code{f(q)} is a q-dependent multiplier that
#' equals 1 at \code{q = 1}.
#'
#' @param formula a model formula
#' @param data a training data set
#' @param cv.method preferably one of "boot632" (the default), "cv", or "repeatedcv".
#' @param nfolds the number of bootstrap or cross-validation folds to use. defaults to 5.
#' ignored (set to NULL) when \code{folds} is supplied.
#' @param nrep the number of repetitions for cv.method = "repeatedcv". defaults to 4.
#' @param folds a vector of pre-set cross-validation or bootstrap folds from caret::createResample or
#' caret::createFolds.
#' @param tunlen the number of values for the unknown hyperparameter to test. defaults to 10.
#' @param max.c the largest value of the constant for calculating lambda. defaults to 8, but
#' may be adjusted. for example, if the error metric becomes constant after a certain
#' value of C, it may be advisable to lower max.c to a smaller value to obtain
#' a more fine-grained grid over the plausible values.
#' @param crit the criterion by which to evaluate the model performance. must be one of "MAE" (the default)
#' or "MSE".
#'
#' @return
#' a train object. its \code{results} data frame has an additional
#' \code{lambda} column, placed after the tuning parameters, holding the
#' penalty actually used for each row of the grid.
#' @export
#'
#'
cv_lqlasso <- function(formula, data, cv.method = "boot632", nfolds = 5, nrep = 4, folds = NULL, tunlen = 10, max.c = 8, crit = "MAE") {

  # Pre-set folds take precedence over a fold count.
  if (!is.null(folds)) {
    nfolds <- NULL
  }

  # q-dependent multiplier applied to the base penalty. 0.41422 is
  # approximately sqrt(2) - 1, so the rounded term is 1 and the multiplier is
  # exactly 1 at q = 1. Defined once so that the fit function and the reported
  # `lambda` column cannot drift apart.
  lambda_scale <- function(q) {
    (1 + round(sqrt(1 + q) * sqrt(q) - 0.41422, 3)^q) / 2
  }

  param_names <- c("C", "q", "base.lambda")

  # Custom model specification in the list format consumed by train().
  LQLasso <- list(type = "Regression",
                  library = "flare",
                  loop = NULL)

  LQLasso$parameters <- data.frame(parameter = param_names,
                                   class = rep("numeric", 3),
                                   label = param_names)

  # Robust noise scale: MAD of the residuals from a preliminary linear fit.
  # lmSolve() is defined elsewhere in the package; its coefficients are
  # assumed to line up with the columns of model.matrix(formula, data).
  lm.betas <- lmSolve(formula, data)
  model.mat <- model.matrix(formula, data)
  lm.pred <- as.vector(model.mat %*% as.vector(lm.betas))
  lm.res <- as.vector(model.frame(formula, data)[, 1]) - lm.pred
  LQLasso$noiseSD <- mad(lm.res)

  LQLasso$max.c <- max.c

  # Builds the tuning grid. `search` is accepted for compatibility with the
  # caret grid-function signature, but a regular grid is always returned.
  LQLassoGrid <- function(x, y, max.c = LQLasso$max.c, noise.sd = LQLasso$noiseSD, len = NULL, search = "grid") {

    # Number of predictors, not observations. Floored at 2 so that log(d) > 0
    # and the base penalty is strictly positive with a single predictor.
    n_pred <- max(ncol(x), 2L)
    n_obs <- length(y)
    lambda0 <- noise.sd * sqrt(log(n_pred) / n_obs)

    grid <- expand.grid(C = seq(1, max.c, length.out = len),
                        q = c(1, 1.4, 1.6, 2))
    grid$base.lambda <- lambda0
    grid
  }

  LQLasso$grid <- LQLassoGrid

  # Fits a single Lq LASSO at the penalty implied by one row of the grid.
  LQLassoFit <- function(x, y, param, ...) {
    flare::slim(X = x, Y = y, method = "lq", q = param$q,
                lambda = param$C * lambda_scale(param$q) * param$base.lambda,
                verbose = FALSE, res.sd = FALSE)
  }

  LQLasso$fit <- LQLassoFit
  # NOTE(review): `prob` is kept pointing at the fit function as in the
  # original specification; presumably it is never called for a regression
  # model -- confirm before relying on it.
  LQLasso$prob <- LQLassoFit

  # Linear predictor: prepend a column of ones for the intercept and multiply
  # by the stacked (intercept, beta) coefficient vector.
  LQLassoPred <- function(modelFit, newdata, preProc = NULL, submodels = NULL) {
    betas <- as.vector(c(modelFit$intercept, modelFit$beta))
    newx <- as.matrix(cbind(intercept = rep(1, nrow(newdata)), newdata))
    as.vector(newx %*% betas)
  }

  LQLasso$predict <- LQLassoPred

  # Resampling performance: MSE/MAE for numeric outcomes, Accuracy/Kappa
  # otherwise. Pairs with a missing prediction are dropped first.
  postRobResamp <- function(pred, obs) {

    keep <- !is.na(pred)
    pred <- pred[keep]
    obs <- obs[keep]
    nothing_left <- length(obs) + length(pred) == 0

    if (!is.factor(obs) && is.numeric(obs)) {
      if (nothing_left) {
        # Two metrics are reported, so return exactly two NAs.
        out <- rep(NA, 2)
      } else {
        err <- pred - obs
        out <- c(mean(err^2), mean(abs(err)))
      }
      names(out) <- c("MSE", "MAE")
    } else {
      if (nothing_left) {
        out <- rep(NA, 2)
      } else {
        pred <- factor(pred, levels = levels(obs))
        if (!requireNamespace("e1071", quietly = TRUE)) {
          stop("package e1071 is required", call. = FALSE)
        }
        out <- unlist(e1071::classAgreement(table(obs, pred)))[c("diag", "kappa")]
      }
      names(out) <- c("Accuracy", "Kappa")
    }

    out[is.nan(out)] <- NA
    out
  }

  # Summary function handed to trainControl(); delegates to postRobResamp().
  basicSummary <- function(data, lev = NULL, model = NULL) {
    if (is.character(data$obs)) {
      data$obs <- factor(data$obs, levels = lev)
    }
    postRobResamp(data[, "pred"], data[, "obs"])
  }

  # Shared resampling settings; `repeats` is only supplied for repeated CV so
  # that trainControl() keeps its own default for every other method.
  ctrl_args <- list(method = cv.method,
                    number = nfolds,
                    index = folds,
                    savePredictions = "all",
                    summaryFunction = basicSummary,
                    search = "grid")
  if (cv.method == "repeatedcv") {
    ctrl_args$repeats <- nrep
  }
  fitControl <- do.call(trainControl, ctrl_args)

  fitted.models <- train(formula, data,
                         method = LQLasso,
                         metric = crit,
                         tuneLength = tunlen,
                         maximize = FALSE,
                         preProcess = c("center", "scale"),
                         trControl = fitControl)

  # Report the penalty actually used for each grid row, placed right after
  # the tuning parameters. Columns are selected by name rather than position.
  res <- fitted.models$results
  lambda <- res$C * lambda_scale(res$q) * res$base.lambda
  other_cols <- setdiff(names(res), param_names)
  fitted.models$results <- cbind.data.frame(res[param_names],
                                            lambda = lambda,
                                            res[other_cols])

  fitted.models

}