R/cv_gpls.R

Defines functions cv_gpls

Documented in cv_gpls

#' Cross Validation for Generalized Projection to Latent Structures Regression
#'
#' @param formula a model formula
#' @param data a training data set
#' @param family the glm family. One of "gaussian", "student", "poisson", "negbinom", "binomial", "multinom", "gamma", "invgauss"
#' @param link the link function. See details for more information.
#' @param cv.method preferably one of "boot632" (the default), "cv", or "repeatedcv".
#' @param nfolds the number of bootstrap or cross-validation folds to use. defaults to 5.
#' @param nrep the number of repetitions for cv.method = "repeatedcv". defaults to 4.
#' @param crit the criterion by which to evaluate the model performance. See details for more information.
#' @param select the selection rule to use. Should be one of "best" or "oneSE" (the default).
#' @param folds a vector of pre-set cross-validation or bootstrap folds from caret::createResample or
#' caret::createFolds.
#'
#' @details The available summary statistics for the argument "crit" depend on which likelihood function is chosen for the glm family.
#' \cr \cr
#' If the outcome is multinomial, it should be one of "Kappa" (the default), "Accuracy", "Mean_F1",
#' "Mean_Sensitivity", "Mean_Specificity", "Mean_Pos_Pred_Value", "Mean_Neg_Pred_Value",  "Mean_Precision", "Mean_Recall",
#' or "Mean_Detection_Rate". \cr
#' \cr
#' If the outcome is binomial, it should be one of "Spec" (Specificity, the default) or "Sens". \cr
#' \cr
#' Otherwise, it should be one of "MAE" (the default) or "MSE". \cr \cr
#' \cr
#' #' The following link functions are available for each distribution: \cr
#' \cr
#' Gaussian & Student's T: "identity" \cr
#' Binomial & Multinomial: "logit", "probit", "cauchit", "robit" (Student T with 3 df), and "cloglog" \cr
#' Poisson & Negative Binomial: "log" \cr
#' Gamma: "inverse" (1 / x) \cr
#' Inverse Gaussian: "invsquare" (1/x^2)
#'
#' @return
#' a train object
#' @export
#'
cv_gpls = function(formula, data, family = NULL, link = NULL, cv.method = "boot632", nfolds = 5, nrep = 4, crit = NULL, select = "oneSE", folds = NULL, preproc = c("center" , "scale")){

  if (is.null(family)){
    cat(crayon::bold(crayon::make_style(rgb(.945, .4057, .016))("warning!")), crayon::blue("Please specify a glm family from the following options:\n"), crayon::make_style(rgb(0.008, 0.341, 0.224))("'gaussian', 'gamma', 'invgauss', 'poisson', 'negbinom', 'binomial', 'multinom'"))
  } else {

  if (is.null(crit)){
    if (family == "multinom"){
      crit = "Kappa"
    }
    if (family == "binomial"){
      crit = "Spec"
    }
    else {
      crit = "MAE"
      }
    }
  }

  if (!is.null(link)){
    if (family == "gaussian"){
      if (link != "identity")
        cat("link function not recognized for ", family, "only identity is available!\n")
    }
    else if (family=="binomial" || family == "multinom") {
      tf = !is.na(match(link, c("logit", "probit", "cloglog", "cauchit", "robit")))
      if (isFALSE(tf))
          cat("link function not recognized for ", family, " only logit, probit, cauchit, robit, and cloglog are available!\n")
    }
    else if (family=="poisson") {
      if (link != "log")
        cat("link function not recognized for ", family, " only log is available!\n")
    }
    else if (family=="gamma") {
      if (link != "inverse")
        cat("link function not recognized for ", family, " only inverse is available!\n")
    }
    else if (family=="invgauss") {
      if (link != "invsquare")
        cat("link function not recognized for ", family, " only invsquare is available!\n")
    }
    else if (family=="negbinom") {
      if (link != "log")
        cat("link function not recognized for ", family, " only log is available!\n")
    }
  }


  if (family == "binomial" || family == "multinom"){

    gplsCV <- list(type = "Classification",
                   library = "stats",
                   loop = NULL)

  } else {

    gplsCV <- list(type = "Regression",
                   library = "stats",
                   loop = NULL)
  }

  gplsCV$family <- family
  gplsCV$link <- link
  gplsCV$x.names <- colnames(model.frame(formula, data))[-1]
  gplsCV$y.name <- colnames(model.frame(formula, data))[1]
  gplsCV$formula

  prm <- data.frame(parameter = c("ncomp"),
                    class = rep("numeric", 1),
                    label = c("ncomp"))

  gplsGrid <- function(x, y, len = NULL, search = "grid", family = gplsCV$family) {

    ## use grid search:
    if(search == "grid"){
      search = "grid"
    } else {
      search = "grid"
    }
    if (family == "multinom"){
      k <- length(unique(y)) - 1
      ncomp = seq(1 * k, min(nrow(as.matrix(y)), ncol(x)) * k, length.out = min(nrow(as.matrix(y)), ncol(x)))

    } else {
      ncomp = seq(1, min(nrow(as.matrix(y)), ncol(x)), length.out = min(nrow(as.matrix(y)), ncol(x)))
    }
    grid <- expand.grid(ncomp = ncomp)
    return(grid)
  }

  gplsFit <- function(x, y, param, Family = gplsCV$family, Link = gplsCV$link, ...) {

    dat <- as.data.frame(x)
    dat$.outcome <- y
    model.out <- gpls(.outcome ~ ., data = dat, ncomp = param$ncomp, family = Family, link = Link, shrinkage = T)
    model.out

  }

  predictgplsCV <- function(object, newdata = NULL, type = c("response", "link", "class")){

    if (!inherits(object, "gpls"))
      stop("object not of class gpls")
    type <- match.arg(type)
    family <- object$family
    link <- object$link
    formula <- object$formula
    coefs <- coef(object)
    if (is.null(newdata)){
      x <- object$model.matrix
    } else{
      if (!is.data.frame(newdata)){
        newdata <- as.data.frame(newdata)
      }
      x <- cbind.data.frame(rep(1, nrow(newdata)), newdata)
    }
    eta <- as.vector(coefs %*% t(x))
    preds <- linkinv(eta, family = family, link = link)
    if (type == "class"){
      if (family != "binomial"){
        stop("type 'predict' is only valid for predicting class labels for binomial models.")
      }
      else {
        levs <- object$levs
        classification <- preds > 0.50
        labels <- ifelse(classification, levs[1], levs[2])
        return(labels)
        #return(as.numeric(classification))
      }
    }
    else if (type == "link"){
      return(eta)
    }
    else if (type == "response"){
      return(preds)
    }
  }

  predictgplsCVsoftmax <- function(model, newdata = NULL, type = c("response", "class")) {

    type <- match.arg(type)
    if (is.null(newdata)){
      X <- model$X
    } else{
      X <- as.matrix(newdata)
    }
    beta <- model$coefficients
    if(all(X[,1] == rep(1,nrow(X))))
    {
      eta <- X%*%beta
    } else
    {
      eta <- cbind(rep(1,nrow(X)),X)%*% beta
    }
    p <- exp(eta)
    p.denom <- apply(p, 1, function(x) 1+sum(x))
    probabilities <- p/p.denom
    probabilities <- cbind(1 - rowSums(probabilities), probabilities)
    colnames(probabilities) <- model$outcome.levs
    if (type == "response")
      return(probabilities)
    else
      return(model$outcome.levs[sapply(1:nrow(probabilities), function(x) which.max(probabilities[x, ]))])
  }


  gplsPred <- function(modelFit, newdata, submodels = NULL){
    if (!is.data.frame(newdata))
      newdata <- as.data.frame(newdata)

    if (modelFit$family == "binomial"){
      predictgplsCV(modelFit, newdata, type = "class")
    }
    else if (modelFit$family == "multinom"){
      predictgplsCVsoftmax(modelFit, newdata, type = "class")
    }
    else {
      predictgplsCV(modelFit, newdata, type = "response")
    }
  }

  gplsCV$grid <- gplsGrid
  gplsCV$parameters <- prm
  gplsCV$fit <- gplsFit

  gplsCV$prob <- gplsPred
  gplsCV$predict <- gplsPred

  fitResamp = function(pred, obs) {

        huber.mean = function(y) {
          MASS::hubers(y, initmu = mean(c(mean(y), median(y))), s = sd(y))$mu
        }

        robmse <- huber.mean((pred - obs)^2)
        robmae <- huber.mean(abs(pred - obs))
        fit.measure <- c(robmse, robmae)

      names(fit.measure) <- c("MSE", "MAE")

    if (any(is.nan(fit.measure)))
      fit.measure[is.nan(fit.measure)] <- NA
    return(fit.measure)
  }

  fitSummary = function (data, lev = NULL, model = NULL){
    if (is.character(data$obs))
      data$obs <- factor(data$obs, levels = lev)
    fitResamp(data[, "pred"], data[, "obs"])
  }


  if (family == "multinom"){
    if (cv.method == "repeatedcv") {
      fitControl <- trainControl(method = cv.method,
                                 number = nfolds,
                                 repeats = nrep,
                                 classProbs = TRUE,
                                 summaryFunction = multiClassSummary,
                                 allowParallel = TRUE,
                                 index = folds,
                                 selectionFunction = select,
                                 search = "grid")
  } else {
      fitControl <- trainControl(method = cv.method,
                                number = nfolds,
                                classProbs = TRUE,
                                summaryFunction = multiClassSummary,
                                allowParallel = TRUE,
                                index = folds,
                                selectionFunction = select,
                                search = "grid")
     }
  }
  else if (family == "binomial"){

    if (cv.method == "repeatedcv") {
      fitControl <- trainControl(method = cv.method,
                                 number = nfolds,
                                 repeats = nrep,
                                 classProbs = TRUE,
                                 summaryFunction = twoClassSummary,
                                 allowParallel = TRUE,
                                 index = folds,
                                 selectionFunction = select,
                                 search = "grid")
    } else {
      fitControl <- trainControl(method = cv.method,
                                 number = nfolds,
                                 classProbs = TRUE,
                                 summaryFunction = twoClassSummary,
                                 allowParallel = TRUE,
                                 index = folds,
                                 selectionFunction = select,
                                 search = "grid")
    }
  } else{
    if (cv.method == "repeatedcv") {
      fitControl <- trainControl(method = cv.method,
                                 number = nfolds,
                                 repeats = nrep,
                                 summaryFunction = fitSummary,
                                 allowParallel = TRUE,
                                 index = folds,
                                 selectionFunction = select,
                                 search = "grid")
    } else {
      fitControl <- trainControl(method = cv.method,
                                 number = nfolds,
                                 summaryFunction = fitSummary,
                                 allowParallel = TRUE,
                                 index = folds,
                                 selectionFunction = select,
                                 search = "grid")
      }
  }

  out <- train(formula,
               data,
               method = gplsCV,
               metric = crit,
               preProcess = preproc,
               trControl = fitControl)


  return(out)

}
abnormally-distributed/cvreg documentation built on May 3, 2020, 3:45 p.m.