# R/cv.sail.R

#' Cross-validation for sail
#'
#' @description Does k-fold cross-validation for sail and determines the optimal
#'   tuning parameter \eqn{\lambda}.
#' @inheritParams sail
#' @param ... other arguments that can be passed to \code{\link{sail}}
#' @param lambda Optional user-supplied lambda sequence; default is NULL, and
#'   \code{\link{sail}} chooses its own sequence
#' @param type.measure loss to use for cross-validation. Currently only three
#'   options are implemented. The default is \code{type.measure="deviance"},
#'   which for gaussian models uses the squared error (and is therefore
#'   equivalent to \code{type.measure="mse"}). \code{type.measure="mae"} (mean
#'   absolute error) can also be used; it measures the absolute deviation of the
#'   fitted mean from the response (\eqn{|y-\hat{y}|}).
#' @param nfolds number of folds. Although \code{nfolds} can be as large as the
#'   sample size (leave-one-out CV), it is not recommended for large datasets.
#'   Smallest value allowable is \code{nfolds=3}. Default: 10
#' @param foldid an optional vector of values between 1 and \code{nfolds}
#'   identifying what fold each observation is in. If supplied, \code{nfolds}
#'   can be missing. Often used when wanting to tune the second tuning parameter
#'   (\eqn{\alpha}) as well (see details).
#' @param grouped This is an experimental argument, with default \code{TRUE},
#'   and can be ignored by most users. This refers to computing \code{nfolds}
#'   separate statistics, and then using their mean and estimated standard error
#'   to describe the CV curve. If \code{grouped=FALSE}, an error matrix is built
#'   up at the observation level from the predictions from the \code{nfold}
#'   fits, and then summarized (does not apply to \code{type.measure="auc"}).
#'   Default: TRUE.
#' @param keep If \code{keep=TRUE}, a \emph{prevalidated} array is returned
#'   containing fitted values for each observation and each value of
#'   \code{lambda}. This means these fits are computed with this observation and
#'   the rest of its fold omitted. The \code{foldid} vector is also returned.
#'   Default: FALSE
#' @param parallel If \code{TRUE}, use parallel \code{foreach} to fit each fold.
#'   A parallel backend must be registered beforehand, e.g. using the
#'   \code{\link[doParallel]{registerDoParallel}} function from the \code{doParallel} package. See
#'   the example below for details. Default: FALSE
#' @return an object of class \code{"cv.sail"} is returned, which is a list with
#'   the ingredients of the cross-validation fit. \describe{ \item{lambda}{the
#'   values of \code{lambda} used in the fits, restricted to those for which the
#'   algorithm converged.} \item{cvm}{The mean
#'   cross-validated error - a vector of length \code{length(lambda)}.}
#'   \item{cvsd}{estimate of standard error of \code{cvm}.} \item{cvup}{upper
#'   curve = \code{cvm+cvsd}.} \item{cvlo}{lower curve = \code{cvm-cvsd}.}
#'   \item{nzero}{number of non-zero coefficients at each \code{lambda}. This is
#'   the sum of the total non-zero main effects and interactions. Note that when
#'   \code{expand=TRUE}, we only count a variable once in the calculation of
#'   \code{nzero}, i.e., if a variable is expanded to three columns, then this
#'   is only counted once even though all three coefficients are estimated to be
#'   non-zero.} \item{name}{a text string indicating type of measure (for
#'   plotting purposes).} \item{sail.fit}{a fitted \code{sail} object for the full
#'   data.} \item{lambda.min}{value of \code{lambda} that gives minimum
#'   \code{cvm}.} \item{lambda.1se}{largest value of \code{lambda} such that
#'   error is within 1 standard error of the minimum.} \item{fit.preval}{if
#'   \code{keep=TRUE}, this is the array of prevalidated fits. Some entries can
#'   be \code{NA}, if that and subsequent values of \code{lambda} are not
#'   reached for that fold} \item{foldid}{if \code{keep=TRUE}, the fold
#'   assignments used}}
#' @details The function runs \code{\link{sail}} \code{nfolds}+1 times; the
#'   first to get the \code{lambda} sequence, and then the remainder to compute
#'   the fit with each of the folds omitted. Note that a new lambda sequence is
#'   computed for each of the folds and then we use the \code{predict} method to
#'   get the solution path at each value of the original lambda sequence. The
#'   error is accumulated, and the average error and standard deviation over the
#'   folds are computed. Note that \code{cv.sail} does NOT search for values of
#'   \code{alpha}. A specific value should be supplied, otherwise
#'   \code{alpha=0.5} is used by default. If users would like to cross-validate
#'   \code{alpha} as well, they should call \code{cv.sail} with a pre-computed
#'   vector \code{foldid}, and then use this same fold vector in separate calls
#'   to \code{cv.sail} with different values of \code{alpha} (see the
#'   examples). Note also that the
#'   results of \code{cv.sail} are random, since the folds are selected at
#'   random. Users can reduce this randomness by running \code{cv.sail} many
#'   times, and averaging the error curves.
#' @note The skeleton of this function and the documentation were taken straight
#'   from the \code{glmnet} package. See references for details.
#' @references Jerome Friedman, Trevor Hastie, Robert Tibshirani (2010).
#'   Regularization Paths for Generalized Linear Models via Coordinate Descent.
#'   Journal of Statistical Software, 33(1), 1-22.
#'   \url{http://www.jstatsoft.org/v33/i01/}.
#' @references Bhatnagar SR, Yang Y, Greenwood CMT. Sparse additive interaction
#'   models with the strong heredity property (2018+). Preprint.
#' @seealso \code{\link[splines]{bs}} \code{\link{sail}}
#' @examples
#' f.basis <- function(i) splines::bs(i, degree = 3)
#' data("sailsim")
#' # Parallel
#' library(doParallel)
#' cl <- makeCluster(2)
#' registerDoParallel(cl)
#' cvfit <- cv.sail(x = sailsim$x, y = sailsim$y, e = sailsim$e,
#'                  parallel = TRUE, nlambda = 10,
#'                  maxit = 25, basis = f.basis,
#'                  nfolds = 3, dfmax = 5)
#' stopCluster(cl)
#' # plot cross validated curve
#' plot(cvfit)
#' # solution at lambda.min
#' coef(cvfit, s = "lambda.min")
#' # solution at lambda.1se
#' coef(cvfit, s = "lambda.1se")
#' # non-zero coefficients at lambda.min
#' predict(cvfit, s = "lambda.min", type = "nonzero")
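#'
#' # Cross-validating alpha as well (illustrative sketch, not run by default):
#' # as described in the details, compute a fold assignment once and reuse it
#' # in separate calls to cv.sail with different values of alpha (passed on to
#' # sail); the small nlambda/maxit/dfmax values are only to keep this fast.
#' \dontrun{
#' foldid <- sample(rep(seq(3), length = length(sailsim$y)))
#' cv25 <- cv.sail(x = sailsim$x, y = sailsim$y, e = sailsim$e,
#'                 alpha = 0.25, foldid = foldid, nlambda = 10,
#'                 maxit = 25, basis = f.basis, dfmax = 5)
#' cv75 <- cv.sail(x = sailsim$x, y = sailsim$y, e = sailsim$e,
#'                 alpha = 0.75, foldid = foldid, nlambda = 10,
#'                 maxit = 25, basis = f.basis, dfmax = 5)
#' # compare the minimum cross-validated error for each alpha
#' min(cv25$cvm); min(cv75$cvm)
#' }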
#'
#'
#' @rdname cv.sail
#' @export
cv.sail <- function(x, y, e, ...,
                    weights,
                    lambda = NULL,
                    type.measure = c("mse", "deviance", "class", "auc", "mae"),
                    nfolds = 10, foldid, grouped = TRUE, keep = FALSE, parallel = FALSE) {
  if (parallel && !requireNamespace("foreach", quietly = TRUE)) {
    stop("Package \"foreach\" needed for this function to work in parallel. Please install it.",
      call. = FALSE
    )
  }

  if (missing(type.measure)) type.measure <- "default" else type.measure <- match.arg(type.measure)
  if (!is.null(lambda) && length(lambda) < 2) {
    stop("Need more than one value of lambda for cv.sail")
  }
  N <- nrow(x)
  if (missing(weights)) {
    weights <- rep(1, N)
  } else {
    weights <- as.double(weights)
  }
  y <- drop(y)
  sail.call <- match.call(expand.dots = TRUE)
  which <- match(c(
    "type.measure", "nfolds", "foldid", "grouped",
    "keep"
  ), names(sail.call), F)
  if (any(which)) {
    sail.call <- sail.call[-which]
  }
  sail.call[[1]] <- as.name("sail")
  sail.object <- sail::sail(
    x = x, y = y, e = e, ...,
    weights = weights, lambda = lambda
  )

  sail.object$call <- sail.call

  ### Next line is commented out so each call generates its own lambda sequence
  # lambda <- sail.object$lambda

  # nz = sapply(predict(sail.object, type = "nonzero"), length)
  nz <- sapply(sail.object$active, length)
  # if (missing(foldid)) foldid <- sample(rep(seq(nfolds), length = N)) else nfolds <- max(foldid)
  if (missing(foldid)) foldid <- createfolds(y = y, k = nfolds) else nfolds <- max(foldid)
  if (nfolds < 3) {
    stop("nfolds must be bigger than 3; nfolds=10 recommended")
  }
  outlist <- as.list(seq(nfolds))
  if (parallel) {
    outlist <- foreach::foreach(i = seq(nfolds), .packages = c("sail")) %dopar% {
      which <- foldid == i

      if (length(dim(y)) > 1) {
        y_sub <- y[!which, ]
      } else {
        y_sub <- y[!which]
      }

      if (length(dim(e)) > 1) {
        e_sub <- e[!which, ]
      } else {
        e_sub <- e[!which]
      }

      sail::sail(
        x = x[!which, , drop = FALSE], y = y_sub, e = e_sub, ...,
        lambda = lambda, weights = weights[!which]
      )
    }
  } else {
    for (i in seq(nfolds)) {
      which <- foldid == i

      if (is.matrix(y)) {
        y_sub <- y[!which, ]
      } else {
        y_sub <- y[!which]
      }

      if (length(dim(e)) > 1) {
        e_sub <- e[!which, ]
      } else {
        e_sub <- e[!which]
      }

      outlist[[i]] <- sail::sail(
        x = x[!which, , drop = FALSE], y = y_sub, e = e_sub, ...,
        lambda = lambda, weights = weights[!which]
      )
    }
  }

  fun <- paste("cv", class(sail.object)[[1]], sep = ".")
  lambda <- sail.object$lambda
  cvstuff <- do.call(fun, list(
    outlist, lambda, x, y, e, weights,
    foldid, type.measure, grouped, keep
  ))
  cvm <- cvstuff$cvm
  cvsd <- cvstuff$cvsd
  nas <- is.na(cvsd)
  if (any(nas)) {
    lambda <- lambda[!nas]
    cvm <- cvm[!nas]
    cvsd <- cvsd[!nas]
    nz <- nz[!nas]
  }
  cvname <- cvstuff$name
  out <- list(
    lambda = lambda, cvm = cvm, cvsd = cvsd, cvup = cvm +
      cvsd, cvlo = cvm - cvsd, nzero = nz, name = cvname,
    sail.fit = sail.object
  )
  if (keep) {
    out <- c(out, list(fit.preval = cvstuff$fit.preval, foldid = foldid))
  }
  lamin <- if (cvname == "AUC") {
    getmin(lambda, -cvm, cvsd)
  } else {
    getmin(lambda, cvm, cvsd)
  }
  obj <- c(out, as.list(lamin))
  class(obj) <- "cv.sail"
  obj
}


#' Create CV Folds
#'
#' @description \code{createfolds} splits the data into \code{k} groups. Taken
#'   from the \code{caret} package (see references for details)
#' @param y vector of response
#' @param k integer for the number of folds.
#' @param list logical - should the results be in a list (TRUE) or a matrix
#' @param returnTrain a logical. When true, the values returned are the sample
#'   positions corresponding to the data used during training. This argument
#'   only works in conjunction with \code{list = TRUE}
#' @return A vector of CV fold ID's for each observation in \code{y}
#' @details For numeric y, the sample is split into groups based on percentiles
#'   and sampling is done within these subgroups.
#' @references \url{https://topepo.github.io/caret/data-splitting.html}
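#' @examples
#' # Minimal usage sketch for this internal helper (illustration only): a
#' # numeric response is binned by quantiles and folds are sampled within each
#' # bin, giving fold sizes that are balanced across the outcome distribution.
#' \dontrun{
#' set.seed(123)
#' fid <- createfolds(y = rnorm(100), k = 5)
#' table(fid)
#' }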
createfolds <- function(y, k = 10, list = FALSE, returnTrain = FALSE) {
  if (class(y)[1] == "Surv") {
    y <- y[, "time"]
  }
  if (is.numeric(y)) {
    cuts <- floor(length(y) / k)
    if (cuts < 2) {
      cuts <- 2
    }
    if (cuts > 5) {
      cuts <- 5
    }
    breaks <- unique(stats::quantile(y, probs = seq(0, 1, length = cuts)))
    y <- cut(y, breaks, include.lowest = TRUE)
  }
  if (k < length(y)) {
    y <- factor(as.character(y))
    numInClass <- table(y)
    foldVector <- vector(mode = "integer", length(y))
    for (i in 1:length(numInClass)) {
      min_reps <- numInClass[i] %/% k
      if (min_reps > 0) {
        spares <- numInClass[i] %% k
        seqVector <- rep(1:k, min_reps)
        if (spares > 0) {
          seqVector <- c(seqVector, sample(1:k, spares))
        }
        foldVector[which(y == names(numInClass)[i])] <- sample(seqVector)
      }
      else {
        foldVector[which(y == names(numInClass)[i])] <- sample(1:k,
          size = numInClass[i]
        )
      }
    }
  }
  else {
    foldVector <- seq(along = y)
  }
  if (list) {
    out <- split(seq(along = y), foldVector)
    names(out) <- paste("Fold", gsub(" ", "0", format(seq(along = out))),
      sep = ""
    )
    if (returnTrain) {
      out <- lapply(out, function(data, y) y[-data], y = seq(along = y))
    }
  }
  else {
    out <- foldVector
  }
  out
}


#' Compute cross validation error
#'
#' @description functions used to calculate cross validation error and used by
#'   the \code{\link{cv.sail}} function
#'
#' @param outlist list of cross validated fitted models. List is of length equal
#'   to \code{nfolds} argument in \code{\link{cv.sail}} function
#' @param foldid numeric vector indicating which fold each observation belongs
#'   to
#' @param mat matrix of predictions
#' @param nlams number of lambdas fit
#' @param cvm mean cv error
#' @param cvsd sd of cv error
#' @param s numeric value of lambda
#' @inheritParams sail
#' @inheritParams cv.sail
#' @importFrom stats predict
#' @rdname cv.lspath
#' @seealso \code{\link{cv.sail}}
#' @details The output of the \code{cv.lspath} function only returns values for
#'   those tuning parameters that converged. \code{cvcompute, getmin,
#'   lambda.interp} are taken verbatim from the \code{glmnet} package
#' @references Jerome Friedman, Trevor Hastie, Robert Tibshirani (2010).
#'   Regularization Paths for Generalized Linear Models via Coordinate Descent.
#'   Journal of Statistical Software, 33(1), 1-22.
#'   \url{http://www.jstatsoft.org/v33/i01/}.
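#' @examples
#' # Toy illustration (hypothetical numbers, not package output) of the
#' # minimum-CV and one-standard-error rules applied by the internal getmin()
#' # helper:
#' \dontrun{
#' lam <- c(1, 0.5, 0.25, 0.1)
#' cvm <- c(4, 2, 1.8, 2.5)
#' cvsd <- rep(0.3, 4)
#' getmin(lam, cvm, cvsd) # lambda.min = 0.25, lambda.1se = 0.5
#' }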
cv.lspath <- function(outlist, lambda, x, y, e, weights,
                      foldid, type.measure, grouped, keep = FALSE) {
  typenames <- c(
    deviance = "Mean-Squared Error", mse = "Mean-Squared Error",
    mae = "Mean Absolute Error"
  )
  if (type.measure == "default") {
    type.measure <- "mse"
  }
  if (!match(type.measure, c("mse", "mae", "deviance"), FALSE)) {
    warning("Only 'mse', 'deviance' or 'mae'  available for Gaussian models; 'mse' used")
    type.measure <- "mse"
  }

  mlami <- max(sapply(outlist, function(obj) min(obj$lambda)))
  which_lam <- lambda >= mlami
  predmat <- matrix(NA, length(y), length(lambda))
  nfolds <- max(foldid)
  nlams <- double(nfolds)
  for (i in seq(nfolds)) {
    which <- foldid == i
    fitobj <- outlist[[i]]
    # fitobj$offset = FALSE
    preds <- predict(fitobj, newx = x[which, , drop = FALSE], newe = e[which], s = lambda[which_lam])
    nlami <- sum(which_lam)
    predmat[which, seq(nlami)] <- preds
    nlams[i] <- nlami
  }
  N <- length(y) - apply(is.na(predmat), 2, sum)
  cvraw <- switch(type.measure, mse = (y - predmat)^2, deviance = (y - predmat)^2, mae = abs(y - predmat))
  if ((length(y) / nfolds < 3) && grouped) {
    warning("Option grouped=FALSE enforced in cv.sail, since < 3 observations per fold",
      call. = FALSE
    )
    grouped <- FALSE
  }
  if (grouped) {
    cvob <- cvcompute(cvraw, weights, foldid, nlams)
    cvraw <- cvob$cvraw
    weights <- cvob$weights
    N <- cvob$N
  }
  cvm <- apply(cvraw, 2, stats::weighted.mean, w = weights, na.rm = TRUE)
  cvsd <- sqrt(apply(scale(cvraw, cvm, FALSE)^2, 2, stats::weighted.mean,
    w = weights, na.rm = TRUE
  ) / (N - 1))
  out <- list(cvm = cvm, cvsd = cvsd, name = typenames[type.measure])
  if (keep) {
    out$fit.preval <- predmat
  }
  out
}

#' @describeIn cv.lspath Computations for crossvalidation error
cvcompute <- function(mat, weights, foldid, nlams) {
  ### Computes the weighted mean and SD within folds, and hence the se of the mean
  wisum <- tapply(weights, foldid, sum)
  nfolds <- max(foldid)
  outmat <- matrix(NA, nfolds, ncol(mat))
  good <- matrix(0, nfolds, ncol(mat))
  mat[is.infinite(mat)] <- NA # just in case some infinities crept in
  for (i in seq(nfolds)) {
    mati <- mat[foldid == i, , drop = FALSE]
    wi <- weights[foldid == i]
    outmat[i, ] <- apply(mati, 2, stats::weighted.mean, w = wi, na.rm = TRUE)
    good[i, seq(nlams[i])] <- 1
  }
  N <- apply(good, 2, sum)
  list(cvraw = outmat, weights = wisum, N = N)
}
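
# Illustrative sketch only (not called by the package): cvcompute() reduces a
# per-observation error matrix (observations x lambda values) to one weighted
# mean per fold, which cv.lspath() then summarizes into cvm and cvsd.
if (FALSE) {
  errs <- matrix(rnorm(30)^2, nrow = 10, ncol = 3) # 10 obs, 3 lambda values
  fid <- rep(1:2, each = 5) # two folds of five observations each
  cvcompute(mat = errs, weights = rep(1, 10), foldid = fid, nlams = c(3, 3))
}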

#' @describeIn cv.lspath get lambda.min and lambda.1se
getmin <- function(lambda, cvm, cvsd) {
  cvmin <- min(cvm, na.rm = TRUE)
  idmin <- cvm <= cvmin
  lambda.min <- max(lambda[idmin], na.rm = TRUE)
  idmin <- match(lambda.min, lambda)
  semin <- (cvm + cvsd)[idmin]
  idmin <- cvm <= semin
  lambda.1se <- max(lambda[idmin], na.rm = TRUE)
  list(lambda.min = lambda.min, lambda.1se = lambda.1se)
}

#' @describeIn cv.lspath Interpolation function.
#' @importFrom stats approx
lambda.interp <- function(lambda, s) {
  ### lambda is the index sequence that is produced by the model
  ### s is the new vector at which evaluations are required.
  ### the value is a vector of left and right indices, and a vector of fractions.
  ### the new values are interpolated between the two using the fraction
  ### Note: lambda decreases. you take:
  ### sfrac*left + (1-sfrac)*right

  if (length(lambda) == 1) { # degenerate case of only one lambda
    nums <- length(s)
    left <- rep(1, nums)
    right <- left
    sfrac <- rep(1, nums)
  }
  else {
    s[s > max(lambda)] <- max(lambda)
    s[s < min(lambda)] <- min(lambda)
    k <- length(lambda)
    sfrac <- (lambda[1] - s) / (lambda[1] - lambda[k])
    lambda <- (lambda[1] - lambda) / (lambda[1] - lambda[k])
    coord <- approx(lambda, seq(lambda), sfrac)$y
    left <- floor(coord)
    right <- ceiling(coord)
    sfrac <- (sfrac - lambda[right]) / (lambda[left] - lambda[right])
    sfrac[left == right] <- 1
    sfrac[abs(lambda[left] - lambda[right]) < .Machine$double.eps] <- 1
  }
  list(left = left, right = right, frac = sfrac)
}
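
# Illustrative sketch only (not called by the package): how the left/right
# indices and fraction returned by lambda.interp() would be combined to
# interpolate a coefficient path at a new value s; `beta` is a hypothetical
# p x nlambda coefficient matrix.
if (FALSE) {
  lam <- lambda.interp(lambda = c(1, 0.5, 0.1), s = 0.3)
  beta <- matrix(rnorm(3 * 3), nrow = 3, ncol = 3)
  beta_s <- beta[, lam$left] * lam$frac + beta[, lam$right] * (1 - lam$frac)
}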