#' Cross-validation for fwelnet
#'
#' Does \code{k}-fold cross-validation for \code{fwelnet}.
#'
#' This function runs \code{fwelnet nfolds+1} times: the first to get the
#' \code{lambda} sequence, and the remaining \code{nfolds} times to compute the
#' fit with each of the folds omitted. The error is accumulated, and the mean
#' error and standard deviation over the folds is computed. Note that
#' \code{cv.pcLasso} does NOT search for values of \code{alpha}. A specific
#' value of \code{alpha} should be supplied.
#'
#' @param x \code{x} matrix as in \code{fwelnet}.
#' @param y \code{y} matrix as in \code{fwelnet}.
#' @param z \code{z} matrix as in \code{fwelnet}.
#' @param family Response type. Either \code{"gaussian"} (default) for linear
#' regression or \code{"binomial"} for logistic regression.
#' @param lambda A user supplied \code{lambda} sequence. Typical usage is to
#' have the program compute its own \code{lambda} sequence; supplying a value of
#' lambda overrides this.
#' @param type.measure Loss to use for cross-validation. Currently five options,
#' not all available for all models. The default is type.measure="deviance",
#' which uses squared-error for gaussian models (a.k.a type.measure="mse" there)
#' and deviance for logistic regression. type.measure="class" applies to binomial
#' logistic regression only, and gives misclassification error. type.measure="auc"
#' is for two-class logistic regression only, and gives area under the ROC curve.
#' type.measure="mse" or type.measure="mae" (mean absolute error) can be used by
#' all models.
#' @param nfolds Number of folds for CV (default is 10). Although \code{nfolds}
#' can be as large as the sample size (leave-one-out CV), it is not recommended
#' for large datasets. Smallest value allowable is \code{nfolds = 3}.
#' @param foldid An optional vector of values between 1 and \code{nfolds}
#' identifying what fold each observation is in. If supplied, \code{nfolds} can
#' be missing.
#' @param keep If \code{keep = TRUE}, a prevalidated array is returned
#' containing fitted values for each observation at each value of lambda. This
#' means these fits are computed with this observation and the rest of its fold
#' omitted. Default is \code{FALSE}.
#' @param verbose Print information as model is being fit? Default is FALSE.
#' @param ... Other arguments that can be passed to \code{fwelnet}.
#'
#' @return An object of class \code{"cv.fwelnet"}, which is a list with the
#' ingredients of the cross-validation fit.
#' \item{glmfit}{A fitted \code{fwelnet} object for the full data.}
#' \item{lambda}{The values of \code{lambda} used in the fits.}
#' \item{nzero}{The number of non-zero coefficients in the model \code{glmfit}.}
#' \item{fit.preval}{If \code{keep=TRUE}, this is the array of prevalidated
#' fits.}
#' \item{cvm}{The mean cross-validated error: a vector of length
#' \code{length(lambda)}.}
#' \item{cvsd}{Estimate of standard error of \code{cvm}.}
#' \item{cvlo}{Lower curve = \code{cvm - cvsd}.}
#' \item{cvup}{Upper curve = \code{cvm + cvsd}.}
#' \item{lambda.min}{The value of \code{lambda} that gives minimum
#' \code{cvm}.}
#' \item{lambda.1se}{The largest value of \code{lambda} such that the CV
#' error is within one standard error of the minimum.}
#' \item{foldid}{If \code{keep=TRUE}, the fold assignments used.}
#' \item{name}{Name of error measurement used for CV.}
#' \item{call}{The call that produced this object.}
#'
#' @examples
#' set.seed(1)
#' n <- 100; p <- 20
#' x <- matrix(rnorm(n * p), n, p)
#' beta <- matrix(c(rep(2, 5), rep(0, 15)), ncol = 1)
#' y <- x %*% beta + rnorm(n)
#' z <- cbind(1, abs(beta) + rnorm(p))
#'
#' cvfit1 <- cv.fwelnet(x, y, z)
#'
#' # change no. of CV folds
#' cvfit2 <- cv.fwelnet(x, y, z, nfolds = 5)
#' # specify which observations are in each fold
#' foldid <- sample(rep(seq(5), length = length(y)))
#' cvfit3 <- cv.fwelnet(x, y, z, foldid = foldid)
#' # keep=TRUE to have pre-validated fits and foldid returned
#' cvfit4 <- cv.fwelnet(x, y, z, keep = TRUE)
#'
#' @importFrom stats predict
#' @export
cv.fwelnet <- function(x, y, z, family = c("gaussian", "binomial"), lambda = NULL,
type.measure = c("mse", "deviance", "class", "auc", "mae"),
nfolds = 10, foldid = NULL, keep = FALSE, verbose = FALSE,
...) {
this.call <- match.call()
n <- nrow(x); p <- ncol(x); K <- ncol(z)
y <- as.vector(y)
family <- match.arg(family)
# get type.measure
if (missing(type.measure)) {
type.measure <- ifelse(family == "gaussian", "mse", "deviance")
} else {
type.measure <- match.arg(type.measure)
}
# if fold IDs not given, randomly create them
# if given, count the number of folds
if (is.null(foldid)) {
foldid <- sample(rep(seq(nfolds), length = n))
} else {
nfolds <- max(foldid)
}
if (nfolds < 3) {
stop("nfolds must be bigger than 3; nfolds=10 recommended")
}
# get fwelnet fit for all of the data
fit0 <- fwelnet(x, y, z, lambda = lambda, family = family, ...)
if (verbose) {
cat("Initial fit done", fill = TRUE)
}
nz <- colSums(abs(fit0$beta) > 0)
# fit fwelnet on folds
fits <- vector("list", nfolds)
for (ii in 1:nfolds) {
if (verbose) {
cat(c("Fold = ", ii), fill = TRUE)
}
oo <- foldid == ii
xc <- x[!oo, , drop = FALSE]
yy <- y[!oo]
fits[[ii]] <- fwelnet(xc, yy, z, lambda = fit0$lambda, family = family,
verbose = verbose, ...)
}
# get predictions
predmat <- matrix(NA, n, length(fit0$lambda))
for (ii in 1:nfolds) {
oo <- foldid == ii
out <- predict(fits[[ii]], x[oo, , drop = F])
predmat[oo, 1:ncol(out)] <- out
}
# compute CV stuff (machinery borrowed from glmnet)
weights <- rep(1, n); grouped = TRUE # for compatibility w glmnet
if (family == "gaussian") {
cvstuff <- cv.elnet(predmat, y, type.measure, weights, foldid, grouped)
name <- ifelse(type.measure == "mae", "Mean Absolute Error",
"Mean-Squared Error")
} else { # family == "binomial"
cvstuff <- cv.lognet(predmat, y, type.measure, weights, foldid, grouped)
name <- switch(type.measure,
deviance = "Binomial Deviance",
class = "Misclassification Error",
auc = "AUC")
}
cvout <- cvstats(cvstuff, foldid, nfolds, lambda, nz, cvstuff$grouped)
cvm <- cvout$cvm
cvsd <- cvout$cvsd
cvup <- cvout$cvup
cvlo <- cvout$cvlo
predmat.preval <- NULL
foldid_copy <- NULL
if (keep) {
predmat.preval <- predmat
foldid_copy <- foldid
}
# get best and 1se lambda
if (type.measure == "auc") {
imin <- which.max(cvm)
lambda.min <- fit0$lambda[imin]
imin.1se <- which(cvm > cvm[imin] - cvsd[imin])[1]
lambda.1se <- fit0$lambda[imin.1se]
} else {
imin <- which.min(cvm)
lambda.min <- fit0$lambda[imin]
imin.1se <- which(cvm < cvm[imin] + cvsd[imin])[1]
lambda.1se <- fit0$lambda[imin.1se]
}
out <- list(glmfit=fit0, lambda=fit0$lambda, nzero=fit0$nzero,
fit.preval=predmat.preval, cvm=cvm, cvsd=cvsd, cvlo=cvlo,
cvup=cvup, lambda.min=lambda.min, lambda.1se=lambda.1se,
foldid=foldid_copy, name=name, call=this.call)
class(out) <- "cv.fwelnet"
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.