#' cv.glmnet.raw_cue function
#'
#' @description This function is an internal function
#' @keywords internal
#' @export
#' @param x x matrix as in glmnet.
#' @param y response y as in glmnet.
#' @param weights Observation weights; defaults to 1 per observation
#' @param offset Offset vector (matrix) as in glmnet
#' @param lambda Optional user-supplied lambda sequence; default is NULL, and glmnet chooses its own sequence
#' @param type.measure loss to use for cross-validation. Currently five options, not all available for all models. The default is type.measure="deviance", which uses squared-error for gaussian models (a.k.a type.measure="mse" there), deviance for logistic and poisson regression, and partial-likelihood for the Cox model. type.measure="class" applies to binomial and multinomial logistic regression only, and gives misclassification error. type.measure="auc" is for two-class logistic regression only, and gives area under the ROC curve. type.measure="mse" or type.measure="mae" (mean absolute error) can be used by all models except the "cox"; they measure the deviation from the fitted mean to the response. type.measure="C" is Harrel's concordance measure, only available for cox models.
#' @param nfolds number of folds - default is 10. Although nfolds can be as large as the sample size (leave-one-out CV), it is not recommended for large datasets. Smallest value allowable is nfolds=3
#' @param foldid an optional vector of values between 1 and nfold identifying what fold each observation is in. If supplied, nfold can be missing.
#' @param alignment This is an experimental argument, designed to fix the problems users were having with CV, with possible values "lambda" (the default) else "fraction". With "lambda" the lambda values from the master fit (on all the data) are used to line up the predictions from each of the folds. In some cases this can give strange values, since the effective lambda values in each fold could be quite different. With "fraction" we line up the predictions in each fold according to the fraction of progress along the regularization. If in the call a lambda argument is also provided, alignment="fraction" is ignored (with a warning).
#' @param grouped This is an experimental argument, with default TRUE, and can be ignored by most users. For all models except the "cox", this refers to computing nfolds separate statistics, and then using their mean and estimated standard error to describe the CV curve. If grouped=FALSE, an error matrix is built up at the observation level from the predictions from the nfold fits, and then summarized (does not apply to type.measure="auc"). For the "cox" family, grouped=TRUE obtains the CV partial likelihood for the Kth fold by subtraction; by subtracting the log partial likelihood evaluated on the full dataset from that evaluated on the on the (K-1)/K dataset. This makes more efficient use of risk sets. With grouped=FALSE the log partial likelihood is computed only on the Kth fold
#' @param keep If keep=TRUE, a prevalidated array is returned containing fitted values for each observation and each value of lambda. This means these fits are computed with this observation and the rest of its fold omitted. The folid vector is also returned. Default is keep=FALSE. If relax=TRUE, then a list of such arrays is returned, one for each value of 'gamma'. Note: if the value 'gamma=1' is omitted, this case is included in the list since it corresponds to the original 'glmnet' fit.
#' @param parallel If TRUE, use parallel foreach to fit each fold. Must register parallel before hand, such as doMC or others. See the example below.
#' @param gamma The values of the parameter for mixing the relaxed fit with the regularized fit, between 0 and 1; default is gamma = c(0, 0.25, 0.5, 0.75, 1)
#' @param relax If TRUE, then CV is done with respect to the mixing parameter gamma as well as lambda. Default is relax=FALSE
#' @param trace.it If trace.it=1, then progress bars are displayed; useful for big models that take a long time to fit. Limited tracing if parallel=TRUE
#' @param ... Other arguments that can be passed to glmnet
#' @return
cv.glmnet.raw_cue = function (x, y, weights, offset, lambda, type.measure, nfolds,
foldid, alignment, grouped, keep, parallel, trace.it, glmnet.call,
cv.call, ...)
{
if (trace.it)
cat("Training\n")
glmnet.object = glmnet(x, y, weights = weights, offset = offset,
lambda = lambda, trace.it = trace.it, ...)
glmnet.object$call = glmnet.call
subclass = class(glmnet.object)[[1]]
type.measure = cvtype(type.measure, subclass)
is.offset = glmnet.object$offset
if (inherits(glmnet.object, "multnet") && !glmnet.object$grouped) {
nz = predict(glmnet.object, type = "nonzero")
nz = sapply(nz, function(x) sapply(x, length))
nz = ceiling(apply(nz, 1, median))
}
else nz = sapply(predict(glmnet.object, type = "nonzero"),
length)
outlist = as.list(seq(nfolds))
N = nrow(x)
if (parallel) {
outlist = foreach(i = seq(nfolds), .packages = c("glmnet")) %dopar%
{
which = foldid == i
if (length(dim(y)) > 1)
y_sub = y[!which, ]
else y_sub = y[!which]
if (is.offset)
offset_sub = as.matrix(offset)[!which, ]
else offset_sub = NULL
glmnet(x[!which, , drop = FALSE], y_sub, lambda = lambda,
offset = offset_sub, weights = weights[!which],
...)
}
}
else {
for (i in seq(nfolds)) {
if (trace.it)
cat(sprintf("Fold: %d/%d\n", i, nfolds))
which = foldid == i
if (is.matrix(y))
y_sub = y[!which, ]
else y_sub = y[!which]
if (is.offset)
offset_sub = as.matrix(offset)[!which, ]
else offset_sub = NULL
outlist[[i]] = glmnet(x[!which, , drop = FALSE],
y_sub, lambda = lambda, offset = offset_sub,
weights = weights[!which], trace.it = trace.it,
...)
}
}
lambda = glmnet.object$lambda
class(outlist) = paste0(subclass, "list")
predmat = buildPredmat(outlist, lambda, x, offset, foldid,
alignment, y = y, weights = weights, grouped = grouped,
type.measure = type.measure, family = family(glmnet.object))
fun = paste("cv", subclass, sep = ".")
cvstuff = do.call(fun, list(predmat, y, type.measure, weights,
foldid, grouped))
grouped = cvstuff$grouped
if ((N/nfolds < 3) && grouped) {
warning("Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold",
call. = FALSE)
grouped = FALSE
}
out = cvstats(cvstuff, foldid, nfolds, lambda, nz, grouped)
cvname = names(cvstuff$type.measure)
names(cvname) = cvstuff$type.measure
out = c(out, list(call = cv.call, name = cvname, glmnet.fit = glmnet.object))
if (keep)
out = c(out, list(fit.preval = predmat, foldid = foldid))
lamin = with(out, getOptcv.glmnet(lambda, cvm, cvsd, cvname))
## Lines of code added by Cue Hyunkyu Lee to calculate the 10-fold variances of sample predictions ##
var_vec = array(0L, nfolds)
for (i in seq(nfolds)) {
if (trace.it)
cat(sprintf("Fold: %d/%d\n", i, nfolds))
which = foldid == i
if (is.matrix(y))
y_sub = y[!which, ]
else y_sub = y[!which]
if (is.offset)
offset_sub = as.matrix(offset)[!which, ]
else offset_sub = NULL
glmnet_fit = glmnet(x[!which, , drop = FALSE],
y_sub, lambda = lamin$lambda.min, offset = offset_sub,
weights = weights[!which], trace.it = trace.it,
...)
var_vec[i] = var(predict.glmnet(object = glmnet_fit, newx = x[which, , drop = FALSE]) - y[which])
}
predvar = mean(var_vec)
obj = c(out, as.list(lamin), list(predvar=predvar))
## ended
class(obj) = "cv.glmnet"
obj
}
#' cv.glmnet_cue function
#' @description This function allows you to get per sample variance estimated from 10 estimates of 10-fold cross validation All input parameters are the same as cv.glmnet function of the package 'glmnet'
#' @param x x matrix as in glmnet.
#' @param y response y as in glmnet.
#' @param weights Observation weights; defaults to 1 per observation
#' @param offset Offset vector (matrix) as in glmnet
#' @param lambda Optional user-supplied lambda sequence; default is NULL, and glmnet chooses its own sequence
#' @param type.measure loss to use for cross-validation. Currently five options, not all available for all models. The default is type.measure="deviance", which uses squared-error for gaussian models (a.k.a type.measure="mse" there), deviance for logistic and poisson regression, and partial-likelihood for the Cox model. type.measure="class" applies to binomial and multinomial logistic regression only, and gives misclassification error. type.measure="auc" is for two-class logistic regression only, and gives area under the ROC curve. type.measure="mse" or type.measure="mae" (mean absolute error) can be used by all models except the "cox"; they measure the deviation from the fitted mean to the response. type.measure="C" is Harrel's concordance measure, only available for cox models.
#' @param nfolds number of folds - default is 10. Although nfolds can be as large as the sample size (leave-one-out CV), it is not recommended for large datasets. Smallest value allowable is nfolds=3
#' @param foldid an optional vector of values between 1 and nfold identifying what fold each observation is in. If supplied, nfold can be missing.
#' @param alignment This is an experimental argument, designed to fix the problems users were having with CV, with possible values "lambda" (the default) else "fraction". With "lambda" the lambda values from the master fit (on all the data) are used to line up the predictions from each of the folds. In some cases this can give strange values, since the effective lambda values in each fold could be quite different. With "fraction" we line up the predictions in each fold according to the fraction of progress along the regularization. If in the call a lambda argument is also provided, alignment="fraction" is ignored (with a warning).
#' @param grouped This is an experimental argument, with default TRUE, and can be ignored by most users. For all models except the "cox", this refers to computing nfolds separate statistics, and then using their mean and estimated standard error to describe the CV curve. If grouped=FALSE, an error matrix is built up at the observation level from the predictions from the nfold fits, and then summarized (does not apply to type.measure="auc"). For the "cox" family, grouped=TRUE obtains the CV partial likelihood for the Kth fold by subtraction; by subtracting the log partial likelihood evaluated on the full dataset from that evaluated on the on the (K-1)/K dataset. This makes more efficient use of risk sets. With grouped=FALSE the log partial likelihood is computed only on the Kth fold
#' @param keep If keep=TRUE, a prevalidated array is returned containing fitted values for each observation and each value of lambda. This means these fits are computed with this observation and the rest of its fold omitted. The folid vector is also returned. Default is keep=FALSE. If relax=TRUE, then a list of such arrays is returned, one for each value of 'gamma'. Note: if the value 'gamma=1' is omitted, this case is included in the list since it corresponds to the original 'glmnet' fit.
#' @param parallel If TRUE, use parallel foreach to fit each fold. Must register parallel before hand, such as doMC or others. See the example below.
#' @param gamma The values of the parameter for mixing the relaxed fit with the regularized fit, between 0 and 1; default is gamma = c(0, 0.25, 0.5, 0.75, 1)
#' @param relax If TRUE, then CV is done with respect to the mixing parameter gamma as well as lambda. Default is relax=FALSE
#' @param trace.it If trace.it=1, then progress bars are displayed; useful for big models that take a long time to fit. Limited tracing if parallel=TRUE
#' @param ... Other arguments that can be passed to glmnet
#' @return predmean persample mean estimates
#' @return predvar persample variance estimates
#' @export
#' @examples
#' set.seed(1010)
#' n = 1000
#' p = 100
#' nzc = trunc(p/10)
#' x = matrix(rnorm(n * p), n, p)
#' beta = rnorm(nzc)
#' fx = x[, seq(nzc)] %*% beta
#' eps = rnorm(n) * 5
#' y = drop(fx + eps)
#' px = exp(fx)
#' px = px/(1 + px)
#' ly = rbinom(n = length(px), prob = px, size = 1)
#' set.seed(1011)
#' res = cv.glmnet_cue(x, y)
#' res$predmean
#' res$predvar
cv.glmnet_cue = function (x, y, weights = NULL, offset = NULL, lambda = NULL,
type.measure = c("default", "mse", "deviance", "class",
"auc", "mae", "C"), nfolds = 10, foldid = NULL, alignment = c("lambda",
"fraction"), grouped = TRUE, keep = FALSE, parallel = FALSE,
gamma = c(0, 0.25, 0.5, 0.75, 1), relax = FALSE, trace.it = 0,
...)
{
type.measure = match.arg(type.measure)
alignment = match.arg(alignment)
if (!is.null(lambda) && length(lambda) < 2)
stop("Need more than one value of lambda for cv.glmnet")
if (!is.null(lambda) && alignment == "fraction") {
warning("fraction of path alignment not available if lambda given as argument; switched to alignment=`lambda`")
alignment = "lambda"
}
N = nrow(x)
if (is.null(weights))
weights = rep(1, N)
else weights = as.double(weights)
y = drop(y)
cv.call = glmnet.call = match.call(expand.dots = TRUE)
which = match(c("type.measure", "nfolds", "foldid", "grouped",
"keep"), names(glmnet.call), FALSE)
if (any(which))
glmnet.call = glmnet.call[-which]
glmnet.call[[1]] = as.name("glmnet")
if (glmnet.control()$itrace)
trace.it = 1
else {
if (trace.it) {
glmnet.control(itrace = 1)
on.exit(glmnet.control(itrace = 0))
}
}
if (is.null(foldid))
foldid = sample(rep(seq(nfolds), length = N))
else nfolds = max(foldid)
if (nfolds < 3)
stop("nfolds must be bigger than 3; nfolds=10 recommended")
if (relax)
cv.relaxed.raw(x, y, weights, offset, lambda, type.measure,
nfolds, foldid, alignment, grouped, keep, parallel,
trace.it, glmnet.call, cv.call, gamma, ...)
else cv.glmnet.raw_cue(x, y, weights, offset, lambda, type.measure,
nfolds, foldid, alignment, grouped, keep, parallel,
trace.it, glmnet.call, cv.call, ...)
}
environment(cv.glmnet_cue) <- asNamespace('glmnet')
environment(cv.glmnet.raw_cue) <- asNamespace('glmnet')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.