#' Running sprinter with cross-validation
#'
#' The main cross-validation function to select the best sprinter fit for a path of tuning parameters.
#'
#' @param x An \code{n} by \code{p} design matrix of main effects. Each row is an observation of \code{p} main effects.
#' @param y A response vector of size \code{n}.
#' @param square Indicator of whether squared effects should be fitted in Step 1. Default to be FALSE.
#' @param num_keep A user specified number of candidate interactions to keep in Step 2. If \code{num_keep} is not specified (as default), it will be set to \code{round[n / log n]}.
#' @param lambda1 Tuning parameter values for Step 1. \code{lambda1} is a vector. Default to be NULL, and the program will compute its own \code{lambda1} based on \code{nlam1} and \code{lam_min_ratio}.
#' @param lambda3 Tuning parameter values for Step 3. \code{lambda3} is a matrix, where the k-th column is the list of tuning parameter in Step 3 corresponding to Step 1 using \code{lambda1[k]}. Default to be NULL, and the program will compute its own \code{lambda3} based on \code{nlam3} and \code{lam_min_ratio}.
#' @param cv_step1 Indicator of whether cross-validation of \code{lambda1} should be carried out in Step 1 before subsequent steps. Default is \code{FALSE}.
#' @param nlam1 the number of values in \code{lambda1}. If not specified, they will be all set to \code{10}.
#' @param nlam3 the number of values in each column of \code{lambda3}. If not specified, they will be all set to \code{100}.
#' @param lam_min_ratio The ratio of the smallest and the largest values in \code{lambda1} and each column of \code{lambda2}. The largest value is usually the smallest value for which all coefficients are set to zero. Default to be \code{1e-2} in the \code{n} < \code{p} setting.
#' @param nfold Number of folds in cross-validation. Default value is 5. If each fold gets too view observation, a warning is thrown and the minimal \code{nfold = 3} is used.
#' @param foldid A vector of length \code{n} representing which fold each observation belongs to. Default to be \code{NULL}, and the program will generate its own randomly.
#' @param verbose If \code{TRUE}, a progress bar shows the progress of the fitting.
#' @param ... other arguments to be passed to the \code{glmnet} calls, such as \code{alpha} or \code{penalty.factor}
#'
#' @return An object of S3 class "\code{sprinter}".
#' \describe{
#' \item{\code{n}}{The sample size.}
#' \item{\code{p}}{The number of main effects.}
#' \item{\code{square}}{The \code{square} parameter passed into sprinter.}
#' \item{\code{a0_step3}}{Estimate of intercept corresponding to the CV-selected model.}
#' \item{\code{compact}}{A compact representation of the selected variables. \code{compact} has three columns, with the first two columns representing the indices of a selected variable (main effects with first index = 0), and the last column representing the estimate of coefficients.}
#' \item{\code{fit}}{The whole \code{glmnet} fit object.}
#' \item{\code{fitted}}{fitted value of response corresponding to the CV-selected model.}
#' \item{\code{num_keep}}{The value of \code{num_keep}.}
#' \item{\code{cvm}}{The averaged estimated prediction error on the test sets over K folds.}
#' \item{\code{cvse}}{The standard error of the estimated prediction error on the test sets over K folds.}
#' \item{\code{foldid}}{Fold assignment. A vector of length \code{n}.}
#' \item{\code{i_lambda1_best}}{The index in \code{lambda1} that is chosen by CV by minimizing cvm.}
#' \item{\code{i_lambda3_best}}{The index in \code{lambda3} that is chosen by CV by minimizing cvm.}
#' \item{\code{lambda1_best}}{The value of \code{lambda1} that is chosen by CV by minimizing cvm.}
#' \item{\code{lambda3_best}}{The value of \code{lambda3} that is chosen by CV by minimizing cvm.}
#' \item{\code{call}}{Function call.}
#' }
#' @seealso
#' \code{\link{predict.cv.sprinter}}
#' @examples
#' n <- 100
#' p <- 100
#' x <- matrix(rnorm(n * p), n, p)
#' y <- x[, 1] - 2 * x[, 2] + 3 * x[, 1] * x[, 3] - 4 * x[, 4] * x[, 5] + rnorm(n)
#' mod <- cv.sprinter(x = x, y = y)
#'
#' @import glmnet
#' @export
cv.sprinter <- function(x, y, square = FALSE, num_keep = NULL,
lambda1 = NULL, lambda3 = NULL,
cv_step1 = FALSE,
nlam1 = 10, nlam3 = 100,
lam_min_ratio = ifelse(nrow(x) < ncol(x), 0.01, 1e-04),
nfold = 5, foldid = NULL, verbose = FALSE, ...){
n <- nrow(x)
p <- ncol(x)
q <- ifelse(square, p * (p - 1) / 2, p * (p - 1) / 2 + p)
if(is.null(num_keep))
num_keep <- ceiling(n / log(n))
else
stopifnot(num_keep > 0 & num_keep <= q)
if(verbose)
cat("cv initial:", fill = TRUE)
# first fit the sprinter using all data with all lambdas
fit <- sprinter(x = x, y = y,
square = square, num_keep = num_keep,
lambda1 = lambda1, lambda3 = lambda3,
cv_step1 = cv_step1,
nlam1 = nlam1, nlam3 = nlam3,
lam_min_ratio = lam_min_ratio, ...)
nlam1 <- length(fit$lambda1)
nlam3 <- length(fit$lambda3[, 1])
# use cross-validation to select the best lambda
if (is.null(foldid)){
# foldid is a vector of values between 1 and nfold
# identifying what fold each observation is in.
# If supplied, nfold can be missing.
foldid <- sample(seq(nfold), size = n, replace = TRUE)
}
# mse of lasso estimate of coef
err <- vector("list", nfold)
err_mat <- matrix(0, nrow = nlam1, ncol = nlam3)
for (i in seq(nfold)){
if(verbose)
cat(paste("cv fold ", i, ":"), fill = TRUE)
# train on all but i-th fold
id_tr <- (foldid != i)
id_te <- (foldid == i)
# training/testing data set
# standardize the training data
x_tr <- myscale(x[id_tr, ])
# use the scaling for training data
x_te <- myscale(x[id_te, ],
center = attr(x = x_tr, which = "scaled:center"),
scale = attr(x = x_tr, which = "scaled:scale"))
y_tr <- y[id_tr]
y_te <- y[id_te]
# get the fit using lasso on training data
# and fit/obj on the test data
fit_tr <- sprinter(x = x_tr, y = y_tr, square = fit$square,
num_keep = fit$num_keep,
lambda1 = fit$lambda1,
lambda3 = fit$lambda3, ...)
err[[i]] <- matrix(NA, nrow = nlam1, ncol = nlam3)
pred_te <- predict.sprinter(object = fit_tr, newdata = x_te)
for(k in seq(nlam1)){
if(ncol(pred_te[[k]]) < nlam3){
# this only happens when glmnet returns a numerical warning as follows:
# Fortran error code; Convergence for i-th lambda value not reached after maxit=100000 iterations; solutions for larger lambdas returned
# when this happens, not all nlam3 lambda values in Step3 is returned
# we just copy the prediction from the last viable lambda values to fill in the empty slots
last_col <- ncol(pred_te[[k]])
ncol_to_add <- nlam3 - last_col
pred_te[[k]] <- cbind(pred_te[[k]], matrix(rep(pred_te[[k]][, last_col], ncol_to_add), ncol = ncol_to_add))
}
err[[i]][k, ] <- sqrt(as.numeric(colMeans((y_te - pred_te[[k]])^2)))
}
err_mat <- err_mat + err[[i]]
}
# extract information from CV
# the mean cross-validated error, a nlam1-by-nlam3 matrix
err_mat <- err_mat / nfold
# also compute the se_mat
se_mat <- matrix(0, nrow = nlam1, ncol = nlam3)
for (i in seq(nfold)){
se_mat <- se_mat + (err[[i]] - err_mat)^2
}
se_mat <- sqrt(se_mat / (nfold - 1))
# the index of best lambda
# if there is ties
# we select the one with smaller lambda1 value and larger lambda 3 value
# i.e., we select the model with more main effects and fewer interactions
ibest <- tail(which(err_mat == min(err_mat), arr.ind = TRUE), 1)
colnames(ibest) <- NULL
lambda1_best <- ibest[1]
lambda3_best <- ibest[2]
#lam_1se <- tail(which(err_mat[num_keep_best, ] < err_mat[lambda1_best, lambda3_best] + se_mat[lambda1_best, lambda3_best], arr.ind = TRUE), 1)
# get a compact output
idx_i <- fit$step2[[lambda1_best]]
if(square)
idx <- rbind(cbind(rep(0, p), seq(p), rep(0, p)),
cbind(seq(p), seq(p), rep(0, p)),
idx_i)
else
idx <- rbind(cbind(rep(0, p), seq(p), rep(0, p)), idx_i)
coef <- fit$step3[[lambda1_best]]$coef[, lambda3_best]
compact <- cbind(idx[which(coef != 0), 1:2, drop = FALSE], coef[coef != 0])
colnames(compact) <- c("index_1", "index_2", "coefficient")
fitted <- fit$step1$fitted[, lambda1_best] + fit$step3[[lambda1_best]]$fitted[, lambda3_best]
# finally return the best lambda
out <- list(n = n,
p = p,
square = square,
a0_step3 = fit$step3[[lambda1_best]]$a0[lambda3_best],
compact = compact,
fit = fit,
fitted = fitted,
num_keep = num_keep,
cvm = err_mat,
cvse = se_mat,
foldid = foldid,
i_lambda1_best = lambda1_best,
i_lambda3_best = lambda3_best,
lambda1_best = fit$lambda1[lambda1_best],
lambda3_best = fit$lambda3[lambda3_best, lambda1_best],
call = match.call())
class(out) <- "cv.sprinter"
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.