#' @title Cross-validated group subset selection
#'
#' @author Ryan Thompson <ryan.thompson-1@uts.edu.au>
#'
#' @description Fits the regularisation surface for a regression model with a group subset selection
#' penalty and then cross-validates this surface.
#'
#' @param x a predictor matrix
#' @param y a response vector
#' @param group a vector of length \code{ncol(x)} with the jth element identifying the group that
#' the jth predictor belongs to; alternatively, a list of vectors with the kth vector identifying
#' the predictors that belong to the kth group (useful for overlapping groups)
#' @param penalty the type of penalty to apply; one of 'grSubset', 'grSubset+grLasso', or
#' 'grSubset+Ridge'
#' @param loss the type of loss function to use; 'square' for linear regression or 'logistic' for
#' logistic regression
#' @param lambda an optional list of decreasing sequences of group subset selection parameters; the
#' list should contain a vector for each value of \code{gamma}
#' @param gamma an optional decreasing sequence of group lasso or ridge parameters
#' @param nfold the number of cross-validation folds
#' @param folds an optional vector of length \code{nrow(x)} with the ith entry identifying the fold
#' that the ith observation belongs to
#' @param cv.loss an optional cross-validation loss-function to use; should accept a vector of
#' predicted values and a vector of actual values
#' @param cluster an optional cluster for running cross-validation in parallel; must be set up using
#' \code{parallel::makeCluster}; each fold is evaluated on a different node of the cluster
#' @param interpolate a logical indicating whether to interpolate the \code{lambda} sequence for
#' the cross-validation fits; see details below
#' @param ... any other arguments for \code{grpsel()}
#'
#' @details When \code{loss='logistic'} stratified cross-validation is used to balance
#' the folds. When fitting to the cross-validation folds, \code{interpolate=TRUE} cross-validates
#' the midpoints between consecutive \code{lambda} values rather than the original \code{lambda}
#' sequence. This new sequence retains the same set of solutions on the full data, but often leads
#' to superior cross-validation performance.
#'
#' @return An object of class \code{cv.grpsel}; a list with the following components:
#' \item{cv.mean}{a list of vectors containing cross-validation means per value of \code{lambda};
#' an individual vector in the list for each value of \code{gamma}}
#' \item{cd.sd}{a list of vectors containing cross-validation standard errors per value of
#' \code{lambda}; an individual vector in the list for each value of \code{gamma}}
#' \item{lambda}{a list of vectors containing the values of \code{lambda} used in the fit; an
#' individual vector in the list for each value of \code{gamma}}
#' \item{gamma}{a vector containing the values of \code{gamma} used in the fit}
#' \item{lambda.min}{the value of \code{lambda} minimising \code{cv.mean}}
#' \item{gamma.min}{the value of \code{gamma} minimising \code{cv.mean}}
#' \item{fit}{the fit from running \code{grpsel()} on the full data}
#'
#' @example R/examples/example-cv-grpsel.R
#'
#' @export
cv.grpsel <- \(x, y, group = seq_len(ncol(x)),
penalty = c('grSubset', 'grSubset+grLasso', 'grSubset+Ridge'),
loss = c('square', 'logistic'), lambda = NULL, gamma = NULL, nfold = 10,
folds = NULL, cv.loss = NULL, cluster = NULL, interpolate = TRUE, ...) {
penalty <- match.arg(penalty)
loss <- match.arg(loss)
# Check data is valid
if (!is.matrix(x)) x <- as.matrix(x)
if (!is.matrix(y)) y <- as.matrix(y)
# Check arguments are valid
if (nfold < 2 | nfold > nrow(x)) {
stop('nfolds must be at least 2 and at most the number of rows in x')
}
if (!is.null(folds) & length(folds) != nrow(x)) {
stop('length of folds must equal number of rows in x')
}
# Set up for cross-validation
lambda.compute <- is.null(lambda)
fit <- grpsel(x, y, group, penalty, loss, lambda = lambda, gamma = gamma, ...)
lambda <- fit$lambda
gamma <- fit$gamma
nlambda <- vapply(lambda, length, integer(1))
ngamma <- length(fit$gamma)
if (is.null(folds)) {
if (loss == 'square') {
folds <- sample(rep_len(1:nfold, nrow(x)))
} else if (loss == 'logistic') {
folds <- integer(nrow(x))
folds[y == 0] <- sample(rep_len(1:nfold, sum(y == 0)))
folds[y == 1] <- sample(rep_len(1:nfold, sum(y == 1)))
}
} else {
nfold <- length(unique(folds))
}
# Save cross-validation loss functions
if (is.null(cv.loss)) {
if (loss == 'square') {
cv.loss <- \(xb, y) 0.5 * mean((y - xb) ^ 2)
} else if (loss == 'logistic') {
cv.loss <- \(xb, y) {
pi <- pmax(1e-5, pmin(1 - 1e-5, 1 / (1 + exp(- xb))))
- mean(y * log(pi) + (1 - y) * log(1 - pi))
}
}
}
# If lambda was computed, use midpoints between consecutive lambdas in cross-validation
lambda.cv <- lambda
if (interpolate & lambda.compute) {
for (i in 1:ngamma) {
if (length(lambda[[i]]) > 2) {
lambda.cv[[i]][2:(nlambda[i] - 1)] <-
lambda[[i]][- c(1, nlambda[i])] + diff(lambda[[i]][- 1]) / 2
}
}
}
# Loop over folds
cvf <- \(fold) {
fold.ind <- which(folds == fold)
x.train <- x[- fold.ind, , drop = FALSE]
x.valid <- x[fold.ind, , drop = FALSE]
y.train <- y[- fold.ind, , drop = FALSE]
y.valid <- y[fold.ind, , drop = FALSE]
fit.fold <- grpsel(x.train, y.train, group, penalty, loss, lambda = lambda.cv, gamma = gamma,
...)
cv <- list()
for (i in 1:ngamma) {
cv[[i]] <- apply(predict(fit.fold, x.valid, gamma = gamma[i]), 2, cv.loss, y.valid)
}
cv
}
if (is.null(cluster)) {
cv <- lapply(1:nfold, cvf)
} else {
parallel::clusterCall(cluster, \() library(grpsel))
cv <- parallel::clusterApply(cluster, 1:nfold, cvf)
}
cv <- lapply(1:ngamma, \(i) t(simplify2array(lapply(cv, `[[`, i))))
# Compose cross-validation results
cv.mean <- lapply(cv, colMeans)
cv.sd <- lapply(cv, \(x) apply(x, 2, stats::sd) / sqrt(nfold))
gamma.min.ind <- which.min(vapply(cv.mean, min, numeric(1)))
gamma.min <- gamma[gamma.min.ind]
lambda.min.ind <- which.min(cv.mean[[gamma.min.ind]])
lambda.min <- lambda[[gamma.min.ind]][lambda.min.ind]
# Return result
result <- list(cv.mean = cv.mean, cv.sd = cv.sd, lambda = lambda, gamma = gamma,
lambda.min = lambda.min, gamma.min = gamma.min, fit = fit)
class(result) <- 'cv.grpsel'
return(result)
}
#==================================================================================================#
# Coefficient function
#==================================================================================================#
#' @title Coefficient function for cv.grpsel object
#'
#' @author Ryan Thompson <ryan.thompson-1@uts.edu.au>
#'
#' @description Extracts coefficients for specified values of the tuning parameters.
#'
#' @param object an object of class \code{cv.grpsel}
#' @param lambda the value of \code{lambda} indexing the desired fit
#' @param gamma the value of \code{gamma} indexing the desired fit
#' @param ... any other arguments
#'
#' @return A matrix of coefficients.
#'
#' @method coef cv.grpsel
#'
#' @export
#'
#' @importFrom stats "coef"
coef.cv.grpsel <- \(object, lambda = 'lambda.min', gamma = 'gamma.min', ...) {
if (!is.null(lambda)) if (lambda == 'lambda.min') lambda <- object$lambda.min
if (!is.null(gamma)) if (gamma == 'gamma.min') gamma <- object$gamma.min
coef.grpsel(object$fit, lambda = lambda, gamma = gamma, ...)
}
#==================================================================================================#
# Predict function
#==================================================================================================#
#' @title Predict function for cv.grpsel object
#'
#' @author Ryan Thompson <ryan.thompson-1@uts.edu.au>
#'
#' @description Generate predictions for new data using specified values of the tuning parameters.
#'
#' @param object an object of class \code{cv.grpsel}
#' @param x.new a matrix of new values for the predictors
#' @param lambda the value of \code{lambda} indexing the desired fit
#' @param gamma the value of \code{gamma} indexing the desired fit
#' @param ... any other arguments
#'
#' @return A matrix of predictions.
#'
#' @method predict cv.grpsel
#'
#' @export
#'
#' @importFrom stats "predict"
predict.cv.grpsel <- \(object, x.new, lambda = 'lambda.min', gamma = 'gamma.min', ...) {
if (!is.null(lambda)) if (lambda == 'lambda.min') lambda <- object$lambda.min
if (!is.null(gamma)) if (gamma == 'gamma.min') gamma <- object$gamma.min
predict.grpsel(object$fit, x.new, lambda = lambda, gamma = gamma, ...)
}
#==================================================================================================#
# Plot function
#==================================================================================================#
globalVariables(c('ng', 'cv.mean', 'cv.sd'))
#' @title Plot function for cv.grpsel object
#'
#' @author Ryan Thompson <ryan.thompson-1@uts.edu.au>
#'
#' @description Plot the cross-validation results from group subset selection for a specified value
#' of \code{gamma}.
#'
#' @param x an object of class \code{cv.grpsel}
#' @param gamma the value of \code{gamma} indexing the desired fit
#' @param ... any other arguments
#'
#' @return A plot of the cross-validation results.
#'
#' @method plot cv.grpsel
#'
#' @export
#'
#' @importFrom graphics "plot"
plot.cv.grpsel <- \(x, gamma = 'gamma.min', ...) {
if (gamma == 'gamma.min') gamma <- x$gamma.min
index <- which.min(abs(gamma - x$gamma))
df <- data.frame(cv.mean = x$cv.mean[[index]], cv.sd = x$cv.sd[[index]], ng = x$fit$ng[[index]])
p <- ggplot2::ggplot(df, ggplot2::aes(ng, cv.mean)) +
ggplot2::geom_point() +
ggplot2::geom_errorbar(ggplot2::aes(ymin = cv.mean - cv.sd, ymax = cv.mean + cv.sd)) +
ggplot2::xlab('number of groups')
p
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.