Nothing
###########################################
### Main implementation of sparse projected averaged regression (SPAR)
##########################################
#' Sparse Projected Averaged Regression
#'
#' Apply Sparse Projected Averaged Regression to high-dimensional data by
#' building an ensemble of generalized linear models, where the high-dimensional
#' predictors can be screened using a screening coefficient and then projected
#' using data-agnostic or data-informed random projection matrices.
#' This function performs the procedure for a given grid of thresholds \eqn{\nu}
#' and a grid of the number of marginal models to be employed in the ensemble.
#' This function is also used in the cross-validated procedure [spar.cv].
#'
#' @param x n x p numeric matrix of predictor variables.
#' @param y quantitative response vector of length n.
#' @param family a \link[stats]{family} object used for the marginal generalized linear model,
#' default \code{gaussian("identity")}.
#' @param model function creating a \code{'sparmodel'} object;
#' defaults to \code{spar_glm()} for gaussian family with identity link and to
#' \code{spar_glmnet()} for all other family-link combinations.
#' @param rp function creating a \code{'randomprojection'} object. Defaults to NULL.
#' In this case \code{rp_cw(data = TRUE)} is used.
#' @param screencoef function creating a \code{'screeningcoef'} object. Defaults to NULL.
#' In this case no screening is used is used.
#' @param xval optional matrix of predictor variables observations used for
#' validation of threshold nu and number of models; \code{x} is used
#' if not provided.
#' @param yval optional response observations used for validation of
#' threshold nu and number of models; \code{y} is used if not provided.
#' @param nnu number of different threshold values \eqn{\nu} to consider for thresholding;
#' ignored when nus are given; defaults to 20.
#' @param nus optional vector of \eqn{\nu}'s to consider for thresholding;
#' if not provided, \code{nnu} values ranging from 0 to the maximum absolute
#' marginal coefficient are used.
#' @param nummods vector of numbers of marginal models to consider for
#' validation; defaults to \code{c(20)}.
#' @param measure loss to use for validation; defaults to \code{"deviance"}
#' available for all families. Other options are \code{"mse"} or \code{"mae"}
#' (between responses and predicted means, for all families),
#' \code{"class"} (misclassification error) and
#' \code{"1-auc"} (one minus area under the ROC curve) both just for
#' binomial family.
#' @param avg_type type of averaging the marginal models; either on link (default)
#' or on response level. This is used in computing the validation measure.
#' @param parallel assuming a parallel backend is loaded and available, a
#' logical indicating whether the function should use it in parallelizing the
#' estimation of the marginal models. Defaults to FALSE.
#' @param inds optional list of index-vectors corresponding to variables kept
#' after screening in each marginal model of length \code{max(nummods)};
#' dimensions need to fit those of RPMs.
#' @param RPMs optional list of projection matrices used in each
#' marginal model of length \code{max(nummods)}, diagonal elements will be
#' overwritten with a coefficient only depending on the given \code{x} and \code{y}.
#' @param seed integer seed to be set at the beginning of the SPAR algorithm. Default to NULL, in which case no seed is set.
#' @param ... further arguments mainly to ensure back-compatibility
#' @returns object of class \code{'spar'} with elements
#' \itemize{
#' \item \code{betas} p x \code{max(nummods)} sparse matrix of class
#' \code{'\link[Matrix:dgCMatrix-class]{Matrix::dgCMatrix}'} containing the
#' standardized coefficients from each marginal model
#' \item \code{intercepts} used in each marginal model
#' \item \code{scr_coef} vector of length p with coefficients used for screening the standardized predictors
#' \item \code{inds} list of index-vectors corresponding to variables kept after screening in each marginal model of length max(nummods)
#' \item \code{RPMs} list of projection matrices used in each marginal model of length \code{max(nummods)}
#' \item \code{val_res} \code{data.frame} with validation results (validation measure
#' and number of active variables) for each element of \code{nus} and \code{nummods}
#' \item \code{val_set} logical flag, whether validation data were provided;
#' if \code{FALSE}, training data were used for validation
#' \item \code{family} a character corresponding to \link[stats]{family} object used for the marginal generalized linear model e.g.,
#' \code{"gaussian(identity)"}
#' \item \code{nus} vector of \eqn{\nu}'s considered for thresholding
#' \item \code{nummods} vector of numbers of marginal models considered for validation
#' \item \code{ycenter} empirical mean of initial response vector
#' \item \code{yscale} empirical standard deviation of initial response vector
#' \item \code{xcenter} p-vector of empirical means of initial predictor variables
#' \item \code{xscale} p-vector of empirical standard deviations of initial predictor variables
#' \item \code{avg_type} character, averaging type for computing the validation measure
#' \item \code{measure} character, type of validation measure used
#' \item \code{rp} an object of class \code{"randomprojection"}
#' \item \code{screencoef} an object of class \code{"screeningcoef"}
#' \item \code{x_rows_for_fitting_marginal_models} vector of row indicators from
#' \code{x} which were used for fitting the marginal models, if screening was performed
#' using \code{screencoef} with \code{split_data_prop} argument. Is \code{NULL} otherwise.
#' }
#' If a parallel backend is registered and \code{parallel = TRUE},
#' the \link[foreach]{foreach} function
#' is used to estimate the marginal models in parallel.
#'
#' @references{
#' \insertRef{parzer2024lm}{spareg}
#'
#' \insertRef{parzer2024glms}{spareg}
#'
#' \insertRef{Clarkson2013LowRankApprox}{spareg}
#'
#' \insertRef{ACHLIOPTAS2003JL}{spareg}
#' }
#' @examples
#' example_data <- simulate_spareg_data(n = 200, p = 400, ntest = 100)
#' spar_res <- spar(example_data$x, example_data$y, xval = example_data$xtest,
#' yval = example_data$ytest, nummods=c(5, 10, 15, 20, 25, 30))
#' coefs <- coef(spar_res)
#' pred <- predict(spar_res, xnew = example_data$x)
#' plot(spar_res)
#' plot(spar_res, plot_type = "val_measure", plot_along = "nummod", nu = 0)
#' plot(spar_res, plot_type = "val_measure", plot_along = "nu", nummod = 10)
#' plot(spar_res, plot_type = "val_numactive", plot_along = "nummod", nu = 0)
#' plot(spar_res, plot_type = "val_numactive", plot_along = "nu", nummod = 10)
#' plot(spar_res, plot_type = "res_vs_fitted", xfit = example_data$xtest,
#' yfit = example_data$ytest)
#' plot(spar_res, plot_type = "coefs", prange = c(1,400))
#'
#' @seealso [spar.cv], [coef.spar], [predict.spar], [plot.spar], [print.spar]
#' @aliases spareg
#' @export
#'
#' @import methods
#' @importFrom stats median reshape glm.fit coef fitted gaussian predict rnorm quantile
#' residuals sd var cor glm aggregate
#' @importFrom utils head
#' @importFrom Matrix Matrix solve crossprod tcrossprod rowMeans rowSums
#' @importFrom Rdpack reprompt
#' @importFrom rlang list2
#' @importFrom glmnet glmnet
#' @importFrom ROCR prediction performance
#'
spar <- function(x, y, family = gaussian("identity"), model = NULL, rp = NULL,
screencoef = NULL, xval = NULL, yval = NULL, nnu = 20, nus = NULL,
nummods = c(20), measure = c("deviance","mse","mae","class","1-auc"),
avg_type = c("link", "response"),
parallel = FALSE, inds = NULL, RPMs = NULL, seed = NULL, ...) {
# Set up and checks ----
measure <- match.arg(measure)
stopifnot("Length of y does not fit nrow(x)." = length(y) == nrow(x))
stopifnot("Response y must be numeric." = is.numeric(y))
# Ensure back compatibility ----
args <- list(...)
arg_list <- check_and_set_args(args, x, y, family, model,
screencoef, rp, measure)
model <- arg_list$model; rp <- arg_list$rp
screencoef <- arg_list$screencoef; measure <- arg_list$measure
avg_type <- match.arg(avg_type)
# Call SPAR algorithm ----
res <- spar_algorithm(x = x, y = y,
family = family,
model = model, rp = rp, screencoef = screencoef,
xval = xval, yval = yval,
nnu = nnu, nus = nus,
nummods = nummods,
measure = measure,
avg_type = avg_type,
inds = inds, RPMs = RPMs,
parallel = parallel,
seed = seed)
return(res)
}
#' @rdname spar
#' @examples
#' spar_res <- spareg(example_data$x, example_data$y, xval = example_data$xtest,
#' yval = example_data$ytest, nummods=c(5, 10, 15, 20, 25, 30))
#' @aliases spar
#' @export
spareg <- spar
spar_algorithm <- function(x, y,
family, model, rp, screencoef,
xval = NULL, yval = NULL,
nnu, nus,
nummods, measure,
avg_type,
inds = NULL, RPMs = NULL,
parallel = FALSE,
seed = NULL){
# Start SPAR algorithm
p <- ncol(x)
n <- nrow(x)
# Scaling the x matrix ----
xcenter <- colMeans(x)
xscale <- apply(x, 2, sd)
if (!is.null(seed)) {
if (parallel & requireNamespace("doRNG", quietly = TRUE)) {
registerDoRNG <- getNamespace("doRNG")$registerDoRNG
registerDoRNG(seed = seed)
} else {
set.seed(seed)
}
}
if (is.null(inds) || is.null(RPMs)) {
actual_p <- sum(xscale > 0)
z <- scale(x[, xscale > 0],
center = xcenter[xscale > 0],
scale = xscale[xscale > 0])
} else {
actual_p <- p
xscale[xscale == 0] <- 1
z <- scale(x, center = xcenter, scale = xscale)
}
# Scaling the y vector ----
if (family$family == "gaussian" & family$link=="identity") {
ycenter <- mean(y)
yscale <- sd(y)
} else {
ycenter <- 0
yscale <- 1
}
yz <- scale(y,center = ycenter,scale = yscale)
# Setup model ----
if (is.null(model$control$family)) {
if (is.null(attr(model, "family"))) {
model$control$family <- family
} else {
model$control$family <- attr(model, "family")
}
}
if (!is.null(model$update_fun)) {
model <- model$update_fun(model)
}
# Setup screening ----
family_str <- paste0(family$family, "(", family$link, ")")
if (is.null(attr(screencoef, "family"))) {
attr(screencoef, "family_string") <- family_str
}
if (!is.null(attr(screencoef, "split_data_prop"))) {
scr_inds <- sample(n,
ceiling(n * attr(screencoef, "split_data_prop")))
mar_inds <- seq_len(n)[-scr_inds]
} else {
mar_inds <- scr_inds <- seq_len(n)
}
if (is.null(attr(screencoef, "nscreen"))) {
if (2*n > p) {
message("Screening is not performed by default, as 2 * n, the default number of screened variables, is larger than the number of predictors. For performing screening, adjust nscreen in screen_*().")
}
nscreen <- attr(screencoef, "nscreen") <- min(p, 2 * n)
} else {
nscreen <- attr(screencoef, "nscreen")
}
mslow <- attr(rp, "mslow")
if (is.null(mslow)) mslow <- ceiling(log(p))
msup <- attr(rp, "msup")
if (is.null(msup)) msup <- ceiling(n/2)
if (!(msup <= nscreen)) {
message("Provided upper bound on goal dimension of random projection (msup) or its default value (n/2) is larger than nscreen. Setting msup to nscreen.")
msup <- nscreen
}
stopifnot("Provided lower bound on goal dimension of random projection (mslow) or its default value (log(p)) is larger than upper bound (msup)." =
mslow <= msup)
# Perform screening ----
if (nscreen < p) {
scr_coef <- screencoef$generate_fun(
object = screencoef,
x = z[scr_inds,],
y = yz[scr_inds, ])
inc_probs <- abs(scr_coef)
max_inc_probs <- max(inc_probs)
inc_probs <- inc_probs/max_inc_probs
attr(screencoef, "inc_prob") <- inc_probs
if (attr(screencoef, "type") == "prob" && sum(inc_probs > 0) < nscreen) {
warning(
sprintf("The number of variables with non-zero screening coefficients (%i) is less than the number of variables to screen (%i). Probabilistic screening with nscreen variables is performed anyway, but some of some of the variables with a zero inclusion probability will be randomly added to the set of screened variables. Alternatively, nscreen can be lowered in screen_*().",
sum(inc_probs > 0), nscreen))
}
} else {
scr_coef <- NULL
# message("No screening performed.")
}
attr(screencoef, "importance") <- scr_coef
# Update RP ----
thiscall <- match.call(expand.dots = TRUE)
thiscall[["screencoef"]] <- screencoef
rp <- eval.parent(as.call(c(list(rp$update_fun),
as.list(thiscall)[-1])))
max_num_mod <- max(nummods)
drawRPMs <- FALSE
if (is.null(RPMs)) {
RPMs <- vector("list", length = max_num_mod)
drawRPMs <- TRUE
ms <- sample(seq(floor(mslow), ceiling(msup)),
max_num_mod, replace=TRUE)
}
drawinds <- FALSE
if (is.null(inds)) {
inds <- vector("list", length = max_num_mod)
drawinds <- TRUE
}
# SPAR algorithm ----
marginal_model_function <- function(i) {
## Function for screening, drawing the RP and estimating one model in ensemble
## Screening step ----
out <- list()
if (drawinds) {
if (nscreen < p) {
ind_use <- switch(attr(screencoef, "type"),
"fixed" = order(inc_probs, decreasing = TRUE)[seq_len(nscreen)],
"prob" = c(sample(seq_len(actual_p)[inc_probs > 0],
min(sum(inc_probs > 0), nscreen),
prob = inc_probs[inc_probs>0]),
sample(seq_len(actual_p)[inc_probs == 0],
nscreen - min(sum(inc_probs > 0), nscreen))),
stop("Type of screening coef should be fixed or prob.")
)
} else {
ind_use <- seq_len(actual_p)
}
out$inds <- ind_use
} else {
ind_use <- inds[[i]]
}
p_use <- length(ind_use)
## RP step ----
if (drawRPMs) {
m <- ms[i]
if (p_use < m) {
m <- p_use
RPM <- Matrix::Matrix(diag(1, m),sparse=TRUE)
} else {
RPM <- rp$generate_fun(rp, m = m,
included_vector = ind_use,
x = x, y = y)
}
out$RPMs <- RPM
} else {
RPM <- RPMs[[i]]
if (!is.null(rp$update_rpm_w_data)) {
RPM <- rp$update_rpm_w_data(rpm = RPM, rp = rp,
included_vector = ind_use)
}
}
## Marginal model ----
znew <- Matrix::tcrossprod(z[mar_inds, ind_use], RPM)
res <- model$model_fun(y = yz[mar_inds], z = znew, object = model)
out$intercepts <- res$intercept
out$betas_std_m <- as(numeric(actual_p), "sparseMatrix")
out$betas_std_m[ind_use] <- crossprod(RPM, res$gammas)
out
}
if (parallel) {
# honor registration made by user, and only create and register
# our own cluster object once
if (!requireNamespace("foreach", quietly = TRUE)) {
stop("Package 'foreach' is required for parallel execution. Please install it using install.packages('foreach').")
}
# Load foreach functions
foreach <- getNamespace("foreach")$foreach
`%dopar%` <- getNamespace("foreach")$`%dopar%`
`%do%` <- getNamespace("foreach")$`%do%`
getDoParRegistered <- getNamespace("foreach")$getDoParRegistered
getDoParName <- getNamespace("foreach")$getDoParName
getDoParWorkers <- getNamespace("foreach")$getDoParWorkers
if (!getDoParRegistered()) {
message('Warning: No doPar backend. Executing SPAR algorithm sequentially.
For using parallelization, please register backend and rerun.')
`%d%` <- `%do%`
} else {
message('Using ', getDoParName(), ' with ',
getDoParWorkers(), ' workers')
`%d%` <- `%dopar%`
}
if (!getDoParRegistered()) {
message('Warning: No doPar backend. Executing SPAR algorithm sequentially.
For using parallelization, please register backend and rerun.')
`%d%` <- `%do%`
} else {
message('Using ', getDoParName(), ' with ',
getDoParWorkers(), ' workers')
`%d%` <- `%dopar%`
}
i <- NULL
res_all <- foreach(i = seq_len(max_num_mod),
.verbose = FALSE,
.packages = "spareg",
.errorhandling = "stop") %d% {
marginal_model_function(i = i)
}
} else {
res_all <- lapply(seq_len(max_num_mod), marginal_model_function)
}
if (drawRPMs) RPMs <- lapply(res_all, "[[", "RPMs")
if (drawinds) inds <- lapply(res_all, "[[", "inds")
intercepts <- sapply(res_all, "[[", "intercepts")
betas_std <- Reduce("cbind2", lapply(res_all, "[[", "betas_std_m"))
if (is.null(nus)) {
if (nnu>1) {
nus <- unname(c(0, quantile(abs(betas_std@x),
probs=seq_len(nnu-1)/(nnu-1))))
} else {
nus <- 0
}
} else {
nnu <- length(nus)
}
## Validation set ----
val_res <- data.frame(nnu = NULL, nu = NULL,
nummod = NULL, numactive = NULL, measure = NULL)
if (!is.null(yval) && !is.null(xval)) {
val_set <- TRUE
} else {
val_set <- FALSE
yval <- y
xval <- x
}
val.meas <- get_val_measure_function(measure, family)
## Fitted values ----
tabnummodres <- lapply(nummods, function(nummod) {
tabres <- lapply(seq_len(nnu), function(l){
thresh <- nus[l]
tmp_coef <- betas_std[, seq_len(nummod), drop = FALSE]
tmp_coef[abs(tmp_coef) < thresh] <- 0
tmp_beta <- Matrix(0, nrow = p, ncol = nummod)
tmp_beta[xscale > 0, ] <- yscale * tmp_coef/(xscale[xscale > 0])
if (avg_type == "link") {
beta_hat <- rowMeans(tmp_beta)
alpha_hat <- mean(intercepts[seq_len(nummod)]) +
(ycenter - sum(xcenter * beta_hat))
eta_hat <- xval %*% beta_hat + alpha_hat
val_measure <- val.meas(yval, eta_hat = eta_hat)
numactive <- sum(beta_hat != 0)
} else {
tmp_intercept <- intercepts[seq_len(nummod)] +
drop(ycenter - crossprod(xcenter, tmp_beta))
eta_hat <- sweep((xval %*% tmp_beta), tmp_intercept,
MARGIN = 2, FUN = "+")
y_hat <- rowMeans(family$linkinv(as.matrix(eta_hat)))
val_measure <- val.meas(yval, y_hat = y_hat)
numactive <- sum(rowSums(tmp_beta != 0) > 0)
}
c(nnu = l, nu = unname(thresh), nummod = nummod,
measure = val_measure, numactive = numactive)
})
out <- do.call("rbind", tabres)
colnames(out) <- c("nnu","nu","nummod","measure", "numactive")
out
})
val_res <- do.call("rbind.data.frame", tabnummodres)
betas <- Matrix(0, p, max_num_mod, sparse = TRUE)
betas[xscale>0,] <- betas_std
if (is.null(colnames(x))) {
rownames(betas) <- paste0("V", seq_len(ncol(x)))
} else {
rownames(betas) <- colnames(x)
}
## Clean up
res <- list(betas = betas, intercepts = intercepts,
scr_coef = scr_coef,
inds = inds, RPMs = RPMs,
val_res = val_res, val_set = val_set,
nus = nus, nummods = nummods,
ycenter = ycenter, yscale = yscale,
xcenter = xcenter, xscale = xscale,
family = family_str,
measure = measure,
avg_type = avg_type,
rp = rp,
screencoef = screencoef,
model = model,
x_rows_for_fitting_marginal_models =
if (!is.null(attr(screencoef, "split_data_prop"))) mar_inds else NULL
)
attr(res,"class") <- "spar"
return(res)
}
#' coef.spar
#'
#' Extract coefficients from \code{'spar'} object
#' @param object result of [spar] function of class \code{'spar'}.
#' @param nummod number of models used to form coefficients; value with minimal
#' validation \code{measure} is used if not provided.
#' @param nu threshold level used to form coefficients; value with minimal
#' validation \code{measure} is used if not provided.
#' @param aggregate character one of c("mean", "median", "none"). If set to "none"
#' the coefficients are not aggregated over the marginal models, otherwise
#' the coefficients are aggregated using the specified method (mean or median).
#' Defaults to mean aggregation.
#' @param ... further arguments passed to or from other methods
#' @return object of class \code{'coefspar'} which is a list with elements
#' \itemize{
#' \item \code{intercept} intercept value
#' \item \code{beta} vector of length p of averaged coefficients
#' \item \code{nummod} number of models based on which the coefficient is computed
#' \item \code{nu} threshold based on which the coefficient is computed
#' }
#' @seealso [print.coefspar], [summary.coefspar]
#' @export
#' @method coef spar
coef.spar <- function(object,
nummod = NULL,
nu = NULL,
aggregate = c("mean", "median", "none"),
...) {
aggregate <- match.arg(aggregate)
if (is.null(nummod) & is.null(nu)) {
best_ind <- which.min(object$val_res$measure)
par <- object$val_res[best_ind,]
nummod <- par$nummod
nu <- par$nu
} else if (is.null(nummod)) {
if (!nu %in% object$val_res$nu) {
stop("Nu needs to be among the previously considered values when nummod is not provided!")
}
tmp_val_res <- object$val_res[object$val_res$nu==nu,]
nummod <- tmp_val_res$nummod[which.min(tmp_val_res$measure)]
} else if (is.null(nu)) {
if (!nummod %in% object$val_res$nummod) {
stop("Number of models needs to be among the previously fitted values when nu is not provided!")
}
tmp_val_res <- object$val_res[object$val_res$nummod==nummod,]
nu <- tmp_val_res$nu[which.min(tmp_val_res$measure)]
} else {
if (length(nummod)!=1 | length(nu)!=1) {
stop("Length of nummod and nu must be 1!")
}
}
if (nummod > ncol(object$betas)) {
warning("Number of models is too high, maximum of number of models use to fit the models is used instead!")
nummod <- ncol(object$betas)
}
# calc for chosen parameters
final_coef <- object$betas[object$xscale>0, seq_len(nummod), drop=FALSE]
final_coef[abs(final_coef) < nu] <- 0
p <- length(object$xscale)
if (aggregate == "none") {
beta <- matrix(0, nrow = p, ncol = nummod)
beta_std <- final_coef
beta[object$xscale>0,] <- as.matrix(object$yscale *
beta_std/(object$xscale[object$xscale>0]))
rownames(beta) <- rownames(object$betas)
colnames(beta) <- paste0("Model_", seq_len(nummod))
intercept <- drop(object$ycenter + object$intercepts[seq_len(nummod)] -
crossprod(object$xcenter, beta))
names(intercept) <- colnames(beta)
} else {
avg_fun <- switch(aggregate,
"mean" = function(x) mean(x, na.rm = TRUE),
"median" = function(x) median(x, na.rm = TRUE),
"Aggregration method not implemented.")
beta <- numeric(p)
beta_std <- apply(final_coef, 1, avg_fun)
beta[object$xscale>0] <- object$yscale * beta_std/(object$xscale[object$xscale>0])
names(beta) <- rownames(object$betas)
intercept <- object$ycenter + avg_fun(object$intercepts[seq_len(nummod)]) - sum(object$xcenter*beta)
names(intercept) <- "(Intercept)"
}
res <- list(intercept = intercept,
beta = beta,
nummod = nummod,
nu = nu)
class(res) <- "coefspar"
best_ind <- which.min(object$val_res$measure)
par <- object$val_res[best_ind,]
attr(res, "M_best") <- par$nummod
attr(res, "nu_best") <- par$nu
attr(res, "M_nu_combination") <- ifelse(nummod == par$nummod & nu == par$nu,
"best", "given")
attr(res, "aggregate") <- aggregate
attr(res, "parent_object") <- class(object)
return(res)
}
#' Print method for coefspar objects
#'
#' Print method showing the basic components of a \code{'coefspar'} object.
#'
#' @param x An object of class \code{'coefspar'}, typically created by a custom model function.
#' @param digits integer digits to be printed, defaults to 4L.
#' @param show integer number of coefficients to be shown, defaults to 6L.
#' @param ... Additional arguments passed to or from other methods (ignored here).
#'
#' @return Invisibly returns the input object \code{x}.
#' @examples
#' example_data <- simulate_spareg_data(n = 200, p = 2000, ntest = 100)
#' spar_res <- spareg(example_data$x, example_data$y, xval = example_data$xtest,
#' yval = example_data$ytest, nummods=c(5, 10, 15, 20, 25, 30))
#' coef(spar_res)
#' coef(spar_res, aggregate = "median")
#' coef(spar_res, aggregate = "none")
#' print(coef(spar_res), show = 10L, digits = 6L)
#' @export
#' @method print coefspar
print.coefspar <- function(x, digits = 4L, show = 6L, ...) {
cat(sprintf("Coefficients from %s object:\n", attr(x, "parent_object")))
cat(sprintf("Based on the %s selection\n\n",
switch(attr(x, "M_nu_combination"),
"best" = "*best rule* (min error)",
"1se" = "*1se rule*",
"given"= "*given* nummod and nu")))
if (!is.null(attr(x, "aggregate"))) {
cat(sprintf("Aggregation method over models: %s\n",
attr(x, "aggregate")))
}
cat("Selected combination: ", x$nummod, " models, threshold = ",
x$nu, "\n")
cat("\n")
if (attr(x, "M_nu_combination") != "best") {
cat("Best combination overall (min error):",
attr(x, "M_best"), "models, threshold =",
attr(x, "nu_best"), "\n")
}
if (attr(x, "M_nu_combination") != "1se") {
if (!is.null(attr(x, "M_1se")) & !is.null(attr(x, "nu_1se"))) {
cat("1se combination:", attr(x, "M_1se"), "models, threshold =",
attr(x, "nu_1se"), "\n")
}
}
cat("\n")
# Coefficient vectors or matrices
cat("Coefficients:\n\n")
if (attr(x, "aggregate") == "none") {
## No aggregation ----
coefs <- rbind("(Intercept)" = x$intercept, x$beta)
shown <- head(coefs, show)
# Assign default names if unnamed
# if (any(is.null(rownames(shown))) || any(rownames(shown) == "")) {
# rownames(shown)[-1] <- paste0("V", seq_along(shown - 1))
# }
# Print header and values
print(noquote(format(round(shown, digits), nsmall = digits,
justify = "right")))
# Add inline ...
if (nrow(coefs) > show) {
cat("\n...", sprintf("(%d rows not shown)\n\n", nrow(coefs) - show))
}
cat("Number of active variables: \n")
no_non_zero_coefs <- paste0(colSums(x$beta != 0), "/", nrow(x$beta))
names(no_non_zero_coefs) <- colnames(x$beta)
print(noquote(format(no_non_zero_coefs, nsmall = digits,
justify = "right")))
cat("\n")
} else {
## Aggregation ----
coefs <- c(x$intercept, x$beta)
shown <- head(coefs, show)
# Assign default names if unnamed
if (is.null(names(shown)) || any(names(shown) == "")) {
names(shown) <- paste0("V", seq_along(shown) - 1)
names(shown)[1] <- "(Intercept)"
}
# Print header and values
print(noquote(format(round(shown, digits), nsmall = digits,
justify = "right")))
# Add inline ...
if (length(coefs) > show) {
cat("\n...", sprintf("(%d coefficients not shown)\n\n",
length(coefs) - show))
}
cat("Number of active variables: ",
paste0(sum(x$beta != 0), "/", length(x$beta)), "\n\n")
}
invisible(x)
}
#' Summary method for coefspar objects
#'
#' Provides a summary of a \code{coefspar} object.
#'
#' @param object An object of class \code{coefspar}.
#' @param digits integer digits to be printed, defaults to 4L.
#' @param ... Additional arguments (ignored).
#' @return Invisibly returns \code{object}.
#' @examples
#' example_data <- simulate_spareg_data(n = 200, p = 2000, ntest = 100)
#' spar_res <- spareg(example_data$x, example_data$y, xval = example_data$xtest,
#' yval = example_data$ytest, nummods=c(5, 10, 15, 20, 25, 30))
#' summary(coef(spar_res))
#' summary(coef(spar_res, aggregate = "none"))
#' @export
#' @method summary coefspar
summary.coefspar <- function(object, digits = 4L, ...) {
stopifnot(inherits(object, "coefspar"))
cat(sprintf(
"Summary of coefficients from %s object:\n", attr(object, "parent_object")))
cat(sprintf("Based on the %s selection\n\n",
switch(attr(object, "M_nu_combination"),
"best" = "*best rule* (min error)",
"1se" = "*1se rule*",
"given"= "*given* nummod and nu")))
if (!is.null(attr(object, "aggregate"))) {
cat(sprintf("Aggregation method over models: %s\n",
attr(object, "aggregate")))
}
cat("Selected combination: ", object$nummod, " models, threshold = ",
object$nu, "\n")
cat("\n")
if (attr(object, "M_nu_combination") != "best") {
cat("Best combination overall (min error):",
attr(object, "M_best"), "models, threshold =",
attr(object, "nu_best"), "\n\n")
}
if (attr(object, "M_nu_combination") != "1se") {
if (!is.null(attr(object, "M_1se")) & !is.null(attr(object, "nu_1se"))) {
cat("1se combination:", attr(object, "M_1se"), "models, threshold =",
attr(object, "nu_1se"), "\n\n")
}
}
if (attr(object, "aggregate") == "none") {
cat("Number of coefficients equal to zero across all models:",
paste0(sum(rowMeans(object$beta) == 0), "/", nrow(object$beta)), "\n")
cat("Number of coefficients non-zero across all models:",
paste0(sum(rowMeans(object$beta != 0) == 1), "/", nrow(object$beta)), "\n\n")
} else {
cat("Number of active coefficients:",
paste0(sum(object$beta != 0), "/", length(object$beta)), "\n\n")
}
if (attr(object, "aggregate") == "none") {
cat("Intercept:\n ")
}
print(round(object$intercept, digits))
cat("Coefficient summary (beta):\n")
print(summary(object$beta, digits = digits))
invisible(object)
}
#' predict.spar
#'
#' Predict responses for new predictors from \code{'spar'} object
#' @param object result of spar function of class \code{'spar'}.
#' @param xnew matrix of new predictor variables; must have same number of columns as \code{x}.
#' @param type the type of required predictions; either on response level (default) or on link level
#' @param avg_type type of averaging the marginal models; either on link (default) or on response level
#' @param nummod number of models used to form coefficients; value with minimal validation measure is used if not provided.
#' @param nu threshold level used to form coefficients; value with minimal validation measure is used if not provided.
#' @param aggregate character one of c("mean", "median");
#' the aggregation over the ensembles is done using the specified method (mean or median).
#' Defaults to mean aggregation.
#' @param ... further arguments passed to or from other methods
#' @return Vector of predictions
#' @export
predict.spar <- function(object,
xnew = NULL,
type = c("response", "link"),
avg_type = c("link","response"),
nummod = NULL,
nu = NULL,
aggregate = c("mean", "median"),
...) {
if (is.null(xnew)) {
stop("No 'xnew' provided. This 'spar' object does not retain training data.",
"Please provide xnew explicitly.",
"If you want to predict in-sample, use the original data used for fitting the model as xnew.\n")
}
if (ncol(xnew) != length(object$xscale)) {
stop("xnew must have same number of columns as initial x!")
}
type <- match.arg(type)
avg_type <- match.arg(avg_type)
aggregate <- match.arg(aggregate)
object$family <- eval(parse(text = object$family))
if (avg_type != object$avg_type && object$family$link != "identity") {
warning("The best model combination was selected for ",
paste0(object$avg_type, " averaging, but avg_type = ", avg_type,
" is used for prediction. This may lead to suboptimal results."))
}
coefs_avg <- coef(object, nummod, nu, aggregate = aggregate)
eta <- as.numeric(xnew %*% coefs_avg$beta + coefs_avg$intercept)
if (avg_type == "link") {
res <- if (type == "link") eta else object$family$linkinv(eta)
} else {
if (type == "link") {
res <- eta
} else {
avg_fun <- switch(aggregate,
"mean" = function(x) mean(x, na.rm = TRUE),
"median" = function(x) median(x, na.rm = TRUE),
"Aggregration method not implemented.")
coefs_all <- coef(object, nummod, nu, aggregate = "none")
eta_all <- sweep(xnew %*% coefs_all$beta, coefs_all$intercept, MARGIN = 2, FUN = "+")
preds <- object$family$linkinv(eta_all)
res <- apply(preds, 1, avg_fun)
}
}
return(res)
}
#'
#' plot.spar
#'
#' @description
#' Plot values of validation measure or number of active variables over different thresholds or number of models for \code{'spar'} object, or residuals vs fitted
#'
#' @param x result of spar function of class \code{'spar'}.
#' @param plot_type one of \code{c("val_measure", "val_numactive", "res_vs_fitted", "coefs")}.
#' @param plot_along one of \code{c("nu","nummod")}; ignored when \code{plot_type = "res_vs_fitted"}.
#' @param nummod fixed value for number of models when \code{plot_along = "nu"}
#' for \code{plot_type = "val_measure"} or \code{"val_numactive"};
#' same as for \code{\link{predict.spar}} when \code{plot_type="res_vs_fitted"}.
#' @param nu fixed value for \eqn{\nu} when \code{plot_along="nummod"} for
#' \code{plot_type = "val_measure"} or \code{"val_numactive"}; same as for \code{\link{predict.spar}} when \code{plot_type="res_vs_fitted"}.
#' @param xfit data used for predictions in \code{"res_vs_fitted"}.
#' @param yfit data used for predictions in \code{"res_vs_fitted"}.
#' @param prange optional vector of length 2 for \code{"coefs"}-plot to give
#' the limits of the predictors' plot range; defaults to \code{c(1, p)}.
#' @param coef_order optional index vector of length p for \code{plot_type = "coefs"} to give
#' the order of the predictors; defaults to \code{1 : p}.
#' @param digits number of significant digits to be displayed in the axis; defaults to 2L.
#' @param ... further arguments passed to or from other methods
#'
#' @return \code{'\link[ggplot2:ggplot]{ggplot2::ggplot}'} object
#'
#' @import ggplot2
#'
#' @export
#'
plot.spar <- function(x,
plot_type = c("val_measure","val_numactive","res_vs_fitted","coefs"),
plot_along = c("nu","nummod"),
nummod = NULL,
nu = NULL,
xfit = NULL,
yfit = NULL,
prange = NULL,
coef_order = NULL,
digits = 2L, ...) {
spar_res <- x
plot_type <- match.arg(plot_type)
plot_along <- match.arg(plot_along)
mynummod <- nummod
if (plot_type == "res_vs_fitted") {
if (is.null(xfit) | is.null(yfit)) {
stop("xfit and yfit need to be provided for res_vs_fitted plot!")
}
pred <- predict(spar_res, xnew = xfit, nummod = nummod, nu = nu,
type = "response")
res <- ggplot2::ggplot(data = data.frame(fitted=pred,
residuals=yfit-pred),
ggplot2::aes(x=.data$fitted,y=.data$residuals)) +
ggplot2::geom_point() +
ggplot2::geom_hline(yintercept = 0,linetype=2,linewidth=0.5)
} else if (plot_type == "val_measure") {
if (plot_along=="nu") {
if (is.null(nummod)) {
mynummod <- spar_res$val_res$nummod[which.min(spar_res$val_res$measure)]
tmp_title <- "Fixed optimal nummod="
} else {
tmp_title <- "Fixed given nummod="
}
tmp_df <- spar_res$val_res[spar_res$val_res$nummod==mynummod, ]
ind_min <- which.min(tmp_df$measure)
res <- ggplot2::ggplot(data = tmp_df,
ggplot2::aes(x=.data$nnu,y=.data$measure)) +
ggplot2::geom_point() +
ggplot2::geom_line() +
ggplot2::scale_x_continuous(breaks=seq(1,nrow(tmp_df)),
labels=formatC(tmp_df$nu,
format = "e", digits = digits)) +
ggplot2::labs(x=expression(nu),y=spar_res$measure) +
ggplot2::geom_point(data=data.frame(x=tmp_df$nnu[ind_min],
y=tmp_df$measure[ind_min]),
ggplot2::aes(x=.data$x,y=.data$y),col="red") +
ggplot2::ggtitle(paste0(tmp_title,mynummod))
} else {
if (is.null(nu)) {
nu <- spar_res$val_res$nu[which.min(spar_res$val_res$measure)]
tmp_title <- "Fixed optimal "
} else {
tmp_title <- "Fixed given "
}
tmp_df <- spar_res$val_res[spar_res$val_res$nu == nu, ]
ind_min <- which.min(tmp_df$measure)
res <- ggplot2::ggplot(data = tmp_df,
ggplot2::aes(x=.data$nummod,y=.data$measure)) +
ggplot2::geom_point() +
ggplot2::geom_line() +
ggplot2::labs(y=spar_res$measure) +
ggplot2::geom_point(data=data.frame(x=tmp_df$nummod[ind_min],y=tmp_df$measure[ind_min]),
ggplot2::aes(x=.data$x,y=.data$y),col="red")+
ggplot2::ggtitle(substitute(paste(txt,nu,"=",v),list(txt=tmp_title,v=round(nu,3))))
}
} else if (plot_type=="val_numactive") {
if (plot_along=="nu") {
if (is.null(nummod)) {
mynummod <- spar_res$val_res$nummod[which.min(spar_res$val_res$measure)]
tmp_title <- "Fixed optimal nummod="
} else {
tmp_title <- "Fixed given nummod="
}
tmp_df <- spar_res$val_res[spar_res$val_res$nummod==mynummod, ]
ind_min <- which.min(tmp_df$measure)
res <- ggplot2::ggplot(data = tmp_df,ggplot2::aes(x=.data$nnu,y=.data$numactive)) +
ggplot2::geom_point() +
ggplot2::geom_line() +
# ggplot2::scale_x_continuous(breaks=seq(1,nrow(spar_res$val_res),1),labels=round(spar_res$val_res$nu,3)) +
ggplot2::scale_x_continuous(breaks=seq(1,nrow(spar_res$val_res),1),
labels=formatC(spar_res$val_res$nu[seq(1,nrow(spar_res$val_res),1)],
format = "e", digits = digits)) +
ggplot2::labs(x=expression(nu)) +
ggplot2::geom_point(data=data.frame(x=tmp_df$nnu[ind_min],y=tmp_df$numactive[ind_min]),
ggplot2::aes(x=.data$x,y=.data$y),col="red")+
ggplot2::ggtitle(paste0(tmp_title,mynummod))
} else {
if (is.null(nu)) {
nu <- spar_res$val_res$nu[which.min(spar_res$val_res$measure)]
tmp_title <- "Fixed optimal "
} else {
tmp_title <- "Fixed given "
}
tmp_df <- spar_res$val_res[spar_res$val_res$nu==nu, ]
ind_min <- which.min(tmp_df$measure)
res <- ggplot2::ggplot(data = tmp_df,ggplot2::aes(x=.data$nummod,y=.data$numactive)) +
ggplot2::geom_point() +
ggplot2::geom_line() +
ggplot2::geom_point(
data=data.frame(x=tmp_df$nummod[ind_min],
y=tmp_df$numactive[ind_min]),
ggplot2::aes(x = .data$x,y=.data$y),col="red")+
ggplot2::ggtitle(substitute(paste(txt,nu,"=",v),
list(txt=tmp_title,v=round(nu,3))))
}
} else if (plot_type=="coefs") {
p <- nrow(spar_res$betas)
nummod <- ncol(spar_res$betas)
if (is.null(prange)) {
prange <- c(1,p)
}
if (is.null(coef_order)) {
coef_order <- 1:p
}
tmp_mat <- data.frame(t(apply(as.matrix(spar_res$betas)[coef_order,],1,
function(row)row[order(abs(row),decreasing = TRUE)])),
predictor=1:p)
colnames(tmp_mat) <- c(seq_len(nummod),"predictor")
tmp_df <- reshape(tmp_mat, idvar = "predictor",
varying = seq_len(nummod),
v.names = "value",
timevar = "marginal model",
direction = "long")
tmp_df$`marginal model` <- as.numeric(tmp_df$`marginal model`)
mrange <- max(Matrix::rowSums(spar_res$betas != 0))
res <- ggplot2::ggplot(tmp_df,ggplot2::aes(x=.data$predictor,
y=.data$`marginal model`,
fill=.data$value)) +
ggplot2::geom_tile() +
ggplot2::scale_fill_gradient2() +
ggplot2::coord_cartesian(xlim=prange,ylim=c(1,mrange)) +
ggplot2::theme_bw() +
ggplot2::ylab("Index of marginal model") +
ggplot2::theme(panel.border = ggplot2::element_blank())
} else {
res <- NULL
}
return(res)
}
#' print.spar
#'
#' Print summary of \code{'spar'} object
#' @param x result of [spar] function of class \code{'spar'}.
#' @param ... further arguments passed to or from other methods
#' @return text summary
#' @export
print.spar <- function(x, ...) {
mycoef <- coef(x)
beta <- mycoef$beta
measure <- x$val_res$measure[mycoef$nu == x$val_res$nu &
mycoef$nummod == x$val_res$nummod ]
if (nrow(x$val_res) == 1) {
cat(sprintf("spar object: \nValidation measure (%s) of %s reached for nummod=%d,
nu=%s leading to %d / %d active predictors.\n",
x$measure,
formatC(measure,digits = 2,format = "e"),
mycoef$nummod, formatC(mycoef$nu,digits = 2,format = "e"),
sum(beta!=0),length(beta)))
} else {
cat(sprintf("spar object:\nSmallest validation measure (%s) of %s reached for nummod=%d,
nu=%s leading to %d / %d active predictors.\n",
x$measure,
formatC(measure,digits = 2,format = "e"),
mycoef$nummod, formatC(mycoef$nu,digits = 2,format = "e"),
sum(beta!=0),length(beta)))
}
cat("Summary of those non-zero coefficients:\n")
print(summary(beta[beta!=0]))
}
#' Extractor for model coefficients from `\code{coefspar}' object
#' @param x A `\code{coefspar}' object.
#' @return A numeric vector or matrix of coefficients.
#' @seealso [coef.spar], [coef.spar.cv], [print.coefspar], [summary.coefspar]
#' @export
get_coef <- function(x) {
stopifnot(inherits(x, "coefspar"))
x$beta
}
#' Extractor for model intercept from `\code{coefspar}' object
#' @param x A `\code{coefspar}' object.
#' @return Intercept (numeric or vector).
#' @export
get_intercept <- function(x) {
stopifnot(inherits(x, "coefspar"))
x$intercept
}
#' Extractor of model from a '\code{spar}' or '\code{spar.cv}' object
#'
#' @param object A fitted '\code{spar}' or '\code{spar.cv}' model
#' @param opt_par One of "best", "1se"
#'
#' @return A '\code{spar}' or '\code{spar.cv}' object where the beta and intercept elements are
#' the ones which correspond to the best or the 1se model.
#' @examples
#' example_data <- simulate_spareg_data(n = 100, p = 400, ntest = 100)
#' spar_res <- spar(example_data$x, example_data$y, xval = example_data$xtest,
#' yval = example_data$ytest, nummods=c(5, 10, 15, 20, 25, 30))
#' best_model <- get_model(spar_res, opt_par = "best")
#' \donttest{
#' spar_cv <- spar.cv(example_data$x, example_data$y,
#' nummods = c(5, 10, 15, 20, 25, 30), nfolds = 4)
#' best_model_cv <- get_model(spar_cv, opt_par = "best")
#' onese_model_cv <- get_model(spar_cv, opt_par = "1se")
#' }
#' @export
get_model <- function(object, opt_par = c("best", "1se")) {
stopifnot(inherits(object, "spar") || inherits(object, "spar.cv"))
opt_nunum <- match.arg(opt_par)
if (opt_nunum == "1se" & inherits(object, "spar")) {
stop("1se model is not available for spar objects, use spar.cv instead.")
}
# best
if(inherits(object, "spar.cv")) {
val_table <- compute_val_summary(object$val_res)
best_ind <- which.min(val_table$mean_measure)
}
if(inherits(object, "spar")) {
val_table <- object$val_res
best_ind <- which.min(val_table$measure)
}
parbest <- val_table[best_ind,]
# 1se model
if (inherits(object, "spar.cv")) {
allowed_ind <- val_table$mean_measure <
(val_table$mean_measure + val_table$sd_measure)[best_ind]
ind_1cv <- which.min(val_table$mean_numactive[allowed_ind])
par1se <- val_table[allowed_ind,][ind_1cv,]
}
nummod <- ifelse(opt_nunum == "best", parbest$nummod,
par1se$nummod)
nu <- ifelse(opt_nunum == "best", parbest$nu, par1se$nu)
final_coef <- object$betas[, seq_len(nummod), drop=FALSE]
final_coef[abs(final_coef) < nu] <- 0
intercepts <- object$intercepts[seq_len(nummod)]
object$betas <- final_coef
object$intercepts <- intercepts
object$val_res <- object$val_res[
object$val_res$nummod == nummod & object$val_res$nu == nu, ,
drop = FALSE]
return(object)
}
#' Extractor for (cross-)validation measure from '\code{spar}' or '\code{spar.cv}' object
#' @param object A fitted '\code{spar}' or '\code{spar.cv}' model
#' @return data.frame containing the (cross-)validation measure for the considered threshold and number of model combinations.
#' For '\code{spar}' objects it contains information about the measure calculated on the validation set (or on the training sample if
#' xval and yval are missing) and the number of active variables. For '\code{spar.cv}' objects it contains information
#' on the average measure obtained across folds together with the standard deviation across the folds and the average number of active variables.
#' the \code{nfolds} of the training set.
#' @examples
#' example_data <- simulate_spareg_data(n = 100, p = 400, ntest = 100)
#' spar_res <- spar(example_data$x, example_data$y, xval = example_data$xtest,
#' yval = example_data$ytest, nummods=c(5, 10, 15, 20, 25, 30))
#' get_measure(spar_res)
#'
#' @seealso [spar], [spar.cv], [get_model]
#' @export
get_measure <- function(object) {
stopifnot(inherits(object, "spar") || inherits(object, "spar.cv"))
if(inherits(object, "spar.cv")) {
val_table <- compute_val_summary(object$val_res)
colnames(val_table)[4] <- paste0("mean_", object$measure)
colnames(val_table)[5] <- paste0("sd_", object$measure)
colnames(val_table)[6] <- "mean_numactive"
}
if(inherits(object, "spar")) {
val_table <- object$val_res
colnames(val_table)[4] <- object$measure
colnames(val_table)[5] <- "numactive"
}
val_table[, !(colnames(val_table) %in% c("nnu"))]
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.