Nothing
#'
#' @importFrom stats coef sd median mad cor pf
#'
#' @title Robust Split Least Angle Regression
#'
#' @description \code{srlars} performs split robust least angle regression.
#'
#' @param x Design matrix.
#' @param y Response vector.
#' @param n_models Number of models into which the variables are split.
#' @param model_saturation Criterion to determine if a model is saturated. Must be one of "fixed" (default) or "p-value".
#' @param alpha P-value used to determine when the model is saturated
#' @param model_size Size of the models in the ensemble.
#' @param robust Argument to determine if robust measures of location, scale and correlation are used. Default is TRUE.
#' @param compute_coef Argument to determine if coefficients are computed (via the elastic net) for each model. Default is FALSE.
#' @param en_alpha Elastic net mixing parmeter for parameters shrinkage. Default is 1/4.
#'
#' @return An object of class srlars
#'
#' @export
#'
#' @author Anthony-Alexander Christidis, \email{anthony.christidis@stat.ubc.ca}
#'
#' @seealso \code{\link{coef.srlars}}, \code{\link{predict.srlars}}
#'
#' @examples
#' # Required library
#' library(mvnfast)
#'
#' # Simulation parameters
#' n <- 50
#' p <- 500
#' rho.within <- 0.8
#' rho.between <- 0.2
#' p.active <- 100
#' group.size <- 25
#' snr <- 3
#' contamination.prop <- 0.2
#'
#' # Setting the seed
#' set.seed(0)
#'
#' # Block correlation structure
#' sigma.mat <- matrix(0, p, p)
#' sigma.mat[1:p.active, 1:p.active] <- rho.between
#' for(group in 0:(p.active/group.size - 1))
#' sigma.mat[(group*group.size+1):(group*group.size+group.size),
#' (group*group.size+1):(group*group.size+group.size)] <- rho.within
#' diag(sigma.mat) <- 1
#'
#' # Simulation of beta vector
#' true.beta <- c(runif(p.active, 0, 5)*(-1)^rbinom(p.active, 1, 0.7), rep(0, p - p.active))
#'
#' # Setting the SD of the variance
#' sigma <- as.numeric(sqrt(t(true.beta) %*% sigma.mat %*% true.beta)/sqrt(snr))
#'
#' # Simulation of uncontaminated data
#' x <- mvnfast::rmvn(n, mu = rep(0, p), sigma = sigma.mat)
#' y <- x %*% true.beta + rnorm(n, 0, sigma)
#'
#' # Contamination of data
#' contamination_indices <- 1:floor(n*contamination.prop)
#' k_lev <- 2
#' k_slo <- 100
#' x_train <- x
#' y_train <- y
#' beta_cont <- true.beta
#' beta_cont[true.beta!=0] <- beta_cont[true.beta!=0]*(1 + k_slo)
#' beta_cont[true.beta==0] <- k_slo*max(abs(true.beta))
#' for(cont_id in contamination_indices){
#'
#' a <- runif(p, min = -1, max = 1)
#' a <- a - as.numeric((1/p)*t(a) %*% rep(1, p))
#' x_train[cont_id,] <- mvnfast::rmvn(1, rep(0, p), 0.1^2*diag(p)) +
#' k_lev * a / as.numeric(sqrt(t(a) %*% solve(sigma.mat) %*% a))
#' y_train[cont_id] <- t(x_train[cont_id,]) %*% beta_cont
#' }
#'
#' # Ensemble models
#' ensemble_fit <- srlars(x_train, y_train,
#' n_models = 5,
#' model_saturation = c("fixed", "p-value")[1],
#' alpha = 0.05, model_size = n - 1,
#' robust = TRUE,
#' compute_coef = TRUE,
#' en_alpha = 1/4)
#'
#' # Ensemble coefficients
#' ensemble_coefs <- coef(ensemble_fit, group_index = 1:ensemble_fit$n_models)
#' sens_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/p.active
#' spec_ensemble <- sum(which((ensemble_coefs[-1]!=0)) <= p.active)/sum(ensemble_coefs[-1]!=0)
#'
#' # Simulation of test data
#' m <- 2e3
#' x_test <- mvnfast::rmvn(m, mu = rep(0, p), sigma = sigma.mat)
#' y_test <- x_test %*% true.beta + rnorm(m, 0, sigma)
#'
#' # Prediction of test samples
#' ensemble_preds <- predict(ensemble_fit, newx = x_test,
#' group_index = 1:ensemble_fit$n_models,
#' dynamic = FALSE)
#' mspe_ensemble <- mean((y_test - ensemble_preds)^2)/sigma^2
#'
srlars <- function(x, y,
n_models = 1,
model_saturation = c("fixed", "p-value")[1],
alpha = 0.05,
model_size = NULL,
robust = TRUE,
compute_coef = FALSE,
en_alpha = 1/4){
# Data input check
DataCheck(x, y,
n_models,
model_saturation,
alpha,
model_size,
robust,
compute_coef,
en_alpha)
# Shuffle the data
n <- nrow(x)
p <- ncol(x)
random.permutation <- sample(1:n, n)
x <- x[random.permutation, ]
y <- y[random.permutation]
# Model size check
if(model_saturation == "fixed"){
if(is.null(model_size))
model_size <- n - 1 else if(model_size >= n)
stop("The size of the models cannot be equal or exceed n.")
}
# Standarization predictors and response
# Computation of correlation for predictors and response
if(robust){
# Standardization of predictors and response
x.std <- apply(x, 2, function(x) return((x - median(x))/mad(x)))
y.std <- (y - median(y))/mad(y)
# Computation of correlations for predictors and response
xy.std <- cbind(x.std, y.std)
DDCxy <- cellWise::DDC(xy.std, DDCpars = list(fastDDC = TRUE, silent = TRUE))
rob.cor <- cor(DDCxy$Ximp)
Rx <- rob.cor[-nrow(rob.cor),-ncol(rob.cor)]
Ry <- rob.cor[-nrow(rob.cor), ncol(rob.cor)]
# Full data
DDCxy <- cellWise::DDC(cbind(x, y), DDCpars = list(fastDDC = TRUE, silent = TRUE))
x_imp <- DDCxy$Ximp[, -ncol(DDCxy$Ximp)]
y_imp <- DDCxy$Ximp[, ncol(DDCxy$Ximp)]
} else{
# Standardization of predictors and response
x.std <- apply(x, 2, function(x) return((x - mean(x))/sd(x)))
y.std <- (y - mean(y))/sd(y)
# Computation of correlations for predictors and response
xy.std <- cbind(x.std, y.std)
CORxy <- cor(xy.std)
Rx <- CORxy[-nrow(CORxy), -ncol(CORxy)]
Ry <- CORxy[-nrow(CORxy), ncol(CORxy)]
# Full data
x_imp <- x
y_imp <- y
}
# Initialize LARS models
lars_models <- list()
lars_models <- initialize_ensembles(lars_models,
x_imp, y_imp,
n_models, Ry)
# Vector of unsaturated models
unsaturated_models <- 1:n_models
# Vector of p-values
p_values <- numeric(5)
# Finding optimal predictor
for(model_id in unsaturated_models)
lars_models[[model_id]] <- find_optimal(lars_models[[model_id]],
x_imp, y_imp,
Rx, Ry,
model_saturation, alpha)
n_available <- p - n_models
# Looping over the models
while(length(unsaturated_models) != 0 & (n_available > 1)){
# Finding optimal unsaturated model
p_values <- sapply(lars_models, function(x) x$p_value)
p_values[-c(unsaturated_models)] <- Inf
optimal_model <- which.min(p_values)
optimal_predictor <- lars_models[[optimal_model]][["optimal_predictor"]]
# Add optimal predictor for optimal model
lars_models[[optimal_model]] <- add_optimal_predictor(lars_models[[optimal_model]],
model_saturation, model_size, n)
n_available <- n_available - 1
# Update optimal model
if(!lars_models[[optimal_model]][["saturated"]])
lars_models[[optimal_model]] <- find_optimal(lars_models[[optimal_model]],
x_imp, y_imp,
Rx, Ry,
model_saturation, alpha)
# Remove optimal predictor for non-optimal unsaturated models
for(model_id in unsaturated_models[-c(which(unsaturated_models == optimal_model))])
lars_models[[model_id]] <- remove_optimal_predictor(lars_models[[model_id]], optimal_predictor,
x_imp, y_imp,
model_saturation, alpha)
# Update unsaturated models
unsaturated_models <- which(!sapply(lars_models, function(x) x$saturated))
}
# Selected predictors
selections <- lapply(lars_models, function(x) x$model_predictors)
# Creating the list for the output
output <- list(x = x, y = y,
n_models = n_models,
model_saturation = model_saturation,
alpha = alpha,
model_size = model_size,
robust = robust,
compute_coef = compute_coef,
selections = selections,
intercepts = list(), coefficients = list(),
DDCx = cellWise::DDC(x, DDCpars = list(fastDDC = TRUE, silent = TRUE)))
# Computation of final coefficients
if(compute_coef){
for(model_id in 1:n_models){
en_fit <- glmnet::cv.glmnet(x_imp[, output$selections[[model_id]]], y_imp,
alpha = en_alpha)
output$intercepts[[model_id]] <- coef(en_fit, s = "lambda.min")[1]
output$coefficients[[model_id]] <- numeric(p)
output$coefficients[[model_id]][output$selections[[model_id]]] <- coef(en_fit, s = "lambda.min")[-1]
}
}
# Create the object of class "srlars"
class(output) <- append("srlars", class(output))
# Returning the output from the stepwise algorithm
return(output)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.