#' Run BCVO regression for high dimensional data
#'
#' This function first uses a weighting method to assign weights to
#' each subset in a list of candidate subsets, which are in turn
#' generated by penalized regression methods, then run BC on
#' the remaining data to obtain the significant subset,
#' and finally refit all the data with least squares.
#'
#' @param dat List containing design matrix X and observation y
#' @param hd_methods Vector of strings ("MCP", "SCAD", "lasso") indicating which
#' penalized regression methods are used to generate initial candidate subsets
#' @param dfmax Integer of maximum number of variables selected by each
#' penalized regression, default to floor(sqrt(n)) where n is the sample size
#' @param ratio Value (0 to 1) of proportion of data in obtaining initial
#' candidate subsets
#' @param weight_method String indicating which method to use for weighting
#' @param criteria Vector of strings ('GIC2, GICn, Cp, AIC, BIC, BC') indicating criteria to use
#' in selecting the final subset
#' @param penaltyBC Vector of non-default penalty values in using BC, default to NULL so that
#' only the suggested value n^(1/3) will be considered. If other values are given, the default
#' will be overwritten.
#' @param adaptiveBC Boolean indicating whether to use adaptive BC, default to FALSE so that
#' only the values in penaltyBC will be considered. If TRUE then penaltyBC is appened
#' with another data-driven value
#' @param methodPI String ('BC', 'Drop'. or 'Both') indicating the method to calculate BC, default to 'BC'
#' @param dat_test Matrix of design that is used for comparing performance, default to NULL
#' @return var_imp PI Vector of variable importance
#' @return cri_name Vector of criteria used
#' @return cri_selection List of vectors of the selected subsets
#' @return cri_coefs List of vectors of the effective coefficients
#' @return PI Vector of PI corresponding to each BC penalty in 'penaltyBC'
#' @return err_bc_x Vector of residuals
#' @return loss Vector of prediction loss on test data, default to NA
#' @return corr Vector of correlation between prediction and truth on test data, default to NA
#' @return nOver Vector of the numbers of overfitted variables on test data, default to NA
#' @return nUnder Vector of the numbers of underfitted variables on test data, default to NA
#' @export
#' @examples
#' n <- 150
#' p <- 200
#' X <- genDesignMat(n,p,0.5)
#' beta <- rep(0, p)
#' beta[c(99,199)] = c(10, 5)
#' mu <- X %*% as.matrix(beta, ncol=1)
#' y <- mu + stats::rnorm(n)
#' dat <- list(X = X, y= y)
#' res <- regBCVO(dat)
regBCVO <- function(dat,
hd_methods = c("MCP", "SCAD", "lasso"),
dfmax = NULL,
ratio = 0.7,
weight_method = 'ARM',
criteria = c('BC'),
penaltyBC = NULL,
adaptiveBC = FALSE,
methodPI = 'BC',
dat_test = NULL){
X <- dat$X
p <- dim(X)[2]
y <- dat$y
n <- length(y)
if (is.null(dfmax)){
dfmax <- floor(sqrt(n))
}
# get solution path from partial data
# random subsample instead of 1:n1 is essential for (non-iid) real data
n1 <- floor(n * ratio)
ind_n1 <- sample(1:n, n1, replace = FALSE)
X1 <- X[ind_n1,]
y1 <- y[ind_n1]
X2 <- X[-ind_n1,]
y2 <- y[-ind_n1]
n_methods <- length(hd_methods)
initCandidates <- c()
for (k in 1:n_methods){
res <- regPenalized(list(X=X1, y=y1), hd_methods[k], dfmax)
initCandidates <- mergeSupp(initCandidates, getSupp(res$Beta))
}
# candidate subsets by variable indices (not consider null model here)
res_cand <- getCandidates(initCandidates, list(X=X1, y=y1), weight_method)
candidates <- res_cand$candidates
# summarize variable importance
w_var <- res_cand$w_var
ind_var <- res_cand$ind_var
var_imp <- rep(0, p)
var_imp[ind_var] <- w_var
# select the most promising subset using the remaining partial data
res <- varSelection(candidates, list(X=X2, y=y2), criteria,
penaltyBC, adaptiveBC, methodPI)
cri_name <- res$cri_name
cri_opt <- res$cri_opt
PI <- res$PI
nCrit <- length(cri_name)
loss <- rep(NA, nCrit) #prediction error
nOver <- rep(NA, nCrit) #overfitting measure
nUnder <- rep(NA, nCrit) #underfitting measure
corr <- rep(NA, nCrit) #correlation measure
err_bc_x <- NA
cri_selection <- vector('list', nCrit)
cri_coefs <- vector('list', nCrit)
for (i in 1:nCrit){
cri_selection[[i]] <- candidates[[cri_opt[i]]]
# refit LS using the complete data, including intercept
ls <- stats::lsfit(X[,cri_selection[[i]]], y, intercept = TRUE)
cri_coefs[[i]] <- ls$coefficients
}
if (!is.null(dat_test)){
X_test <- dat_test$X
# use mu instead of y to calculate loss
mu_test <- dat_test$mu
supp0 <- dat_test$supp
#for each criterion get the mean prediction error on a test data
for (i in 1:nCrit){
subs <- candidates[[cri_opt[i]]]
coef <- cri_coefs[[i]]
nvar <- length(subs)
# refit LS using the complete data
ls <- stats::lsfit(X[,subs], y, intercept = TRUE)
pre <- coef[1] + matrix(X_test[,subs], ncol=nvar) %*%
matrix(coef[2:(1+nvar)], ncol=1)
loss[i] <- mean( (mu_test - pre)^2 )
corr[i] <- stats::cor(mu_test, pre)
nOver[i] <- length(setdiff(candidates[[cri_opt[i]]], supp0))
nUnder[i] <- length(setdiff(supp0, candidates[[cri_opt[i]]]))
# calculate the residual of the FIRST BC (by default is the suggested BC)
if(cri_name[i] == 'BC') { err_bc_x <- mu_test - pre }
}
}
res <- list(var_imp = var_imp, cri_name = cri_name, cri_selection = cri_selection,
cri_coef = cri_coefs, PI = PI, err_bc_x = err_bc_x,
loss = loss, corr = corr, nOver = nOver, nUnder = nUnder)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.