#' Variable selection using different criteria
#'
#' This function uses specified criteria to select the optimal subset from a list of subsets,
#' given design matrix X and observation y.
#'
#' @param candidates List of vectors each representing the indices of significant variables
#' @param dat List containing design matrix X and observation y
#' @param criteria Vector of strings ('GIC2, GICn, Cp, AIC, BIC, BC') indicating criteria to use
#' @param penaltyBC Vector of non-default penalty values in using BC, default to NULL so that
#' only the suggested value n^(1/3) will be considered
#' @param adaptiveBC Boolean indicating whether to use adaptive BC, default to FALSE so that
#' only the suggested value n^(1/3) will be considered
#' @param methodPI String ('BC', 'Drop', or 'Both') indicating the method to calculate BC, default to 'BC'
#' @return List (cri_opt) of indices of candidate subsets, each corresponding to a method (cri_name),
#' along with parametricness index PI for each type of BC
#' @export
#' @examples
#' n <- 150
#' p <- 200
#' X <- genDesignMat(n,p,0.5)
#' beta <- rep(0, p)
#' beta[c(99,199)] = c(10, 5)
#' mu <- X %*% as.matrix(beta, ncol=1)
#' y <- mu + stats::rnorm(n)
#' candidates <- vector('list', 0)
#' candidates[[1]] <- c(1, 99)
#' candidates[[2]] <- c(99, 199)
#' candidates[[3]] <- c(1, 99, 199)
#' dat <- list(X = X, y= y)
#' res <- varSelection(candidates, dat)
varSelection <- function(candidates,
dat,
criteria=c('BC'),
penaltyBC=NULL,
adaptiveBC=FALSE,
methodPI='BC'){
X <- dat$X
y <- dat$y
nCand <- length(candidates)
n <- nrow(X)
# obtain the "consistent" estimator of sigma2 in order to cal. gic2 & gicn
# only calculable when the number of distinct variables is less than sample size
vars <- c()
for (j in 1:nCand){
vars <- c(vars, candidates[[j]])
}
vars <- unique(vars)
if (length(y) > length(vars)) {
wellCondition <- TRUE
ls <- stats::lsfit(X[, vars], y, intercept = TRUE)
sig2 <- mean((ls$residuals)^2) * n / (n-length(vars))
}else{
wellCondition <- FALSE
sig2 <- NA
}
# penalty param for BC
if (is.null(penaltyBC)) {
WN <- c(n^(1/3))
}else {
WN <- penaltyBC
}
if (adaptiveBC && wellCondition){
WN <- c(WN, chooseWNAdaptively(y, X, candidates, sig2, 5))
}
# else{
# WN <- c(WN, NA)
# }
nWN <- length(WN)
# pre-allocate
dim <- rep(NA, nCand)
mse <- rep(NA, nCand)
aic <- rep(NA, nCand)
bic <- rep(NA, nCand)
gic2 <- rep(NA, nCand)
gicn <- rep(NA, nCand)
bc <- vector('list', nWN)
for (i in 1:nWN) { bc[[i]] <- rep(NA, nCand) }
#cv1 <- rep(NA, nCand)
#cvd <- rep(NA, nCand)
for (i in 1:nCand){
dim[i] <- length(candidates[[i]])
}
# the parametricness index, where the last one is defined by Liu and Yang
PI <- rep(NA, nWN+1)
#compute mse
for (k in 1:nCand){
if (length(candidates[[k]]) >= (length(y)-1)){
mse[k] <- Inf
}else{
ls <- stats::lsfit(X[,candidates[[k]]], y, intercept = TRUE)
mse[k] <- mean((ls$residuals)^2)
}
}
# result using method AIC, calculated regardless of input param
aic <- log(mse) + 2 * dim / n
aic_opt <- which.min(aic)
# result using method BIC, calculated regardless of input param
bic <- log(mse) + log(n) * dim / n
bic_opt <- which.min(bic)
bc_opt <- rep(NA, nWN)
if (wellCondition){
# result using method GIC2, calculated regardless of input param
gic2 <- mse + 2 * sig2 * dim / n
gic2_opt <- which.min(gic2)
# result using method GICn, calculated regardless of input param
# gicn uses lambda=log(n) to resemble BIC
gicn <- mse + log(n) * sig2 * dim / n
gicn_opt <- which.min(gicn)
# gic2/gicn if well defined and aic/bic otherwise
surro_gic2 <- gic2_opt
surro_gicn <- gicn_opt
}else{
# if not in wellCondition, use AIC, BIC based BC, and return NA for GIC, GICn
gic2_opt <- NA
gicn_opt <- NA
surro_gic2 <- aic_opt
surro_gicn <- bic_opt
}
# result using method BC
for (i in 1:nWN){
for (k in 1:nCand){
if (wellCondition){
bc[[i]][k] <- mse[k] + 2 * sig2 * sum(1/(1:dim[k])) * WN[i] / n
# filter out all the candidates that have dimension larger than gic2's
if (dim[k] > dim[which.min(gic2)]) { bc[[i]][k] <- Inf }
}else{
bc[[i]][k] <- log(mse[k]) + 2 * sum(1/(1:dim[k])) * WN[i] / n
if (dim[k] > dim[which.min(aic)]) { bc[[i]][k] <- Inf }
}
}
bc_opt[i] <- which.min(bc[[i]])
}
for (i in 1:nWN) {
# Note that 1. the candidates (not user-specified but) selected from solution path
# may be just the true model
# 2. d_BC is not necessarily beweteen d_AIC d_BIC
if ((surro_gic2 == surro_gicn) && (surro_gic2 == bc_opt[i])) {
PI[i] <- 1
}else{
PI[i] <- abs( dim[bc_opt[i]] - dim[surro_gic2] ) /
( abs( dim[bc_opt[i]] - dim[surro_gic2] ) + abs( dim[bc_opt[i]] - dim[surro_gicn] ) )
}
}
if (wellCondition && (methodPI == 'Drop' || methodPI == 'Both')){ #only in this case is sig2 defined
PI[nWN+1] <- getLiuPI( mse, dim, which.min(gicn), sig2, n )
}else{
PI[nWN+1] <- NA
}
# use string matching to decide the final return
# index of subset selected by each criterion
cri_opt <- c()
# name of each criterion, for BC of different penalties it looks as 'BC', 'BC 2', ...
cri_name <- c()
if (sum(grepl('GIC2', criteria)) > 0) {
if (!wellCondition) {stop('Due to small sample size, noise variance cannot be estimated for GIC2')}
cri_name <- c(cri_name, 'GIC2')
cri_opt <- c(cri_opt, gic2_opt)
}
if (sum(grepl('Cp', criteria)) > 0) {#same as GIC2
if (!wellCondition) {stop('Due to small sample size, noise variance cannot be estimated for Cp')}
cri_name <- c(cri_name, 'Cp')
cri_opt <- c(cri_opt, gic2_opt)
}
if (sum(grepl('GICn', criteria)) > 0) {
if (!wellCondition) {stop('Due to small sample size, noise variance cannot be estimated for GICn')}
cri_name <- c(cri_name, 'GICn')
cri_opt <- c(cri_opt, gicn_opt)
}
if (sum(grepl('AIC', criteria)) > 0) {
cri_name <- c(cri_name, 'AIC')
cri_opt <- c(cri_opt, aic_opt)
}
if (sum(grepl('BIC', criteria)) > 0) {
cri_name <- c(cri_name, 'BIC')
cri_opt <- c(cri_opt, bic_opt)
}
if (sum(grepl('BC', criteria)) > 0) {
cri_name <- c(cri_name, 'BC')
if (length(bc_opt) > 1){
for (j in 2:length(bc_opt)) {
cri_name <- c(cri_name, paste('BC', j))
}
}
if (adaptiveBC) { cri_name[length(cri_name)] <- 'adaBC' }
cri_opt <- c(cri_opt, bc_opt)
}
res <- list()
res$cri_name <- cri_name
res$cri_opt <- cri_opt
if (methodPI == 'BC') {
res$PI <- PI[1:nWN]
}else{
if (!wellCondition) {stop('Due to small sample size, noise variance cannot be estimated for calculating PI')}
if (methodPI == 'Both'){
res$PI <- PI[1:(nWN+1)]
}else{
res$PI <- PI[nWN+1]
}
}
res
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.