bc: Model Selection with Bridge Criterion

#' Run BCVO regression for high dimensional data
#'
#' This function first uses a weighting method to assign weights to 
#' each subset in a list of candidate subsets, which are in turn 
#' generated by penalized regression methods, then run BC on
#' the remaining data to obtain the significant subset, 
#' and finally refit all the data with least squares.
#'
#' @param dat List containing design matrix X and observation y
#' @param hd_methods Vector of strings ("MCP", "SCAD", "lasso") indicating which
#' penalized regression methods are used to generate initial candidate subsets
#' @param dfmax Integer of maximum number of variables selected by each 
#' penalized regression, default to floor(sqrt(n)) where n is the sample size
#' @param ratio Value (0 to 1) of proportion of data in obtaining initial 
#' candidate subsets
#' @param weight_method String indicating which method to use for weighting
#' @param criteria Vector of strings ('GIC2, GICn, Cp, AIC, BIC, BC') indicating criteria to use
#' in selecting the final subset
#' @param penaltyBC Vector of non-default penalty values in using BC, default to NULL so that
#' only the suggested value n^(1/3) will be considered.  If other values are given, the default 
#' will be overwritten.
#' @param adaptiveBC Boolean indicating whether to use adaptive BC, default to FALSE so that
#' only the values in penaltyBC will be considered. If TRUE then penaltyBC is appened 
#' with another data-driven value 
#' @param methodPI String ('BC', 'Drop'. or 'Both') indicating the method to calculate BC, default to 'BC'
#' @param dat_test Matrix of design that is used for comparing performance, default to NULL
#' @return var_imp PI Vector of variable importance
#' @return cri_name Vector of criteria used
#' @return cri_selection List of vectors of the selected subsets
#' @return cri_coefs List of vectors of the effective coefficients
#' @return PI Vector of PI corresponding to each BC penalty in 'penaltyBC'
#' @return err_bc_x Vector of residuals
#' @return loss Vector of prediction loss on test data, default to NA
#' @return corr Vector of correlation between prediction and truth on test data, default to NA
#' @return nOver Vector of the numbers of overfitted variables on test data, default to NA
#' @return nUnder Vector of the numbers of underfitted variables on test data, default to NA
#' @export
#' @examples
#' n <- 150
#' p <- 200
#' X <- genDesignMat(n,p,0.5)
#' beta <- rep(0, p)
#' beta[c(99,199)] = c(10, 5)
#' mu <- X %*% as.matrix(beta, ncol=1)
#' y <- mu + stats::rnorm(n)
#' dat <- list(X = X, y= y)
#' res <- regBCVO(dat)

regBCVO <- function(dat, 
                    hd_methods = c("MCP", "SCAD", "lasso"),
                    dfmax = NULL, 
                    ratio = 0.7, 
                    weight_method = 'ARM',
                    criteria = c('BC'), 
                    penaltyBC = NULL, 
                    adaptiveBC = FALSE, 
                    methodPI = 'BC',
                    dat_test = NULL){ 
  X <- dat$X
  p <- dim(X)[2]
  y <- dat$y
  n <- length(y)
  if (is.null(dfmax)){
    dfmax <- floor(sqrt(n))
  }

  # get solution path from partial data 
  # random subsample instead of 1:n1 is essential for (non-iid) real data 
  n1 <- floor(n * ratio)
  ind_n1 <- sample(1:n, n1, replace = FALSE) 
  X1 <- X[ind_n1,]
  y1 <- y[ind_n1]
  X2 <- X[-ind_n1,]
  y2 <- y[-ind_n1]
  
  n_methods <- length(hd_methods)
  initCandidates <- c()
  for (k in 1:n_methods){
    res <- regPenalized(list(X=X1, y=y1), hd_methods[k], dfmax)
    initCandidates <- mergeSupp(initCandidates, getSupp(res$Beta))
  }

  # candidate subsets by variable indices (not consider null model here)
  res_cand <- getCandidates(initCandidates, list(X=X1, y=y1), weight_method)
  candidates <- res_cand$candidates 

  # summarize variable importance
  w_var <- res_cand$w_var 
  ind_var <- res_cand$ind_var 
  var_imp <- rep(0, p)
  var_imp[ind_var] <- w_var

  # select the most promising subset using the remaining partial data 
  res <- varSelection(candidates, list(X=X2, y=y2), criteria, 
                      penaltyBC, adaptiveBC, methodPI)
  cri_name <- res$cri_name
  cri_opt <- res$cri_opt
  PI <- res$PI


  nCrit <- length(cri_name)
  loss <- rep(NA, nCrit) #prediction error 
  nOver <- rep(NA, nCrit) #overfitting measure 
  nUnder <- rep(NA, nCrit) #underfitting measure
  corr <- rep(NA, nCrit) #correlation measure
  err_bc_x <- NA
  cri_selection <- vector('list', nCrit)
  cri_coefs <- vector('list', nCrit)

  for (i in 1:nCrit){
    cri_selection[[i]] <- candidates[[cri_opt[i]]]
    # refit LS using the complete data, including intercept
    ls <- stats::lsfit(X[,cri_selection[[i]]], y, intercept = TRUE) 
    cri_coefs[[i]] <- ls$coefficients
  }

  if (!is.null(dat_test)){
    X_test <- dat_test$X 
    # use mu instead of y to calculate loss
    mu_test <- dat_test$mu  
    supp0 <- dat_test$supp

    #for each criterion get the mean prediction error on a test data
    for (i in 1:nCrit){
      subs <- candidates[[cri_opt[i]]]
      coef <- cri_coefs[[i]]
      nvar <- length(subs)
      # refit LS using the complete data 
      ls <- stats::lsfit(X[,subs], y, intercept = TRUE) 
      pre <- coef[1] + matrix(X_test[,subs], ncol=nvar) %*% 
              matrix(coef[2:(1+nvar)], ncol=1)
      loss[i] <- mean( (mu_test - pre)^2 )  
      corr[i] <- stats::cor(mu_test, pre)   
      nOver[i] <- length(setdiff(candidates[[cri_opt[i]]], supp0))
      nUnder[i] <- length(setdiff(supp0, candidates[[cri_opt[i]]]))
            # calculate the residual of the FIRST BC (by default is the suggested BC)
      if(cri_name[i] == 'BC') { err_bc_x <- mu_test - pre }
    }
  }

  res <- list(var_imp = var_imp, cri_name = cri_name, cri_selection = cri_selection, 
              cri_coef = cri_coefs, PI = PI, err_bc_x = err_bc_x,
              loss = loss, corr = corr, nOver = nOver, nUnder = nUnder)
}