VSC: Variable selection by combining method fits (VSC)

Documented in print.vsc vsc

#################################################
## This file includes the main functions for VSC
#################################################
vsc.env <- new.env()

#' Print the coefficients of vsc
#' @export print.vsc
#' @param x output of vsc()
#' @param ... additional arguments to "print"
print.vsc<-function(x, ...){
  print(x$est.b, ...)
}

#' Fit the Variable selection by combining method fits (VSC) method
#' @export vsc
#' @import parallel
#' @import doParallel
#' @import relaxo
#' @import datasets
#' @importFrom grDevices rainbow rgb dev.list dev.off
#' @importFrom graphics abline axis boxplot legend par plot points rect text plot.new
#' @importFrom stats complete.cases median sd lm coef quantile
#' @importFrom utils capture.output head read.csv
#' @param X covariates (n times p matrix, n: number of entries, p: number of covariates)
#' @param Y response (vector with n entries)
#' @param intercept TRUE to fit the data with an intercept, FALSE to fit the data without an intercept
#' @param method.names vector of method names to be used in VSC. Choose among "lasso", "elastic", "relaxo", "mcp" and "scad". Default is to use all methods listed above
#' @param coef.est.method method to estimate the coefficients of covariates after variable selection. User can provide his/her function. Default is ordinary least square
#' @param B number of subsampling. Default is 100
#' @param fit.percent percentage of observations used in fitting in VSC
#' @param q percentile of fitted models used per each subsampling in VSC, according to the selection criterion on out-of-sample data in ascending order. Default is q = 0 (only the fitted model with the lowest MSE in a subsampling data is used)
#' @param selection.criterion = c("mse", "ebic"). Measure to select fitted models in subsampling dataset. "mse" is mean square error and "ebic" is extended BIC. Default is mse
#' @param num.core number of cores to use. Default is 1 (i.e. no parallel running)
#' @param all.fits (optional) all fitted models. If all.fits is provided, then VSC will use the fitted models in all.fitted instead of fitting using subsampling data
#' @return a list, which includes estimated coefficients (est.b), subsampling fitted models (mod.collection), number of times a method is selected (method.freq), relative frequency of each covariate (variable.freq), covariates ordered by relative frequency (variable.order).
#' @examples
#' X = matrix(rnorm(1000), nrow = 100)
#' Y = rowSums(X[,1:3])+rnorm(100)
#' mod.0 = vsc(X, Y, intercept = FALSE, q = 0, method.names = NULL)
#' print(mod.0)
#' mod.5 = vsc(X, Y, intercept = FALSE, q = 5, all.fits = mod.0$all.fits)
#' print(mod.5)
vsc<-function(X, Y, intercept, method.names = NULL,
              coef.est.method = lm.ols,
              B = 100, q = 0,
              fit.percent = 0.5,
              selection.criterion = "mse",
              num.core = 1,
              all.fits = NULL){
  # wrapper function of the main algorithm
  if(is.null(method.names)){
    method.names = names(get.fit.methods())
  }
  mod = get.vsc.mod(X = X, Y = Y, intercept = intercept,
                    method.names = method.names, coef.est.method = coef.est.method,
                    B = B, q = q, fit.percent = fit.percent,
                    selection.criterion = selection.criterion,
                    num.core = num.core,
                    all.fits = all.fits)
  class(mod) <- "vsc"
  return (mod)
}

get.vsc.mod<-function(X, Y, intercept,
                     method.names, coef.est.method,
                     B, q, fit.percent,
                     selection.criterion,
                     num.core,
                     all.fits = NULL){
  # the main function of vsc
  #
  #   X: covariates
  #   Y: response
  #   intercept: intercept
  #   method.names: names of methods to use
  #   coef.est.method: refitting method
  #   B: number of iteration in CV
  #   q: percentile of fitted models from cv to use
  #
  # Returns:
  #   a list (variable selection frequency, solution path)
  if(is.null(all.fits)){
    all.fits = get.vsc.unique.fit(X = X, Y = Y, intercept = intercept,
                                  method.names = method.names,
                                  B = B,
                                  fit.percent = fit.percent,
                                  current.fit = NULL,
                                  num.core = num.core)
  }
  vsc.result = get.vsc.final.mod(X = X, Y = Y, intercept = intercept,
                                 unique.fit = all.fits,
                                 selection.criterion = selection.criterion,
                                 coef.est.method = coef.est.method,
                                 q = q,
                                 method.names = method.names,
                                 B = B)
  # futile.logger::flog.info("finish selecting models")
  # vsc.result$PR.mod.collection = get.PR.mod.collection(X, Y, intercept, all.fits$flds,
  #                                                    vsc.result$mod.collection,
  #                                                    vsc.result$variable.freq,
  #                                                    num.core = num.core)
  # futile.logger::flog.info("finish refiting by ridge regression")
  vsc.result$all.fits = all.fits
  vsc.result$param = list(selection.criterion = selection.criterion,
                          coef.est.method = coef.est.method,
                          fit.percent = fit.percent,
                          q = q,
                          method.names = method.names,
                          B = B)
  return (vsc.result)
}

get.vsc.unique.fit<-function(X, Y, intercept, method.names, B,
                             fit.percent,
                             current.fit = NULL,
                             num.core = 1){
  # the main function of the new method
  #
  #   X: covariates
  #   Y: response
  #   intercept: intercept
  #   method.names: names of methods to use
  #   B: number of iteration in CV
  #
  # Returns:
  #   a list (methods) of list (flds) of list (unique models (with coefficient), mse)

  path.fit.methods = get.path.fit.methods()
  cv.fit = cv.for.methods(X, Y, intercept,
                          fit.percent, num.repeat = B,
                          fit.methods = path.fit.methods[method.names],
                          is.keep.all = FALSE, current.fit = current.fit,
                          num.core = num.core)
  unique.mod = unique.mod.for.methods(cv.fit$cv.result, is.ols = TRUE)

  current.method.names = union(names(current.fit$unique.mod), names(unique.mod))
  current.mod = lapply(current.method.names,
                      function(method.name) c(current.fit$unique.mod[[method.name]], unique.mod[[method.name]]))

  names(current.mod) = current.method.names

  current.fit = list(unique.mod = current.mod,
                       flds = if(length(cv.fit$flds)>length(current.fit$flds)) cv.fit$flds else current.fit$flds)

  attr(current.fit, "type") <- "vsc.all.fits"
  return (current.fit)
}

is.vsc.all.fits<-function(obj){
  return (!is.null(attr(obj, "type")) && attr(obj, "type") == "vsc.all.fits")
}

is.vsc.fit<-function(obj){
  return (class(obj)=="vsc")
}

get.vsc.final.mod<-function(X, Y, intercept, unique.fit,
                            selection.criterion,
                            coef.est.method,
                            q, method.names, B){
  # the main function of the new method
  #
  #   X: covariates
  #   Y: response
  #   intercept: intercept
  #   unique.fit: unique fit from "get.vsc.unique.fit"
  #   coef.est.method: refitting method
  #   q: percentile of fitted models from cv to use
  #
  # Returns:
  #   a list (methods) of list (flds) of list (unique models (with coefficient), mse)
  # filter the models and calculate the variable selection frequency
  sel.mods.n.sel.freq = get.best.mod(unique.fit, selection.criterion = selection.criterion,
                                     is.ols = TRUE, q = q, method.names = method.names,
                                     B = B)
  var.sel.freq.n.order = vsc.var.sel.freq.n.sol.path(sel.mods.n.sel.freq)

  # get the final models
  vsc.s = vsc.thr.by.size(X, Y, intercept,
                          var.order = var.sel.freq.n.order$var.order,
                          size.thr = stats::median(rowSums(sel.mods.n.sel.freq$sel.mod!=0)),
                          coef.est.method)

  vsc.m = vsc.thr.by.freq(X, Y, intercept,
                          var.sel.freq = var.sel.freq.n.order$var.sel.freq,
                          freq.thr = 0.5,
                          coef.est.method)
  vsc.ebic = vsc.by.ebic(X, Y, intercept, var.order = var.sel.freq.n.order$var.order, size.thr = sum(var.sel.freq.n.order$var.sel.freq>=0.1) )
  est.b = rbind(vsc.m = vsc.m,
                vsc.s = vsc.s,
                vsc.ebic = vsc.ebic)

  colnames(est.b) = colnames(sel.mods.n.sel.freq$sel.mod)
  colnames(est.b)[1] = "intercept"
  return (list(est.b = est.b,
               mod.collection = sel.mods.n.sel.freq$sel.mod,
               method.freq = sel.mods.n.sel.freq$sel.method.freq,
               variable.freq = var.sel.freq.n.order$var.sel.freq,
               variable.order = var.sel.freq.n.order$var.order))
}