Nothing
#################################################
## This file includes the main functions for CSUV
#################################################
csuv.env <- new.env()
#' Print the coefficients of csuv
#' @export
#' @param x output of csuv()
#' @param ... additional arguments to "print"
#' @return return value from print(x$est.b)
print.csuv <- function(x, ...) {
print(x$est.b, ...)
}
#' Get the fitted results from Combined Selection and Uncertainty Visualiser (CSUV) method
#' @export csuv
#' @import parallel
#' @import doParallel
#' @import relaxo
#' @import datasets
#' @importFrom grDevices rainbow rgb dev.list dev.off
#' @importFrom graphics abline axis boxplot legend par plot points rect text plot.new
#' @importFrom stats complete.cases median sd lm coef quantile
#' @importFrom utils capture.output head read.csv
#' @param X covariates (n times p matrix, n: number of entries, p: number of covariates)
#' @param Y response (vector with n entries)
#' @param intercept TRUE to fit the data with an intercept, FALSE to fit the data without an intercept
#' @param method.names vector of method names to be used in CSUV. Choose among "lasso", "elastic", "relaxo", "mcp" and "scad". Default is to use all methods listed above
#' @param coef.est.method method to estimate the coefficients of covariates after variable selection. User can provide his/her function. Default is ordinary least square
#' @param B number of subsampling. Default is 100
#' @param fit.percent percentage of observations used in fitting in CSUV
#' @param q percentile of fitted models used per each subsampling in CSUV, according to the selection criterion on out-of-sample data in ascending order. Default is q = 0 (only the fitted model with the lowest MSE in a subsampling data is used)
#' @param selection.criterion = c("mse", "ebic"). Measure to select fitted models in subsampling dataset. "mse" is mean square error and "ebic" is extended BIC. Default is mse
#' @param num.core number of cores to use. Default is 1 (i.e. no parallel running)
#' @param all.fits (optional) all fitted models. If all.fits is provided, then CSUV will use the fitted models in all.fitted instead of fitting using subsampling data
#' @param log.level log level to set. Default is NULL, which means no change in log level. See the function CSUV::set.log.level for more details
#' @return a list, which includes estimated coefficients (est.b), subsampling fitted models (mod.collection), number of times a method is selected (method.freq), relative frequency of each covariate (variable.freq), covariates ordered by relative frequency (variable.order).
#' @examples
#' \donttest{
#' X = matrix(rnorm(1000), nrow = 100)
#' Y = rowSums(X[,1:3])+rnorm(100)
#' mod.0 = csuv(X, Y, intercept = FALSE, q = 0, method.names = NULL)
#' print(mod.0)
#' mod.5 = csuv(X, Y, intercept = FALSE, q = 5, all.fits = mod.0$all.fits)
#' print(mod.5)
#' }
csuv <- function(X, Y, intercept, method.names = NULL,
coef.est.method = lm.ols,
B = 100, q = 0,
fit.percent = 0.5,
selection.criterion = "mse",
num.core = 1,
all.fits = NULL,
log.level = NULL) {
# wrapper function of the main algorithm
if (!is.null(log.level)){
set.log.level(log.level)
}
if (is.null(method.names)) {
method.names <- names(get.fit.methods())
}
mod <- get.csuv.mod(X = X, Y = Y, intercept = intercept,
method.names = method.names,
coef.est.method = coef.est.method,
B = B, q = q, fit.percent = fit.percent,
selection.criterion = selection.criterion,
num.core = num.core,
all.fits = all.fits)
class(mod) <- "csuv"
return(mod)
}
get.csuv.mod <- function(X, Y, intercept,
method.names, coef.est.method,
B, q, fit.percent,
selection.criterion,
num.core,
all.fits = NULL) {
# the main function of csuv
#
# X: covariates
# Y: response
# intercept: intercept
# method.names: names of methods to use
# coef.est.method: refitting method
# B: number of iteration in CV
# q: percentile of fitted models from cv to use
#
# Returns:
# a list (variable selection frequency, solution path)
if (is.null(all.fits)) {
all.fits <- get.csuv.unique.fit(X = X, Y = Y, intercept = intercept,
method.names = method.names,
B = B,
fit.percent = fit.percent,
current.fit = NULL,
num.core = num.core)
}
csuv.result <- get.csuv.final.mod(X = X, Y = Y, intercept = intercept,
unique.fit = all.fits,
selection.criterion = selection.criterion,
coef.est.method = coef.est.method,
q = q,
method.names = method.names,
B = B)
csuv.result$all.fits <- all.fits
csuv.result$param <- list(selection.criterion = selection.criterion,
coef.est.method = coef.est.method,
fit.percent = fit.percent,
q = q,
method.names = method.names,
B = B)
return(csuv.result)
}
#' Helper function, please do not use it
#' @export get.csuv.unique.fit
#' @param X covariates (n times p matrix, n: number of entries, p: number of covariates)
#' @param Y response (vector with n entries)
#' @param intercept TRUE to fit the data with an intercept, FALSE to fit the data without an intercept
#' @param method.names vector of method names to be used in CSUV. Choose among "lasso", "elastic", "relaxo", "mcp" and "scad". Default is to use all methods listed above
#' @param B number of subsampling. Default is 100
#' @param fit.percent percentage of observations used in fitting in CSUV
#' @param num.core number of cores to use. Default is 1 (i.e. no parallel running)
#' @param current.fit (optional) all fitted models
#' @return a list of current fit
get.csuv.unique.fit <- function(X, Y, intercept, method.names, B,
fit.percent,
current.fit = NULL,
num.core = 1) {
# the main function of the new method
#
# X: covariates
# Y: response
# intercept: intercept
# method.names: names of methods to use
# B: number of iteration in CV
#
# Returns:
# a list (methods) of list (flds) of list (unique models (with coefficient), mse)
path.fit.methods <- get.path.fit.methods()
cv.fit <- cv.for.methods(X, Y, intercept,
fit.percent, num.repeat = B,
fit.methods = path.fit.methods[method.names],
is.keep.all = FALSE, current.fit = current.fit,
num.core = num.core)
unique.mod <- unique.mod.for.methods(cv.fit$cv.result, is.ols = TRUE)
current.method.names <- union(names(current.fit$unique.mod), names(unique.mod))
current.mod <- lapply(current.method.names,
function(method.name) c(current.fit$unique.mod[[method.name]], unique.mod[[method.name]]))
names(current.mod) <- current.method.names
current.fit <- list(unique.mod = current.mod,
flds = if (length(cv.fit$flds) > length(current.fit$flds)) cv.fit$flds else current.fit$flds)
attr(current.fit, "type") <- "csuv.all.fits"
return(current.fit)
}
is.csuv.all.fits <- function(obj) {
return(!is.null(attr(obj, "type")) && attr(obj, "type") == "csuv.all.fits")
}
is.csuv.fit <- function(obj) {
return(class(obj) == "csuv")
}
#' Helper function, please do not use it
#' @export get.csuv.final.mod
#' @param X covariates (n times p matrix, n: number of entries, p: number of covariates)
#' @param Y response (vector with n entries)
#' @param intercept TRUE to fit the data with an intercept, FALSE to fit the data without an intercept
#' @param unique.fit from get.csuv.unique.fit
#' @param selection.criterion = c("mse", "ebic"). Measure to select fitted models in subsampling dataset. "mse" is mean square error and "ebic" is extended BIC. Default is mse
#' @param coef.est.method method to estimate the coefficients of covariates after variable selection. User can provide his/her function. Default is ordinary least square
#' @param q percentile of fitted models used per each subsampling in CSUV, according to the selection criterion on out-of-sample data in ascending order. Default is q = 0 (only the fitted model with the lowest MSE in a subsampling data is used)
#' @param method.names vector of method names to be used in CSUV. Choose among "lasso", "elastic", "relaxo", "mcp" and "scad". Default is to use all methods listed above
#' @param B number of subsampling. Default is 100
#' @return a list of current fit
get.csuv.final.mod <- function(X, Y, intercept, unique.fit,
selection.criterion,
coef.est.method = lm.ols,
q, method.names, B) {
# the main function of the new method
#
# X: covariates
# Y: response
# intercept: intercept
# unique.fit: unique fit from "get.csuv.unique.fit"
# coef.est.method: refitting method
# q: percentile of fitted models from cv to use
#
# Returns:
# a list (methods) of list (flds) of list (unique models (with coefficient), mse)
# filter the models and calculate the variable selection frequency
sel.mods.n.sel.freq <- get.best.mod(unique.fit,
selection.criterion = selection.criterion,
is.ols = TRUE, q = q,
method.names = method.names,
B = B)
var.sel.freq.n.order <- csuv.var.sel.freq.n.sol.path(sel.mods.n.sel.freq)
# get the final models
csuv.s <- csuv.thr.by.size(X, Y, intercept,
var.order = var.sel.freq.n.order$var.order,
size.thr = stats::median(rowSums(sel.mods.n.sel.freq$sel.mod[, -1, drop = FALSE] != 0)),
coef.est.method)
csuv.m <- csuv.thr.by.freq(X, Y, intercept,
var.sel.freq = var.sel.freq.n.order$var.sel.freq,
freq.thr = 0.5,
coef.est.method)
csuv.ebic <- csuv.by.ebic(X, Y, intercept,
var.order = var.sel.freq.n.order$var.order,
size.thr = sum(var.sel.freq.n.order$var.sel.freq >= 0.1))
est.b <- rbind(csuv.m = csuv.m,
csuv.s = csuv.s,
csuv.ebic = csuv.ebic)
colnames(est.b) <- colnames(sel.mods.n.sel.freq$sel.mod)
colnames(est.b)[1] <- "intercept"
return(list(est.b = est.b,
mod.collection = sel.mods.n.sel.freq$sel.mod,
method.freq = sel.mods.n.sel.freq$sel.method.freq,
variable.freq = var.sel.freq.n.order$var.sel.freq,
variable.order = var.sel.freq.n.order$var.order))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.