#################################################
## This file includes the helper functions for CSUV
#################################################
## ====== filtering fitted models =======
get.best.mod <- function(cv.results, selection.criterion, beta.type,
q, method.names, B) {
# a function to get the best k (size) fitted models in folds
#
# Args:
# cv.results: cv results from "cv.for.methods"
# is.ols: whether ols or original cv results should be used
# fld.indexes: fld to use. If is.null, then use all
#
# Returns:
# a list (index by folds) of list (mods, selection freq)
# change from list (method) of list (fold) of list (est.b/mse) of
# matrix (path*p+1) to list (fold) of matrix (path*p+1)
est.b.name <- 'est.b'
mse.name <- 'mse'
if (beta.type != '') {
est.b.name <- paste0(beta.type, '.est.b')
mse.name <-paste0(beta.type, '.mse')
}
est.b <- reshape.cv.result(cv.results$unique.mod,
item.name = est.b.name,
method.names, B)
mse <- reshape.cv.result(cv.results$unique.mod,
item.name = mse.name,
method.names, B)
# get ebic / bic / mse
criterion.val <- mse
if (selection.criterion == "ebic") {
criterion.val <- lapply(seq_len(length(mse)), function(fld.idx) {
ebic.val <- get.ebic.value.from.mse(mse = mse[[fld.idx]],
n = length(cv.results$flds[[fld.idx]]),
est.b = est.b[[fld.idx]])
return(ebic.val)
})
} else if (selection.criterion == "bic") {
criterion.val <- lapply(seq_len(length(mse)), function(fld.idx) {
bic.val <- get.bic.value.from.mse(mse = mse[[fld.idx]],
n = length(cv.results$flds[[fld.idx]]),
est.b = est.b[[fld.idx]])
return(bic.val)
}
)
}
# calculate the number of fitted models to retain
num.retain <- 1
if (q != 0) {
avg.mod.size <- mean(sapply(mse, length))
num.retain <- round(q / 100 * avg.mod.size)
}
## ===== get the fitted models with lowest mse/bic ========
result <- lapply(1:B, function(fld.idx)
get.best.mod.helper.one.fld(method.names = method.names,
est.b[[fld.idx]],
criterion.val[[fld.idx]],
num.retain))
return(list(sel.mod = do.call(rbind,
lapply(result, function(x) x$sel.mod)),
sel.method.freq = do.call(rbind,
lapply(result, function(x)
x$sel.method.freq))))
}
## ---- helper functions to get the best fitted models -----
reshape.cv.result <- function(unique.mod, item.name, method.names, B) {
# helper functions which re-structure the unique.mod
#
# Args:
# unique.mod: cv results from "cv.for.methods"
# item.name: item to get (e.g. est.b)
# fld.indexes: fld to use
#
# Returns:
# a list (index by folds) of matrix (index by methods) of the specific item
if (is.vector(unique.mod[[1]][[1]][[item.name]])) {
result <- lapply(1:B,
function(fld.idx) do.call(c, lapply(method.names,
function(method.name)
unique.mod[[method.name]][[fld.idx]][[item.name]])))
return(result)
} else{
return(lapply(1:B,
function(fld.idx)
do.call(rbind, lapply(method.names,
function(method.name) {
x <- unique.mod[[method.name]][[fld.idx]][[item.name]]
rownames(x) <- rep(method.name, nrow(x))
return(x)
}))))
}
}
get.best.mod.helper.one.fld <- function(method.names, est.b, mse, num.retain) {
# a helper function for "get.best.mod.given.size" to get the
# best k (num.retain) fitted models in one fold
#
# Args:
# method.names: method names
# est.b: estimated coefficients of the fitted models
# mse: mse
# num.retain: number of fitted models to retain
#
# Returns:
# a list (mods, selection freq)
num.retain <- min(length(mse), num.retain) # make sure num of retain is not larger than number of models
e.order <- order(mse, rowSums(est.b != 0))
sel.mod <- est.b[e.order[1:num.retain], , drop = FALSE]
mse.thr <- mse[e.order[num.retain]]
candidate.b <- est.b[which(mse <= mse.thr), , drop = FALSE]
candidate.e <- mse[which(mse <= mse.thr)]
last.size <- sum(est.b[e.order[num.retain], ] != 0)
sel.method.names <- rownames(candidate.b)
sel.method.freq <- sapply(method.names, function(method.name)
sum(sel.method.names == method.name))
return(list(sel.mod = sel.mod, sel.method.freq = sel.method.freq))
}
## ======== calculate variable selection frequency and get solution path =============
csuv.var.sel.freq.n.sol.path <- function(sel.mods.n.sel.freq) {
# a function to calculate variable selection frequency and get solution path
#
# Args:
# sel.mod from "get.best.mod"
#
# Returns:
# a list (variable selection frequency, soluation path)
sel.mods <- sel.mods.n.sel.freq$sel.mod
sel.model.freq <- sel.mods.n.sel.freq$sel.method.freq
var.sel.freq <- sapply(2:ncol(sel.mods), # exclude intercept
function(x) max(mean(sel.mods[, x] > 0),
mean(sel.mods[, x] < 0)))
mean.beta <- abs(colMeans(sel.mods))[-1]# exclude intercept
var.order <- order(-var.sel.freq, -mean.beta)
return(list(var.sel.freq = var.sel.freq,
sel.model.freq = sel.model.freq,
var.order = var.order))
}
## ====== get the final fitted models after filtering ==========
csuv.thr.by.size <- function(X, Y, intercept, var.order, size.thr, coef.est.method) {
# a function to get the final fitted model using size threshold
#
# Args:
# X: covariates
# Y: response
# intercept: intercept
# var.order: variable ordering
# size.thr: number of variable to retain
# coef.est.method: refitting method
#
# Returns:
# a fitted model
est.i <- if (size.thr > 0) { est.i <- var.order[1:size.thr] } else c()
return(csuv.refit(X, Y, intercept, est.i, coef.est.method))
}
csuv.thr.by.freq <- function(X, Y, intercept, var.sel.freq, freq.thr, coef.est.method) {
# a function to get the final fitted model using frequency threshold
#
# Args:
# X: covariates
# Y: response
# intercept: intercept
# var.order: variable ordering
# freq.thr: minimum frequency in order to retain a variable
# coef.est.method: refitting method
#
# Returns:
# a fitted model
est.i <- which(var.sel.freq >= freq.thr)
return(csuv.refit(X, Y, intercept, est.i, coef.est.method))
}
csuv.by.ebic <- function(X, Y, intercept, var.order, size.thr) {
# a function to get the final fitted model using size threshold
#
# Args:
# X: covariates
# Y: response
# intercept: intercept
# var.order: variable ordering
# size.thr: number of variable to retain
# coef.est.method: refitting method
#
# Returns:
# a fitted model
p <- ncol(X)
est.b <- do.call(rbind, lapply(1:size.thr, function(size) {
b <- rep(0, p)
b[var.order[1:size]] <- 1
return(b)
}))
est.b <- cbind(intercept, est.b)
est.b <- lm.ols.refit(X, Y, intercept, est.b)$est.b
mse <- lm.mse(X, Y, est.b = est.b)
n <- length(Y)
ebic.val <- get.ebic.value.from.mse(mse, n, est.b)
return(est.b[which.min(ebic.val), ])
}
csuv.refit <- function(X, Y, intercept, est.i, coef.est.method) {
# a helper function for "csuv.thr.by.freq" and "csuv.thr.by.size"
# to refit the selected variables
#
# Args:
# X: covariates
# Y: response
# intercept: intercept
# est.i: selected covariate indexes
# coef.est.method: refitting method
#
# Returns:
# a fitted model
p <- ncol(X)
new.b <- rep(0, (p + 1))
if (length(est.i)) {
new.b[c(0, est.i) + 1] <- coef.est.method(X[, est.i, drop = FALSE], Y,
intercept = intercept)$est.b
}
return(new.b)
}
## ====== sort and remove duplication ==========
unique.mod.for.methods <- function(cv.methods.result, beta.type) {
# a function which removes duplicated models (in terms of selection)
#
# Args:
# cv.folds.result: cv result of methods with folds
# beta.type: which type of beta to use (ols, original or all?)
# fld.idx: which fld index to use. If omitted, all results will be used
#
# Returns:
# a list (methods) of list (flds) of list(est.b, mse)
return(lapply(cv.methods.result, function(cv.folds.result)
unique.mod.helper.one.method(cv.folds.result, beta.type)))
}
unique.mod.helper.one.method <- function(cv.flds.result, beta.type) {
# a helper function of "unique.mod.for.methods" which removes duplicated models (in terms of selection)
#
# Args:
# cv.folds.result: cv result of a method with folds
# beta.type: which type of beta to use (ols, original or all?)
# fld.idx: which fld index to use. If omitted, all results will be used
#
# Returns:
# a list (flds) of list(est.b, mse)
get.required.beta.n.mse <- function(cv.fld.result, beta.type) {
mse <- est.b <- NULL
est.b.name <- mse.name <- NULL
if(beta.type == 'all') {
ols.result <- get.required.beta.n.mse(cv.fld.result, 'ols')
non.ols.result <- get.required.beta.n.mse(cv.fld.result, 'ols')
est.b <- rbind(ols.result$est.b, non.ols.result$est.b)
mse <- c(ols.result$mse, non.ols.result$mse)
est.b.name <- 'all.est.b'
mse.name <- 'all.mse'
} else {
est.b.name <- ifelse(beta.type == 'ols', "ols.est.b", "est.b")
mse.name <- ifelse(beta.type == 'ols', "ols.mse", "mse")
est.b <- cv.fld.result[[est.b.name]]
mse <- cv.fld.result[[mse.name]]
}
attr(est.b, "name") <- est.b.name
attr(mse, "name") <- mse.name
return (list(est.b = est.b,
mse = mse))
}
return(lapply(cv.flds.result, function(cv.fld.result) {
beta.n.mse <- get.required.beta.n.mse(cv.fld.result, beta.type)
est.b <- beta.n.mse$est.b
mse <- beta.n.mse$mse
get.unique.mod(est.b = est.b, mse = mse)
}))
}
get.unique.mod <- function(est.b, mse) {
names.to.use <- c(attr(est.b, "name"), attr(mse, "name"))
if(is.null(names.to.use)) {
names.to.use <- c('est.b', 'mse')
}
# helper function of "unique.mod.helper.one.method", which removes duplicated models (in terms of selection)
#
# Args:
# est.b: estimated coefficients
# mse: mse
#
# Returns:
# a list (est.b, mse)
# sort the est.b so that the one with lowest mse will be selected
e.order <- order(mse, rowSums(est.b != 0)) # order by mse, then by model size
est.b <- est.b[e.order, ]
mse <- mse[e.order]
if(is.null(dim(est.b))){
result = list(est.b = matrix(est.b, nrow = 1),
mse = mse)
} else {
non.duplicate.i <- which(!duplicated(est.b != 0))
result <- list(est.b = est.b[non.duplicate.i, , drop = FALSE],
mse = mse[non.duplicate.i])
if (any(is.na(result$est.b)) || any(is.na(result$mse))) {
stop("get.unique.mod: estimated beta or mse cannot be NA")
}
}
names(result) <- names.to.use
return(result)
}
get.ebic.value.from.mse <- function(mse, n, est.b) {
p <- ncol(est.b[, -1, drop = FALSE]) # number of covariate, first one is intercept
k <- rowSums(est.b[, -1, drop = FALSE] != 0) # selected model size
return(get.bic.value.from.mse(mse, n, est.b) + log(choose(p, k)))
}
get.bic.value.from.mse <- function(mse, n, est.b) {
k <- rowSums(est.b[, -1, drop = FALSE] != 0) # selected model size
bic.value <- n * log(mse) + log(n) * k
return(bic.value)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.