Nothing
#' Conducting Cross-validated Kernel Ensemble
#'
#' Conducting Cross-validated Kernel Ensemble based on user-specified formula.
#'
#' Perform Cross-validated Kernel Ensemble and optionally test for kernel effect
#' based on user-specified formula.
#'
#' @param formula (formula) A user-supplied formula for the null model. Should
#' contain at least one kernel term.
#' @param kern_func_list (list) A list of kernel functions in the kernel library.
#' @param data (data.frame, n*d) A data.frame, list or environment (or object
#' coercible by as.data.frame to a data.frame), containing the variables in
#' formula. Neither a matrix nor an array will be accepted.
#' @param formula_test (formula) A user-supplied formula indicating the alternative
#' effect to test. All terms in the alternative mode must be specified as kernel terms.
#' @param mode (character) A character string indicating which tuning parameter
#' criteria is to be used.
#' @param strategy (character) A character string indicating which ensemble
#' strategy is to be used.
#' @param beta_exp (numeric/character) A numeric value specifying the parameter
#' when strategy = "exp" \code{\link{ensemble_exp}}.
#' @param lambda (numeric) A numeric string specifying the range of
#' tuning parameter to be chosen. The lower limit of lambda must be above 0.
#' @param test (character) Type of hypothesis test to conduct. Must be either
#' 'asymp' or 'boot'.
#' @param alt_kernel_type (character) Type of alternative kernel effect to consider.
#' Must be either "linear" or "ensemble".
#' @param B (numeric) Number of bootstrap samples.
#' @param verbose (logical) Whether to print additional messages.
#'
#' @return \item{lambda}{(numeric) The selected tuning parameter based on the
#' estimated ensemble kernel matrix.}
#'
#' \item{beta}{(matrix, d_fixed*1) Fixed effect estimates.}
#'
#' \item{alpha}{(matrix, n*1) Kernel effect estimates.}
#'
#' \item{K}{(matrix, n*n) Estimated ensemble kernel matrix.}
#'
#' \item{u_hat}{(vector of length K) A vector of weights of the kernels in the
#' library.}
#'
#' \item{base_est}{(list) The detailed estimation results of K kernels.}
#'
#' \item{pvalue}{(numeric) If formula_test is given, p-value of the test is returned.}
#'
#' @author Jeremiah Zhe Liu
#' @seealso \code{\link{estimation}}
#'
#' method: \code{\link{generate_kernel}}
#'
#' mode: \code{\link{tuning}}
#'
#' strategy: \code{\link{ensemble}}
#' @references Xihong Lin. Variance component testing in generalised linear
#' models with random effects. June 1997.
#'
#' Arnab Maity and Xihong Lin. Powerful tests for detecting a gene effect in
#' the presence of possible gene-gene interactions using garrote kernel
#' machines. December 2011.
#'
#' Petra Bu z kova, Thomas Lumley, and Kenneth Rice. Permutation and
#' parametric bootstrap tests for gene-gene and gene-environment interactions.
#' January 2011.
#'
#' @examples
#'
#' kern_par <- data.frame(method = rep("rbf", 3),
#' l = rep(3, 3), p = rep(2, 3),
#' stringsAsFactors = FALSE)
#' # define kernel library
#' kern_func_list <- define_library(kern_par)
#'
#' n <- 10
#' d <- 4
#' formula <- y ~ x1 + x2 + k(x3, x4)
#' formula_test <- y ~ k(x1, x2) * k(x3, x4)
#' set.seed(1118)
#' data <- as.data.frame(matrix(
#' rnorm(n * d),
#' ncol = d,
#' dimnames = list(NULL, paste0("x", 1:d))
#' ))
#' beta_true <- c(1, .41, 2.37)
#' lnr_kern_func <- generate_kernel(method = "rbf", l = 3)
#' kern_effect_lnr <-
#' parse_kernel_variable("k(x3, x4)", lnr_kern_func, data)
#' alpha_lnr_true <- rnorm(n)
#'
#' data$y <- as.matrix(cbind(1, data[, c("x1", "x2")])) %*% beta_true +
#' kern_effect_lnr %*% alpha_lnr_true
#'
#' data_train <- data
#'
#' pvalue <- cvek(formula,
#' kern_func_list,
#' data_train,
#' formula_test,
#' mode = "loocv",
#' strategy = "stack",
#' beta_exp = 1,
#' lambda = exp(seq(-2, 2)),
#' test = "asymp",
#' alt_kernel_type = "linear",
#' verbose = FALSE)$pvalue
#'
#' @export cvek
cvek <- function(formula,
kern_func_list,
data,
formula_test = NULL,
mode = "loocv",
strategy = "stack",
beta_exp = 1,
lambda = exp(seq(-10, 5)),
test = "boot",
alt_kernel_type = "linear",
B = 100,
verbose = FALSE) {
# specify model matrices for main model
model_matrices <- parse_cvek_formula(
formula,
kern_func_list = kern_func_list,
data = data,
verbose = verbose
)
# conduct estimation
est_res <- estimation(
Y = model_matrices$y,
X = model_matrices$X,
K_list = model_matrices$K,
mode = mode,
strategy = strategy,
beta_exp = beta_exp,
lambda = lambda
)
est_res$model_matrices <- model_matrices
est_res$kern_func_list <- kern_func_list
est_res$formula <- formula
est_res$data <- data
# conduct hypothesis test if formula_test is given.
if (class(formula_test) == "formula") {
est_res$pvalue <- cvek_test(est_res,
formula_test,
kern_func_list,
data,
test = test,
alt_kernel_type = alt_kernel_type,
B = B,
verbose = verbose
)
}
class(est_res) <- "cvek"
est_res
}
#' Conduct Hypothesis Testing
#'
#' Conduct hypothesis testing based on CVEK estimation result.
#'
#' Conduct score tests comparing a fitted model and a more general alternative
#' model.
#'
#' There are two tests available here:
#'
#' \bold{Asymptotic Test}
#'
#' This is based on the classical variance component test to construct a
#' testing procedure for the hypothesis about Gaussian process function.
#'
#' \bold{Bootstrap Test}
#'
#' When it comes to small sample size, we can use bootstrap test instead, which
#' can give valid tests with moderate sample sizes and requires similar
#' computational effort to a permutation test. In the case of bootstrap test,
#' we can specify the form of the null derivative kernel (alt_kernel_type) to be
#' linear or ensemble. The bootstrap test allows the null derivative
#' kernel to be estimated adaptively from data using the same corresponding
#' ensemble weights to better represent the alternative hypothesis space.
#'
#' @param est_res (list) Estimation results returned by estimation() procedure.
#' @param formula_test (formula) A user-supplied formula indicating the alternative
#' effect to test. All terms in the alternative mode must be specified as kernel terms.
#' @param kern_func_list (list) A list of kernel functions in the kernel library
#' @param data (data.frame, n*d) A data.frame, list or environment (or object
#' coercible by as.data.frame to a data.frame), containing the variables in
#' formula. Neither a matrix nor an array will be accepted.
#' @param test (character) Type of hypothesis test to conduct.
#' Must be either 'asymp' or 'boot'.
#' @param alt_kernel_type (character) Type of alternative kernel effect to consider.
#' Must be either 'linear' or 'ensemble'
#' @param B (integer) A numeric value indicating times of resampling when test
#' = "boot".
#' @param verbose (logical) Whether to print additional messages.
#'
#' @return \item{pvalue}{(numeric) p-value of the test.}
#'
#' @keywords internal
#' @export cvek_test
cvek_test <- function(est_res,
formula_test,
kern_func_list,
data,
test = "boot",
alt_kernel_type = "linear",
B = 100,
verbose = FALSE) {
alt_kernel_type <-
match.arg(alt_kernel_type, c("linear", "ensemble"))
# define kernel functions for alternative effect
if (alt_kernel_type == "linear") {
lnr_kern_func <- generate_kernel(method = "linear")
alt_kern_func_list <- list(lnr_kern_func)
} else if (alt_kernel_type == "ensemble") {
alt_kern_func_list <- kern_func_list
}
# parse alternative effect formula to get kernel matrices
test_matrices <- parse_cvek_formula(
formula_test,
kern_func_list = alt_kern_func_list,
data = data,
verbose = verbose
)
# check if alternative effect contains linear terms
linear_terms <-
setdiff(colnames(test_matrices$X), "(Intercept)")
if (length(linear_terms) > 0) {
stop(
gettextf(
"'formula_test' should contain only kernel terms. Linear terms found: %s",
paste0(linear_terms, collapse = ", ")
)
)
}
# compute alternative kernel effect
if (alt_kernel_type == "linear") {
K_std_list <-
lapply(test_matrices$K[[1]], function(K)
K / sum(diag(K)))
K_int <- Reduce("+", K_std_list)
} else {
u_weight <- est_res$u_hat
K_int <- 0
for (k in seq(length(kern_func_list))) {
K_std_list <-
lapply(test_matrices$K[[k]], function(K)
K / sum(diag(K)))
K_temp <- Reduce("+", K_std_list)
K_int <- K_int + u_weight[k] * K_temp
}
}
model_matrices <- est_res$model_matrices
# estimate variance component parameters
y_fixed <- model_matrices$X %*% est_res$beta
sigma2_hat <- estimate_sigma2(
Y = model_matrices$y,
X = model_matrices$X,
lambda_hat = est_res$lambda,
y_fixed_hat = y_fixed,
alpha_hat = est_res$alpha,
K_hat = est_res$K
)
tau_hat <- sigma2_hat / est_res$lambda
# compute p-value
func_name <- paste0("test_", test)
do.call(
func_name, list(Y = model_matrices$y,
X = model_matrices$X,
y_fixed = y_fixed,
alpha0 = est_res$alpha,
K_ens = est_res$K,
K_int = K_int,
sigma2_hat = sigma2_hat,
tau_hat = tau_hat,
B = B
)
)
}
#' Parsing User-supplied Formula
#'
#' Parsing user-supplied formula to fixed-effect and kernel matrices.
#'
#' @param formula (formula) A user-supplied formula.
#' @param kern_func_list (list) A list of kernel functions in the kernel library
#' @param data (data.frame, n*d) A data.frame, list or environment (or object
#' coercible by as.data.frame to a data.frame), containing the variables in
#' formula. Neither a matrix nor an array will be accepted.
#' @param data_new (data.frame, n_new*d) New data for computing predictions.
#' @param verbose (logical) Whether to print additional messages.
#'
#' @return A list of three slots:
#' \item{Y}{(matrix, n*1) The vector of response variable.}
#' \item{X}{(matrix, n*d_fix) The fixed effect matrix.}
#' \item{K}{(list of matrices) A nested list of kernel term matrices.
#' The first level corresponds to each base kernel function in
#' kern_func_list, the second level corresponds to each kernel term
#' specified in the formula.}
#'
#' @details
#' The formula object is exactly like the formula for a GLM except that user can
#' use k() to specify kernel terms.
#' Additionally, user can specify interaction between kernel terms (using either '*' and ':'),
#' and exclude interaction term by including -1 on the RHS of formula.
#'
#' @author Jeremiah Zhe Liu
#' @export parse_cvek_formula
parse_cvek_formula <-
function(formula, kern_func_list, data,
data_new = NULL, verbose = FALSE) {
# extract dependent variables and terms
tf <- terms.formula(formula, specials = c("k"))
term_names <- attr(tf, "term.labels")
num_terms <- length(term_names)
var_table <- attr(tf, "factors")
# extract dependent variables and intercept
intercept <- attr(tf, "intercept")
response_vector <- NULL
if ((attr(tf, "response") > 0) & is.null(data_new)) {
response_name <- as.character(attr(tf, "variables")[2])
response_vector <- data[, response_name]
}
# identify fixed-effect and kernel-effect terms
kern_var_idx <- attr(tf, "specials")$k
kern_term_idx <- NULL
fixd_term_idx <- NULL
if (length(kern_var_idx) > 0) {
# if the formula contains kernel terms, identify their locations
kern_term_idx <-
which(colSums(var_table[kern_var_idx, , drop = FALSE]) > 0)
fixd_term_idx <- setdiff(1:num_terms, kern_term_idx)
if (length(fixd_term_idx) == 0) {
# set fixd_term_idx back to NULL if it is an empty set
fixd_term_idx <- NULL
}
} else {
fixd_term_idx <- 1:num_terms
}
# assemble fixed-effect and kernel-effect formula
kern_effect_formula <- NULL
fixed_effect_formula <- NULL
if (!is.null(kern_term_idx)) {
kern_effect_formula <-
as.formula(paste("~", paste(term_names[kern_term_idx], collapse = " + ")))
}
if (!is.null(fixd_term_idx)) {
fixed_effect_formula <-
paste("~", paste(term_names[fixd_term_idx], collapse = " + "))
if (intercept == 0)
fixed_effect_formula <- paste0(fixed_effect_formula, " -1")
fixed_effect_formula <- as.formula(fixed_effect_formula)
} else if (intercept > 0) {
# intercept only
fixed_effect_formula <- ~ 1
}
# produce fixed-effect matrix X using model.frame
fixed_effect_matrix <- NULL
if (!is.null(fixed_effect_formula)) {
fixed_effect_data <- data
if (!is.null(data_new)) {
fixed_effect_data <- data_new
}
mm <- model.frame(fixed_effect_formula, data = data)
xlevels <- .getXlevels(terms.formula(fixed_effect_formula), mm)
m <- model.frame(fixed_effect_formula, data = fixed_effect_data,
xlev = xlevels)
fixed_effect_matrix <- model.matrix(fixed_effect_formula, data = m)
}
# produce kernel-effect matrices Ks
# (list of kernel terms, one for each kernel in library)
kernel_effect_matrix_list <-
vector("list", length = length(kern_func_list))
names(kernel_effect_matrix_list) <- names(kern_func_list)
if (!is.null(kern_effect_formula)) {
if (verbose)
print("Preparing Kernels...")
for (kern_func_id in 1:length(kern_func_list)) {
# prepare kernel function
kern_func <- kern_func_list[[kern_func_id]]
# compute kernel terms using the kern_func, then store to list
kernel_effect_matrix_list[[kern_func_id]] <-
parse_kernel_terms(kern_effect_formula, kern_func,
data = data, data_new = data_new)
}
if (verbose)
print("Done!")
}
# combine and return
list(y = response_vector,
X = fixed_effect_matrix,
K = kernel_effect_matrix_list)
}
#' Compute Kernel Matrix
#'
#' Compute kernel matrix for each kernel term in the formula.
#'
#' @param kern_effect_formula (character) A term in the formula.
#' @param kern_func (function) A kernel function. Will be overwritten to linear
#' kernel if the variable doesn't contain 'k()'.
#' @param data (data.frame, n*d) A data.frame, list or environment (or object
#' coercible by as.data.frame to a data.frame), containing the variables in
#' formula. Neither a matrix nor an array will be accepted.
#' @param data_new (data.frame, n_new*d) New data for computing predictions.
#'
#' @return \item{kern_term_list}{(list) A list of kernel matrices for each term
#' in the formula.}
#'
#' @author Jeremiah Zhe Liu
#' @keywords internal
#' @export parse_kernel_terms
parse_kernel_terms <-
function(kern_effect_formula, kern_func,
data, data_new = NULL) {
kern_var_table <- attr(terms(kern_effect_formula), "factors")
kern_var_names <- rownames(kern_var_table)
kern_term_names <- colnames(kern_var_table)
# prepare kernel variables
kern_var_list <- vector("list", length = length(kern_var_names))
names(kern_var_list) <- kern_var_names
for (var_name in kern_var_names) {
# compute individual kernel variables for each base kernel
kern_var_list[[var_name]] <-
parse_kernel_variable(var_name, kern_func = kern_func,
data = data, data_new = data_new)
}
# assemble kernel terms (for interaction terms)
kern_term_list <-
vector("list", length = length(kern_term_names))
names(kern_term_list) <- kern_term_names
for (term_id in 1:length(kern_term_names)) {
term_name <- kern_term_names[term_id]
kern_term_var_idx <- which(kern_var_table[, term_id] > 0)
# compute interaction term
kern_term_list[[term_name]] <-
Reduce("*", kern_var_list[kern_term_var_idx])
}
kern_term_list
}
#' Create Kernel Matrix
#'
#' Create kernel matrix for each variable in the formula.
#'
#' @param kern_var_name (vector of characters) Names of variables in data to
#' create the kernel matrix from. Must be a single term that is either of the form
#' \eqn{x} (a single linear term) or \eqn{k(x1, x2, \dots)} (a kernel term that may
#' contain multiple variables).
#' @param kern_func (function) A kernel function. Will be overwritten to linear
#' kernel if the variable doesn't contain 'k()'.
#' @param data (data.frame, n*d) A data.frame, list or environment (or object
#' coercible by as.data.frame to a data.frame), containing the variables in
#' formula. Neither a matrix nor an array will be accepted.
#' @param data_new (data.frame, n_new*d) New data for computing predictions.
#'
#' @return \item{kernel_mat}{(matrix, n*n) The kernel matrix corresponding to
#' the variable being computed.}
#'
#' @author Jeremiah Zhe Liu
#' @keywords internal
#' @export parse_kernel_variable
parse_kernel_variable <- function(kern_var_name, kern_func,
data, data_new=NULL) {
# parse kernel term
kern_term_formula <-
terms.formula(formula(paste("~", kern_var_name)),
specials = c("k"))
is_fixed_eff <- is.null(attr(kern_term_formula, "specials")$k)
input_var_names <- all.vars(kern_term_formula)
# sets kern_func to linear kernel if kern_var is a fixed-effect term
if (is_fixed_eff) {
kern_func <- generate_kernel(method = "linear")
}
# extract data
Z_extract_command <- gettextf("cbind(%s)",
paste(input_var_names, collapse = ", "))
Z_mat <- eval(parse(text = Z_extract_command), envir = data)
Z_mat_new <- Z_mat
if (!is.null(data_new)){
Z_mat_new <- eval(parse(text = Z_extract_command), envir = data_new)
}
# compute kernel matrix and return
kernel_mat <- kern_func(Z_mat_new, Z_mat)
kernel_mat
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.