knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
library(SplitKnockoff)
Author : Yuxuan Chen, Haoxue Wang, Yang Cao, Xinwei Sun, Yuan Yao
Split Knockoff is a data adaptive variable selection framework for controlling the (directional) false discovery rate (FDR) in structural sparsity, where variable selection on linear transformation of parameters is of concern. This proposed scheme relaxes the linear subspace constraint to its neighborhood, often known as variable splitting in optimization, which leads to better incoherence and possible performance improvement in both power and FDR. This vignette illustrates the usage of SplitKnockoff with simulation experiments in Cao et al. (2023)
install.packages("SplitKnockoff") # just one line code to install our package
This is a R implement on the Matlab version of Split Knockoffs. This R package is more convenient as glmnet can be used directly by
install.packages("glmnet") # just one line code to install the glmnet tool
Please update your glmnet package to the latest version for smooth usage of this package, the examples illustrated here are tested with glmnet 4.1-8.
For more information, please see the manual inside this package.
sk.filter(X, D, y, option) : the main function, Split Knockoff filter, for variable selection in structural sparsity problem.
sk.create(X, y, D, nu, option): generate the split knockoff copy for structural sparsity problem.
sk.W_path(X, D, y, nu, option): generate the $W$ statistics for split knockoff on a split LASSO path.
sk.W_fixed(X, D, y, nu, option): generate the $W$ statistics for split knockoff based on a fixed $\beta(\lambda) = \hat{\beta}$.
select(W, q, method): this function is for variable selection based on $W$ statistics.
install.packages("SplitKnockoff") # install our package library(latex2exp) library(ggplot2) library(Matrix) library(glmnet) library(SplitKnockoff)
k <- 20 # sparsity level A <- 1 # magnitude n <- 500 # sample size p <- 100 # dimension of variables c <- 0.5 # feature correlation sigma <-1 # noise level option <- list() # the target (directional) FDR control option$q <- 0.2 # whether to normalize the dataset option$normalize <- 'true' # fraction of data used to estimate \beta(\lambda) option$frac = 2/5 # choice on the set of regularization parameters for split LASSO path option$lambda <- 10.^seq(0, -6, by=-0.01) option$lambda_cv <- 10.^seq(0, -6, by=-0.01) # choice of nu for split knockoffs expo = seq(0, 2, by = 0.2) option$nu <- 10.^expo option$nu_cv <- 10.^expo num_nu <- length(option$nu) # set random seed option$seed = 1 # the number of simulation instances option$tests = 200 option$W = 's'
D_G = matrix(0, p-1, p) for (i in 1:(p-1)){ D_G[i, i] = 1 D_G[i, i+1] = -1 } # generate D1, D2, and D3 D_1 = diag(p) D_2 = D_G D_3 = rbind(diag(p), D_G) D_s = list(D_1 = D_1, D_2 = D_2, D_3 = D_3)
# generate Sigma Sigma = matrix(0, p, p) for( i in 1: p){ for(j in 1: p){ Sigma[i, j] <- c^(abs(i - j)) } }
library(mvtnorm) # package mvtnorm needed for this generation set.seed(100) X <- rmvnorm(n,matrix(0, p, 1), Sigma) # generate X
beta_true <- matrix(0, p, 1) for( i in 1: k){ beta_true[i, 1] = A if ( i%%3 == 1){ beta_true[i, 1] = 0 } }
Split knockoff for all $\nu$
# choice on the way generating the W statistics option$beta = 'path' # save Z t_Z r t_Z for making plots rawvalue = list() # save y Y = list() tests = option$tests # the number of experiments # create matrices to store results fdr_split = array(0, dim = c(3, tests, num_nu, 2)) power_split = array(0, dim = c(3, tests, num_nu, 2)) for (test in 1:tests){ # generate varepsilon set.seed(test) # generate noise and y varepsilon = matrix(rnorm(n), ncol = 1) * sqrt(sigma) y = X %*% beta_true + varepsilon Y[[test]] = y raw = list() for (j in 1:3){ # choose the respective D for each example D = D_s[[j]] gamma_true <- D %*% beta_true sk_results = sk.filter(X, D, y, option) results = sk_results$results raw[[j]] = sk_results$stats for (i in 1:num_nu){ r_sign = sk_results$stats[[i]]$r result = results[[i]]$sk fdr_split[j, test, i, 1] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1) power_split[j, test, i, 1] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0) result = results[[i]]$sk_plus fdr_split[j, test, i, 2] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1) power_split[j, test, i, 2] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0) } } rawvalue[[test]] = raw }
Split knockoff for $\nu$ chosen by cross validation
# choice on the way generating the W statistics option$beta = 'cv_all' # create matrices to store results fdr_cv = array(0, dim = c(3, tests, 2)) power_cv = array(0, dim = c(3, tests, 2)) for (test in 1:tests){ y <- Y[[test]] for (j in 1:3){ # choose the respective D for each example D = D_s[[j]] gamma_true <- D %*% beta_true sk_results = sk.filter(X, D, y, option) results = sk_results$results r_sign = sk_results$stats$r result = results$sk fdr_cv[j, test, 1] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1) power_cv[j, test, 1] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0) result = results$sk_plus fdr_cv[j, test, 2] = sum(sign(gamma_true[result]) != r_sign[result]) / max(length(result), 1) power_cv[j, test, 2] = sum(sign(gamma_true[result]) == r_sign[result]) / sum(gamma_true != 0) } } mean_fdr_cv <- apply(fdr_cv, c(1, 3), mean) sd_fdr_cv <- apply(fdr_cv, c(1, 3), sd) mean_power_cv <- apply(power_cv, c(1, 3), mean) sd_power_cv <- apply(power_cv, c(1, 3), sd)
Knockoff
library(knockoff) # package knockoff needed # create matrices to store results fdr_knockoff = array(0, dim = c(2, tests, 2)) power_knockoff = array(0, dim = c(2, tests, 2)) D = D_s[[2]] # Calculate X_bar, y_bar Z <- t(D) %*% solve(D %*% t(D)) XF <- X %*% rep(1, p) U_X <- XF / norm(XF, "F") UU_X <- cbind(U_X, matrix(0, n, n-1)) qrresult <- qr(UU_X) Qreslt <- qr.Q(qrresult) UX_perp <- Qreslt[,2:n] y_bar <- t(UX_perp) %*% y X_bar <- t(UX_perp) %*% X %*% Z for (test in 1:tests){ y <- Y[[test]] # choose D_1 for each example D = D_s[[1]] gamma_true <- D %*% beta_true k_results = knockoff.filter(X, y, knockoffs = {function(x) create.fixed(x, y=y, method='equi')}, statistic = stat.lasso_lambdasmax) W_k = k_results$statistic result = select(W_k, option$q, 'knockoff') fdr_knockoff[1, test, 1] = sum(gamma_true[result] == 0) / max(length(result), 1) power_knockoff[1, test, 1] = sum(gamma_true[result] != 0) / sum(gamma_true != 0) result = select(W_k, option$q, 'knockoff+') fdr_knockoff[1, test, 2] = sum(gamma_true[result] == 0) / max(length(result), 1) power_knockoff[1, test, 2] = sum(gamma_true[result] != 0) / sum(gamma_true != 0) # choose the D_2 for each example D = D_s[[2]] gamma_true <- D %*% beta_true k_results = knockoff.filter(X_bar, y_bar, knockoffs = {function(x) create.fixed(x, y=y_bar, method='equi')}, statistic = stat.lasso_lambdasmax) W_k = k_results$statistic result = select(W_k, option$q, 'knockoff') fdr_knockoff[2, test, 1] = sum(gamma_true[result] == 0) / max(length(result), 1) power_knockoff[2, test, 1] = sum(gamma_true[result] != 0) / sum(gamma_true != 0) result = select(W_k, option$q, 'knockoff+') fdr_knockoff[2, test, 2] = sum(gamma_true[result] == 0) / max(length(result), 1) power_knockoff[2, test, 2] = sum(gamma_true[result] != 0) / sum(gamma_true != 0) }
Plot figure 3
x <- expo t_value = qt(c(0.1, 0.9), tests - 1) lower_bound = t_value[1] upper_bound = t_value[2] fdr_knockoff_mean <- apply(fdr_knockoff, c(1, 3), mean) power_knockoff_mean <- apply(power_knockoff, c(1, 3), mean) fdr_knockoff_sd <- apply(fdr_knockoff, c(1, 3), sd) power_knockoff_sd <- apply(power_knockoff, c(1, 3), sd) fdr_split_mean <- apply(fdr_split, c(1, 3, 4), mean) fdr_split_sd <- apply(fdr_split, c(1, 3, 4), sd) power_split_mean <- apply(power_split, c(1, 3, 4), mean) power_split_sd <- apply(power_split, c(1, 3, 4), sd) fdr_split_top <- fdr_split_mean + fdr_split_sd * upper_bound fdr_split_bot <- fdr_split_mean + fdr_split_sd * lower_bound power_split_top <- power_split_mean + power_split_sd * upper_bound power_split_bot <- power_split_mean + power_split_sd * lower_bound ## for D_1 ## plot for FDR png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_31_fdr.png', height=1000, width=1000) plot_data <- data.frame( x = rep(x, 4), y = c(fdr_split_mean[1, , 1], fdr_split_mean[1, , 2], rep(fdr_knockoff_mean[1, 1], num_nu), rep(fdr_knockoff_mean[1, 2], num_nu)), type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu) ) # Plotting fig <- ggplot() + geom_line(data = plot_data, aes(x = x, y = y, color = type)) + geom_ribbon(aes(x = expo, ymin = fdr_split_bot[1, , 1], ymax = fdr_split_top[1, , 1]), fill = "red", alpha = 0.05) + geom_ribbon(aes(x = expo, ymin = fdr_split_bot[1, , 2], ymax = fdr_split_top[1, , 2]), fill = "blue", alpha = 0.05) + geom_abline(intercept = option$q, slope=0, linetype="dashed") + labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) + ggtitle(expression('D'[1])) + coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) + scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) + theme_light() + theme(plot.title = element_text(hjust = 0.5)) print(fig) dev.off() ## plot for Power png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_31_power.png', height=1000, width=1000) plot_data <- data.frame( x = rep(x, 4), y = c(power_split_mean[1, , 1], power_split_mean[1, , 2], rep(power_knockoff_mean[1, 1], num_nu), rep(power_knockoff_mean[1, 2], num_nu)), type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu) ) # Plotting fig <- ggplot() + geom_line(data = plot_data, aes(x = x, y = y, color = type)) + geom_ribbon(aes(x = expo, ymin = power_split_bot[1, , 1], ymax = power_split_top[1, , 1]), fill = "red", alpha = 0.05) + geom_ribbon(aes(x = expo, ymin = power_split_bot[1, , 2], ymax = power_split_top[1, , 2]), fill = "blue", alpha = 0.05) + labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") + ggtitle(expression('D'[1])) + coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) + scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) + theme_light() + theme(plot.title = element_text(hjust = 0.5)) print(fig) dev.off() ## for D_2 ## plot for FDR png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_32_fdr.png', height=1000, width=1000) plot_data <- data.frame( x = rep(x, 4), y = c(fdr_split_mean[2, , 1], fdr_split_mean[2, , 2], rep(fdr_knockoff_mean[2, 1], num_nu), rep(fdr_knockoff_mean[2, 2], num_nu)), type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu) ) # Plotting fig <- ggplot() + geom_line(data = plot_data, aes(x = x, y = y, color = type)) + geom_ribbon(aes(x = expo, ymin = fdr_split_bot[2, , 1], ymax = fdr_split_top[2, , 1]), fill = "red", alpha = 0.05) + geom_ribbon(aes(x = expo, ymin = fdr_split_bot[2, , 2], ymax = fdr_split_top[2, , 2]), fill = "blue", alpha = 0.05) + geom_abline(intercept = option$q, slope=0, linetype="dashed") + labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) + ggtitle(expression('D'[2])) + coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) + scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) + theme_light() + theme(plot.title = element_text(hjust = 0.5)) print(fig) dev.off() ## plot for Power png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_32_power.png', height=1000, width=1000) plot_data <- data.frame( x = rep(x, 4), y = c(power_split_mean[2, , 1], power_split_mean[2, , 2], rep(power_knockoff_mean[2, 1], num_nu), rep(power_knockoff_mean[2, 2], num_nu)), type = rep(c("Split Knockoff", "Split Knockoff+", "Knockoff", "Knockoff+"), each = num_nu) ) # Plotting fig <- ggplot() + geom_line(data = plot_data, aes(x = x, y = y, color = type)) + geom_ribbon(aes(x = expo, ymin = power_split_bot[2, , 1], ymax = power_split_top[2, , 1]), fill = "red", alpha = 0.05) + geom_ribbon(aes(x = expo, ymin = power_split_bot[2, , 2], ymax = power_split_top[2, , 2]), fill = "blue", alpha = 0.05) + labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") + ggtitle(expression('D'[2])) + coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) + scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue", "Knockoff" = "green", "Knockoff+" = "yellow")) + theme_light() + theme(plot.title = element_text(hjust = 0.5)) print(fig) dev.off() ## for D_3 ## plot for FDR png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_33_fdr.png', height=1000, width=1000) plot_data <- data.frame( x = rep(x, 2), y = c(fdr_split_mean[3, , 1], fdr_split_mean[3, , 2]), type = rep(c("Split Knockoff", "Split Knockoff+"), each = num_nu) ) # Plotting fig <- ggplot() + geom_line(data = plot_data, aes(x = x, y = y, color = type)) + geom_ribbon(aes(x = expo, ymin = fdr_split_bot[3, , 1], ymax = fdr_split_top[3, , 1]), fill = "red", alpha = 0.05) + geom_ribbon(aes(x = expo, ymin = fdr_split_bot[3, , 2], ymax = fdr_split_top[3, , 2]), fill = "blue", alpha = 0.05) + geom_abline(intercept = option$q, slope=0, linetype="dashed") + labs(x = TeX("$\\log_{10} (\\nu)$"), y = TeX("$FDR_{dir}$")) + ggtitle(expression('D'[3])) + coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) + scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue")) + theme_light() + theme(plot.title = element_text(hjust = 0.5)) print(fig) dev.off() ## plot for Power png(file='D:/SplitKnockoff_results/simu_experiments/Figure/figure_33_power.png', height=1000, width=1000) plot_data <- data.frame( x = rep(x, 2), y = c(power_split_mean[3, , 1], power_split_mean[3, , 2]), type = rep(c("Split Knockoff", "Split Knockoff+"), each = num_nu) ) # Plotting fig <- ggplot() + geom_line(data = plot_data, aes(x = x, y = y, color = type)) + geom_ribbon(aes(x = expo, ymin = power_split_bot[3, , 1], ymax = power_split_top[3, , 1]), fill = "red", alpha = 0.05) + geom_ribbon(aes(x = expo, ymin = power_split_bot[3, , 2], ymax = power_split_top[3, , 2]), fill = "blue", alpha = 0.05) + labs(x = TeX("$\\log_{10} (\\nu)$"), y = "Power") + ggtitle(expression('D'[3])) + coord_cartesian(xlim = c(0, 2), ylim = c(0, 1), expand = FALSE) + scale_color_manual(values = c("Split Knockoff" = "red", "Split Knockoff+" = "blue")) + theme_light() + theme(plot.title = element_text(hjust = 0.5)) print(fig) dev.off()
saveRDS(list(mean_fdr_D1_knockoff = fdr_knockoff_mean[1, 1], sd_fdr_D1_knockoff = fdr_knockoff_sd[1, 1], mean_power_D1_knockoff = power_knockoff_mean[1, 1], sd_power_D1_knockoff = power_knockoff_sd[1, 1], mean_fdr_D2_knockoff = fdr_knockoff_mean[2, 1], sd_fdr_D2_knockoff = fdr_knockoff_sd[2, 1], mean_power_D2_knockoff = power_knockoff_mean[2, 1], sd_power_D2_knockoff = power_knockoff_sd[2, 1], mean_fdr_D1_sk = mean_fdr_cv[1, 1], sd_fdr_D1_sk = sd_fdr_cv[1, 1], mean_power_D1_sk = mean_power_cv[1, 1], sd_power_D1_sk = sd_power_cv[1, 1], mean_fdr_D2_sk = mean_fdr_cv[2, 1], sd_fdr_D2_sk = sd_fdr_cv[2, 1], mean_power_D2_sk = mean_power_cv[2, 1], sd_power_D2_sk = sd_power_cv[2, 1], mean_fdr_D3_sk = mean_fdr_cv[3, 1], sd_fdr_D3_sk = sd_fdr_cv[3, 1], mean_power_D3_sk = mean_power_cv[3, 1], sd_power_D3_sk = sd_power_cv[3, 1], mean_fdr_D1_knockoff_pl = fdr_knockoff_mean[1, 2], sd_fdr_D1_knockoff_pl = fdr_knockoff_sd[1, 2], mean_power_D1_knockoff_pl = power_knockoff_mean[1, 2], sd_power_D1_knockoff_pl = power_knockoff_sd[1, 2], mean_fdr_D2_knockoff_pl = fdr_knockoff_mean[2, 2], sd_fdr_D2_knockoff_pl = fdr_knockoff_sd[2, 2], mean_power_D2_knockoff_pl = power_knockoff_mean[2, 2], sd_power_D2_knockoff_pl = power_knockoff_sd[2, 2], mean_fdr_D1_sk_pl = mean_fdr_cv[1, 2], sd_fdr_D1_sk_pl = sd_fdr_cv[1, 2], mean_power_D1_sk_pl = mean_power_cv[1, 2], sd_power_D1_sk_pl = sd_power_cv[1, 2], mean_fdr_D2_sk_pl = mean_fdr_cv[2, 2], sd_fdr_D2_sk_pl = sd_fdr_cv[2, 2], mean_power_D2_sk_pl = mean_power_cv[2, 2], sd_power_D2_sk_pl = sd_power_cv[2, 2], mean_fdr_D3_sk_pl = mean_fdr_cv[3, 2], sd_fdr_D3_sk_pl = sd_fdr_cv[3, 2], mean_power_D3_sk_pl = mean_power_cv[3, 2], sd_power_D3_sk_pl = sd_power_cv[3, 2]), file = 'D:/SplitKnockoff_results/simu_experiments/Table/table1.rds')
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.