knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)
In this document we will compare several different approaches to building confidence intervals in high dimensions using the example in Section 1.5 of "Rank conditional coverage and confidence intervals in high dimensional problems" by Jean Morrison and Noah Simon.
We will start by setting up the problem and generating data. Then we will run each method one at a time for a single scenario. At the end, you will find the code that generates the results shown in the paper exactly.
For this exploration we will use setting 2 from section 1.5. We have 1000 parameters generated from a $N(0, 1)$ distribution. Each of our observed statistics $Z_i$ is generated as $$Z_i \sim N(\theta_i, 1).$$ We will rank these statistics based on their absolute value.
library(ggplot2) library(rcc) library(rccSims) library(selectiveInference) set.seed(1e7) theta <- rnorm(1000) Z <- rnorm(n=1000, mean=theta) j <- order(abs(Z), decreasing=TRUE) rank <- match(1:1000, j)
We construct standard marginal confidence intervals for the parameters associated with each of these estimates. We will use $\alpha=0.1$ to give an expected rate of coverage of 90\%.
ci.naive <- cbind(Z - qnorm(0.95), Z + qnorm(0.95)) #Average coverage sum(ci.naive[,1] <= theta & ci.naive[,2] >= theta)/1000
We can plot these intervals and color them by whether or not they cover their target parameter. Notice that even though the overall coverage rate is close to the nominal level, a lot of the non-covering intervals are constructed for the most extreme statistics.
rccSims::plot_cis(rank, ci.naive, theta) + ggtitle("Standard Marginal Intervals")
Here is the same plot for only the top 20\% of statistics. In this plot, points show the value of the true parameter.
rccSims::plot_cis(rank, ci.naive, theta, prop = 0.2, plot.truth=TRUE) + ggtitle("Standard Marginal Intervals")
This is just one data set. To see what the expected coverage rates at each rank are we'll simulate 100 more data sets:
z_many <- replicate(n=100, expr={rnorm(n=1000, mean=theta)}) naive.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){ j <- order(abs(zz), decreasing=TRUE) ci <- cbind(zz - qnorm(0.975), zz + qnorm(0.975)) covered <- (ci[,1] <= theta & ci[,2] >= theta)[j] return(covered) })
Plotting the average coverage at each rank:
dat <- data.frame("Rank"=1:1000, "Coverage"=rowMeans(naive.coverage)) ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage") + geom_hline(yintercept = 0.9)+ ggtitle("Average Coverage of Marginal Intervals") + theme_bw() + theme(panel.grid=element_blank())
Now lets look at the intervals from the selection adjusted methods proposed by @Benjamini2005, @Weinstein2013, and @Reid2014. These approaches all require us to make a selection before constructing intervals and we will get different intervals if we select a different subset of estimates. For this exploration, we select the top ten percent of estimates based on the absolute value ranking.
Using this selection rule, the @Benjamini2005 intervals are equivalent to the $1-\frac{100*0.1}{1000} = 0.99$ coverage marginal intervals:
Z.sel <- Z[rank <= 100] theta.sel <- theta[rank <= 100] ci.by <- cbind(Z.sel - qnorm(0.995) , Z.sel + qnorm(0.995)) sum(ci.by[,1] <= theta.sel & ci.by[,2] >= theta.sel)/100
rccSims::plot_cis(rank[rank <= 100], ci.by, theta.sel, plot.truth=TRUE) + ggtitle("Benjamini and Yukatieli Intervals")
Now averaging coverage of the BY intervals over 100 data sets:
system.time(by.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){ j <- order(abs(zz), decreasing=TRUE) ci <- cbind(zz[j][1:100] - qnorm(0.995), zz[j][1:100] + qnorm(0.995)) covered <- (ci[,1] <= theta[j][1:100] & ci[,2] >= theta[j][1:100]) return(covered) })) #The average coverage in each simulation is close to the nominal level summary(colMeans(by.coverage))
dat <- data.frame("Rank"=1:100, "Coverage"=rowMeans(by.coverage)) ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage") + geom_hline(yintercept=0.9) + ggtitle("Average Coverage of Benjamini and Yukatielli Intervals") + theme_bw() + theme(panel.grid=element_blank())
These intervals have reduced coverage for the top ranked estimates even though the average coverage in the selected set is correct.
For the @Weinstein2013 intervals, we use code that accompanied that paper. This code is included in the rccSims
package for convenience. The Shortest.CI
function used below is part of this code. These intervals are asymetric and narrower than the @Benjamini2005 intervals but still control the average coverage in the selected set.
#We need to give this method the "cutpoint" or minimum value of Z ct <- abs(Z[rank==101]) wfb <- lapply(Z, FUN=function(x){ if(abs(x) < ct) return(c(NA, NA)) ci <- try(rccSims:::Shortest.CI(x, ct=ct, alpha=0.1), silent=TRUE) if(class(ci) == "try-error") return(c(NA, NA)) #Sometimes WFB code produces errors return(ci) }) ci.wfb <- matrix(unlist(wfb), byrow=TRUE, nrow=1000)[rank <= 100,] sum(ci.wfb[,1] <= theta[rank <=100] & ci.wfb[,2]>= theta[rank<=100])/100
rccSims::plot_cis(rank[rank <= 100], ci.wfb, theta.sel, plot.truth=TRUE) + ggtitle("Weinstein, Fithian, and Benjamini Intervals")
Most of the non-covering intervals are still for the parameters with the most significnt estimates. This pattern is clear when we look at the average coverage over 100 data sets:
system.time(wfb.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){ j <- order(abs(zz), decreasing=TRUE) ct <- abs(zz[j][101]) wfb <- lapply(zz[j][1:100], FUN=function(x){ if(abs(x) < ct) return(c(NA, NA)) ci <- try(rccSims:::Shortest.CI(x, ct=ct, alpha=0.1), silent=TRUE) if(class(ci) == "try-error") return(c(NA, NA)) #Sometimes WFB code produces errors return(ci) }) ci <- matrix(unlist(wfb), byrow=TRUE, nrow=100) covered <- (ci[,1] <= theta[j][1:100] & ci[,2] >= theta[j][1:100]) return(covered) })) #The average coverage in each simulation is close to the nominal level summary(colMeans(wfb.coverage))
dat <- data.frame("Rank"=1:100, "Coverage"=rowMeans(wfb.coverage)) ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage") + geom_hline(yintercept=0.9) + ggtitle("Average Coverage of Weinstein, Fithian and Benjamini Intervals") + theme_bw() + theme(panel.grid=element_blank())
Finally, lets look at the intervals of @Reid2014. These are implemented in the selectiveInference
R package
M <- manyMeans(y=Z, k=100, alpha=0.1, sigma=1) ci.rtt <- matrix(nrow=1000, ncol=2) ci.rtt[M$selected.set, ] <- M$ci ci.rtt <- ci.rtt[rank <= 100,] sum(ci.rtt[,1] <= theta[rank <=100] & ci.rtt[,2]>= theta[rank<=100])/100
rccSims::plot_cis(rank[rank <= 100], ci.rtt, theta.sel, plot.truth=TRUE) + ggtitle("Reid, Taylor, and Tibshirani Intervals")
For the most part, these intervals are narrower than the @Weinstein2013 intervals. Non-covering intervals tend to be concentrated at the two extremes -- parameters associated with the most and least significant estimates tend to go uncovered. We can see this by looking at the average over 100 simulations:
system.time(rtt.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){ j <- order(abs(zz), decreasing=TRUE) M <- manyMeans(y=zz, k=100, alpha=0.1, sigma=1) ci <- matrix(nrow=1000, ncol=2) ci[M$selected.set, ] <- M$ci ci <- ci[j,][1:100,] covered <- (ci[,1] <= theta[j][1:100] & ci[,2] >= theta[j][1:100]) return(covered) })) #The average coverage in each simulation is close to the nominal level summary(colMeans(rtt.coverage))
dat <- data.frame("Rank"=1:100, "Coverage"=rowMeans(rtt.coverage)) ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage") + geom_hline(yintercept=0.9) + ggtitle("Average Coverage of Reid, Taylor, and Tibshirani Intervals") + theme_bw() + theme(panel.grid=element_blank())
Now we will construct intervals for the same problem using the parametric bootstrap described in Section 2.3. Since we are ranking based on absolute value, we will use the variation given in Supplementary Algorithm 2. The procedure is implemented in the par_bs_ci
function in the rcc
package but here we go through the steps explicitly. First we estimate the average bias at each rank by bootstrapping:
set.seed(13421) B <- replicate(n = 500, expr = { w <- rnorm(n=1000, mean=Z, sd=1) k <- order(abs(w), decreasing=TRUE) sign(w[k])*(w[k]-Z[k]) }) dim(B)
Next we calculate the 0.05 and 0.95 quantiles of the bias for each rank.
qs <- apply(B, MARGIN=1, FUN=function(x){quantile(x, probs=c(0.05, 0.95))})
Finally, we construct the intervals by pivoting
ci.boot <- cbind(Z[j]-qs[2,], Z[j]-qs[1,]) which.neg <- which(Z[j] < 0) ci.boot[ which.neg , ] <- cbind(Z[j][which.neg] + qs[1,which.neg], Z[j][which.neg]+qs[2,which.neg]) #Get CI's in the same order as estimates ci.boot <- ci.boot[rank,] sum(ci.boot[,1] <= theta & ci.boot[,2] >= theta)/1000
For comparison, here is how to get the same intervals using the rcc
package.
set.seed(13421) ci.boot2 <- rcc::par_bs_ci(beta=Z, n.rep=500) head(ci.boot2) all.equal(ci.boot2$ci.lower, ci.boot[,1]) all.equal(ci.boot2$ci.upper, ci.boot[,2])
Here are the parametric bootstrap intervals for all ranks:
rccSims::plot_cis(rank, ci.boot, theta) + ggtitle("Parametric Bootstrap Intervals")
and for just the top 20\% of ranks
rccSims::plot_cis(rank, ci.boot, theta, prop=0.2, plot.truth=TRUE) + ggtitle("Parametric Bootstrap Intervals")
Looking at the average coverage over 100 data sets we see that the parametric bootstrap intervals are slightly conservative but that the average coverage for each rank is close to the nominal level.
system.time(boot.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){ ci <- rcc::par_bs_ci(beta=zz)[, c("ci.lower", "ci.upper")] covered <- (ci[,1] <= theta & ci[,2] >= theta) return(covered) })) summary(colMeans(boot.coverage))
dat <- data.frame("Rank"=1:1000, "Coverage"=rowMeans(boot.coverage)) ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage") + geom_hline(yintercept=0.9) + ggtitle("Average Coverage of Parametric Bootstrap Intervals") + theme_bw() + theme(panel.grid=element_blank())
The difference between the observed average coverage at each rank and the nominal level is a result of using $Z_i$ as an estimate for $\theta_i$ in the bootstrapping step. If we knew $\theta_i$ we could generate the oracle bootstrap intervals which achieve exactly the right coverage at each rank (and are much smaller):
oracle.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){ ci <- rcc::par_bs_ci(beta=zz, theta=theta)[, c("ci.lower", "ci.upper")] covered <- (ci[,1] <= theta & ci[,2] >= theta) return(covered) }) summary(colMeans(oracle.coverage))
dat <- data.frame("Rank"=1:1000, "Coverage"=rowMeans(oracle.coverage)) ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage") + geom_hline(yintercept=0.9) + ggtitle("Average Coverage of Oracle Intervals") + theme_bw() + theme(panel.grid=element_blank())
There are also empirical Bayes (EB) proposals for estimating praameters in high dimensional settings. Here we will look at the credible intervals generated using the method of @Stephens2016, which is implemented in the ashr
package. This method assumes that $\theta_i$ are drawn from a unimodal distribution and that $Z_i \sim N(\theta_i, \sigma_i)$ where $\sigma_i$ is known. These assumptions both hold in this case so ashr
does well. Unlike the selection adjusted methods, ashr
also has an RCC close to the nominal level at every rank.
library(ashr) ash.res <- ash(betahat = Z, sebetahat = rep(1, 1000), mixcompdist = "normal") ci.ash <- ashci(ash.res, level=0.9, betaindex = 1:1000, trace=FALSE) sum(ci.ash[,1]<= theta & ci.ash[,2] >= theta)/1000
Here are the ashr intervals at all ranks
rccSims::plot_cis(rank, ci.ash, theta) + ggtitle("ashr Credible Intervals (Stephens 2016)")
and for just the parameters with estimates in the top 20\%
rccSims::plot_cis(rank, ci.ash, theta, prop=0.2, plot.truth=TRUE) + ggtitle("ashr Credible Intervals (Stephens 2016)")
We can look at the average coverage of the ashr
credible intervals over 100 simulations. It is worth noting that we have conducted the simulations from more of a frequentist point of view --- the parameters were fixed at the beginning and in each simulation we simply generate new statistics. From a Bayesian perspective, it would be more appropriate to generate a new parameter vector from the prior each time. This might explain why we get overall slight undercoverage from the ashr
intervals in this setting. ashr
also takes substantially longer to run than the parametric bootstrap (on a normal laptop, the code below took nearly 7 minutes while the parametric bootstrap took only 30 seconds.) In this example, ashr
does a good job controlling the RCC for the top parameters. This is, in part, because the true parameters are sparse. We found in the paper that when the parameters are not sparse, performance is much worse.
system.time(ash.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){ j <- order(abs(zz), decreasing = TRUE) ash.res <- ash(betahat = zz, sebetahat = rep(1, 1000), mixcompdist = "normal") ci <- ashci(ash.res, level=0.9, betaindex = 1:1000, trace=FALSE) covered <- (ci[,1] <= theta & ci[,2] >= theta)[j] return(covered) })) summary(colMeans(ash.coverage))
dat <- data.frame("Rank"=1:1000, "Coverage"=rowMeans(ash.coverage)) ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage") + geom_hline(yintercept=0.9) + ggtitle("Average Coverage of ashr Credible Intervals") + theme_bw() + theme(panel.grid=element_blank())
All of the interval construction methods described in the previous section (except for the @Benjamini2005 intervals) are included in the example_sim
function in the rccSims
package. In Section 1.5, we look at four sets of true parameters:
First we generate the four vectors of parameters
example_params <- cbind(rep(0, 1000), rnorm(n=1000), rep(c(0, 3), c(900, 100)), c(rep(0, 900), rnorm(100))) titles <- c("All Zero", "All N(0, 1)", "100 Effects=3", "100 Effects N(0, 1)")
This matrix is also included as a builtin data set in the rccSims
package.
We generate simulation results using the example_sim
function. This function just repeatedly executes the steps in the previous sections and records the coverage and width of the intervals.
set.seed(6587900) sim.list <- list() for(i in 1:4){ sim.list[[i]] <- example_sim(example_params[,i], n=100, use.abs=TRUE) }
These results are included as a built-in data set to the rccSims
package.
The plot_coverage
and plot_width
functions in the rccSims
package create plots of the results.
data("sim.list", package="rccSims") covplots <- list() widthplots <- list() titles <- c("All Zero", "All N(0, 1)", "100 Effects=3", "100 Effects N(0, 1)") for(i in 1:4){ lp <- "none" covplots[[i]] <- plot_coverage(sim.list[[i]], proportion=0.2, cols=c("black", "deeppink3", "blue", "gold4", "forestgreen", "purple"), simnames=c("naive", "par", "oracle", "ash", "wfb", "selInf1"), ltys= c(2, 1, 3, 6, 4, 2), span=0.5, main=titles[i], y.range=c(-0.02, 1.02), legend.position = lp) + theme(plot.title=element_text(hjust=0.5)) widthplots[[i]] <- plot_width(sim.list[[i]], proportion=0.2, cols=c("black", "deeppink3", "blue", "gold4", "forestgreen", "purple"), simnames=c("naive", "par", "oracle", "ash", "wfb", "selInf1"), ltys= c(2, 1, 3, 6, 4, 2), span=0.5, main=titles[i], legend.position = lp)+ theme(plot.title=element_text(hjust=0.5)) } legend <- rccSims:::make_sim_legend(legend.names = c("Marginal", "Parametric\nBootstrap", "Oracle", "ash", "WFB", "RTT"), cols=c("black", "deeppink3", "blue", "gold4", "forestgreen", "purple"), ltys= c(2, 1, 3, 6, 4, 2))
covplots[[1]] widthplots[[1]]
covplots[[2]] widthplots[[2]]
covplots[[3]] widthplots[[3]]
covplots[[4]] widthplots[[4]]
library(gridExtra) h1 <- grid.arrange(covplots[[1]], covplots[[2]], covplots[[3]], covplots[[4]], ncol=4) h2 <- grid.arrange(widthplots[[1]], widthplots[[2]], widthplots[[3]], widthplots[[4]], ncol=4) ggsave(h1, file="~/Dropbox/Confidence_Intervals/for_jcgs/img/example_cov.png", heigh=4, width=16, units="in", dpi=300) ggsave(h2, file="~/Dropbox/Confidence_Intervals/for_jcgs/img/example_width.png", heigh=4, width=16, units="in", dpi=300) ggsave(legend$plot, file="~/Dropbox/Confidence_Intervals/for_jcgs/img/example_legend.png", height=1, width=8, units="in", dpi=300)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.