In jean997/rccSims: Replicate simulations in "Rank conditional coverage and confidence intervals in high dimensional problems" by Jean Morrison and Noah Simon.

knitr::opts_chunk$set(echo = TRUE, warning=FALSE, message=FALSE)

In this document we will compare several different approaches to building confidence intervals in high dimensions using the example in Section 1.5 of "Rank conditional coverage and confidence intervals in high dimensional problems" by Jean Morrison and Noah Simon.

We will start by setting up the problem and generating data. Then we will run each method one at a time for a single scenario. At the end, you will find the code that generates the results shown in the paper exactly.

Generate data

For this exploration we will use setting 2 from section 1.5. We have 1000 parameters generated from a $N(0, 1)$ distribution. Each of our observed statistics $Z_i$ is generated as $$Z_i \sim N(\theta_i, 1).$$ We will rank these statistics based on their absolute value.

library(ggplot2)
library(rcc)
library(rccSims)
library(selectiveInference)

set.seed(1e7)
theta <- rnorm(1000)
Z <- rnorm(n=1000, mean=theta)
j <- order(abs(Z), decreasing=TRUE)
rank <- match(1:1000, j)

Naive confidence intervals

We construct standard marginal confidence intervals for the parameters associated with each of these estimates. We will use $\alpha=0.1$ to give an expected rate of coverage of 90\%.

ci.naive <- cbind(Z - qnorm(0.95), Z + qnorm(0.95))
#Average coverage
sum(ci.naive[,1] <= theta & ci.naive[,2] >= theta)/1000

We can plot these intervals and color them by whether or not they cover their target parameter. Notice that even though the overall coverage rate is close to the nominal level, a lot of the non-covering intervals are constructed for the most extreme statistics.

rccSims::plot_cis(rank, ci.naive, theta) + ggtitle("Standard Marginal Intervals")

Here is the same plot for only the top 20\% of statistics. In this plot, points show the value of the true parameter.

rccSims::plot_cis(rank, ci.naive, theta, prop = 0.2, plot.truth=TRUE) + ggtitle("Standard Marginal Intervals")

This is just one data set. To see what the expected coverage rates at each rank are we'll simulate 100 more data sets:

z_many <- replicate(n=100, expr={rnorm(n=1000, mean=theta)})
naive.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){
  j <- order(abs(zz), decreasing=TRUE)
  ci <- cbind(zz - qnorm(0.975), zz + qnorm(0.975))
  covered <- (ci[,1] <= theta & ci[,2] >= theta)[j]
  return(covered)
})

Plotting the average coverage at each rank:

dat <- data.frame("Rank"=1:1000, "Coverage"=rowMeans(naive.coverage))
ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage")  + 
  geom_hline(yintercept = 0.9)+
  ggtitle("Average Coverage of Marginal Intervals") + theme_bw() + theme(panel.grid=element_blank())

Selection adjusted confidence intervals

Now lets look at the intervals from the selection adjusted methods proposed by @Benjamini2005, @Weinstein2013, and @Reid2014. These approaches all require us to make a selection before constructing intervals and we will get different intervals if we select a different subset of estimates. For this exploration, we select the top ten percent of estimates based on the absolute value ranking.

@Benjamini2005 Intervals

Using this selection rule, the @Benjamini2005 intervals are equivalent to the $1-\frac{100*0.1}{1000} = 0.99$ coverage marginal intervals:

Z.sel <- Z[rank <= 100]
theta.sel <- theta[rank <= 100]
ci.by <- cbind(Z.sel - qnorm(0.995) , Z.sel + qnorm(0.995))
sum(ci.by[,1] <= theta.sel & ci.by[,2] >= theta.sel)/100

rccSims::plot_cis(rank[rank <= 100], ci.by, theta.sel, plot.truth=TRUE) + ggtitle("Benjamini and Yukatieli Intervals")

Now averaging coverage of the BY intervals over 100 data sets:

system.time(by.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){
  j <- order(abs(zz), decreasing=TRUE)
  ci <- cbind(zz[j][1:100] - qnorm(0.995), zz[j][1:100] + qnorm(0.995))
  covered <- (ci[,1] <= theta[j][1:100] & ci[,2] >= theta[j][1:100])
  return(covered)
}))
#The average coverage in each simulation is close to the nominal level
summary(colMeans(by.coverage))

dat <- data.frame("Rank"=1:100, "Coverage"=rowMeans(by.coverage))
ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage")  + 
  geom_hline(yintercept=0.9) + 
  ggtitle("Average Coverage of Benjamini and Yukatielli Intervals") + 
  theme_bw() + theme(panel.grid=element_blank())

These intervals have reduced coverage for the top ranked estimates even though the average coverage in the selected set is correct.

@Weinstein2013 Intervals

For the @Weinstein2013 intervals, we use code that accompanied that paper. This code is included in the rccSims package for convenience. The Shortest.CI function used below is part of this code. These intervals are asymetric and narrower than the @Benjamini2005 intervals but still control the average coverage in the selected set.

#We need to give this method the "cutpoint" or minimum value of Z
ct <- abs(Z[rank==101])
wfb <- lapply(Z, FUN=function(x){
              if(abs(x) < ct) return(c(NA, NA))
              ci <- try(rccSims:::Shortest.CI(x, ct=ct, alpha=0.1), silent=TRUE)
              if(class(ci) == "try-error") return(c(NA, NA)) #Sometimes WFB code produces errors
              return(ci)
             })
ci.wfb <- matrix(unlist(wfb), byrow=TRUE, nrow=1000)[rank <= 100,]
sum(ci.wfb[,1] <= theta[rank <=100] & ci.wfb[,2]>= theta[rank<=100])/100

rccSims::plot_cis(rank[rank <= 100], ci.wfb, theta.sel, plot.truth=TRUE) + ggtitle("Weinstein, Fithian, and Benjamini Intervals")

Most of the non-covering intervals are still for the parameters with the most significnt estimates. This pattern is clear when we look at the average coverage over 100 data sets:

system.time(wfb.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){
  j <- order(abs(zz), decreasing=TRUE)
  ct <- abs(zz[j][101])
  wfb <- lapply(zz[j][1:100], FUN=function(x){
              if(abs(x) < ct) return(c(NA, NA))
              ci <- try(rccSims:::Shortest.CI(x, ct=ct, alpha=0.1), silent=TRUE)
              if(class(ci) == "try-error") return(c(NA, NA)) #Sometimes WFB code produces errors
              return(ci)
             })
  ci <- matrix(unlist(wfb), byrow=TRUE, nrow=100)
  covered <- (ci[,1] <= theta[j][1:100] & ci[,2] >= theta[j][1:100])
  return(covered)
}))
#The average coverage in each simulation is close to the nominal level
summary(colMeans(wfb.coverage))

dat <- data.frame("Rank"=1:100, "Coverage"=rowMeans(wfb.coverage))
ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage")  + 
  geom_hline(yintercept=0.9) + 
  ggtitle("Average Coverage of Weinstein, Fithian and Benjamini Intervals") + 
  theme_bw() + theme(panel.grid=element_blank())

@Reid2014 Intervals

Finally, lets look at the intervals of @Reid2014. These are implemented in the selectiveInference R package

M <- manyMeans(y=Z, k=100, alpha=0.1, sigma=1)
ci.rtt <- matrix(nrow=1000, ncol=2)
ci.rtt[M$selected.set, ] <- M$ci
ci.rtt <- ci.rtt[rank <= 100,]
sum(ci.rtt[,1] <= theta[rank <=100] & ci.rtt[,2]>= theta[rank<=100])/100

rccSims::plot_cis(rank[rank <= 100], ci.rtt, theta.sel, plot.truth=TRUE) + ggtitle("Reid, Taylor, and Tibshirani Intervals")

For the most part, these intervals are narrower than the @Weinstein2013 intervals. Non-covering intervals tend to be concentrated at the two extremes -- parameters associated with the most and least significant estimates tend to go uncovered. We can see this by looking at the average over 100 simulations:

system.time(rtt.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){
  j <- order(abs(zz), decreasing=TRUE)
  M <- manyMeans(y=zz, k=100, alpha=0.1, sigma=1)
  ci <- matrix(nrow=1000, ncol=2)
  ci[M$selected.set, ] <- M$ci
  ci <- ci[j,][1:100,]
  covered <- (ci[,1] <= theta[j][1:100] & ci[,2] >= theta[j][1:100])
  return(covered)
}))
#The average coverage in each simulation is close to the nominal level
summary(colMeans(rtt.coverage))

dat <- data.frame("Rank"=1:100, "Coverage"=rowMeans(rtt.coverage))
ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage")  + 
  geom_hline(yintercept=0.9) + 
  ggtitle("Average Coverage of Reid, Taylor, and Tibshirani Intervals") + 
  theme_bw() + theme(panel.grid=element_blank())

Parametric Bootstrap

Now we will construct intervals for the same problem using the parametric bootstrap described in Section 2.3. Since we are ranking based on absolute value, we will use the variation given in Supplementary Algorithm 2. The procedure is implemented in the par_bs_ci function in the rcc package but here we go through the steps explicitly. First we estimate the average bias at each rank by bootstrapping:

set.seed(13421)
B <- replicate(n = 500, expr = {
    w <- rnorm(n=1000, mean=Z, sd=1)
    k <- order(abs(w), decreasing=TRUE)
    sign(w[k])*(w[k]-Z[k])
})
dim(B)

Next we calculate the 0.05 and 0.95 quantiles of the bias for each rank.

qs <- apply(B, MARGIN=1, FUN=function(x){quantile(x, probs=c(0.05, 0.95))})

Finally, we construct the intervals by pivoting

ci.boot <- cbind(Z[j]-qs[2,], Z[j]-qs[1,])
which.neg <- which(Z[j] < 0)
ci.boot[ which.neg , ] <- cbind(Z[j][which.neg] + qs[1,which.neg], Z[j][which.neg]+qs[2,which.neg])
#Get CI's in the same order as estimates
ci.boot <- ci.boot[rank,]
sum(ci.boot[,1] <= theta & ci.boot[,2] >= theta)/1000

For comparison, here is how to get the same intervals using the rcc package.

set.seed(13421)
ci.boot2 <- rcc::par_bs_ci(beta=Z, n.rep=500)
head(ci.boot2)
all.equal(ci.boot2$ci.lower, ci.boot[,1])
all.equal(ci.boot2$ci.upper, ci.boot[,2])

Here are the parametric bootstrap intervals for all ranks:

rccSims::plot_cis(rank, ci.boot, theta) + ggtitle("Parametric Bootstrap Intervals")

and for just the top 20\% of ranks

rccSims::plot_cis(rank, ci.boot, theta, prop=0.2, plot.truth=TRUE) + ggtitle("Parametric Bootstrap Intervals")

Looking at the average coverage over 100 data sets we see that the parametric bootstrap intervals are slightly conservative but that the average coverage for each rank is close to the nominal level.

system.time(boot.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){
  ci <- rcc::par_bs_ci(beta=zz)[, c("ci.lower", "ci.upper")]
  covered <- (ci[,1] <= theta & ci[,2] >= theta)
  return(covered)
}))
summary(colMeans(boot.coverage))

dat <- data.frame("Rank"=1:1000, "Coverage"=rowMeans(boot.coverage))
ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage")  + 
  geom_hline(yintercept=0.9) + 
  ggtitle("Average Coverage of Parametric Bootstrap Intervals") + 
  theme_bw() + theme(panel.grid=element_blank())

The difference between the observed average coverage at each rank and the nominal level is a result of using $Z_i$ as an estimate for $\theta_i$ in the bootstrapping step. If we knew $\theta_i$ we could generate the oracle bootstrap intervals which achieve exactly the right coverage at each rank (and are much smaller):

oracle.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){
  ci <- rcc::par_bs_ci(beta=zz, theta=theta)[, c("ci.lower", "ci.upper")]
  covered <- (ci[,1] <= theta & ci[,2] >= theta)
  return(covered)
})
summary(colMeans(oracle.coverage))

dat <- data.frame("Rank"=1:1000, "Coverage"=rowMeans(oracle.coverage))
ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage")  + 
  geom_hline(yintercept=0.9) + 
  ggtitle("Average Coverage of Oracle Intervals") + 
  theme_bw() + theme(panel.grid=element_blank())

Empirical Bayes Credible Intervals

There are also empirical Bayes (EB) proposals for estimating praameters in high dimensional settings. Here we will look at the credible intervals generated using the method of @Stephens2016, which is implemented in the ashr package. This method assumes that $\theta_i$ are drawn from a unimodal distribution and that $Z_i \sim N(\theta_i, \sigma_i)$ where $\sigma_i$ is known. These assumptions both hold in this case so ashr does well. Unlike the selection adjusted methods, ashr also has an RCC close to the nominal level at every rank.

library(ashr)
ash.res <- ash(betahat = Z, sebetahat = rep(1, 1000), mixcompdist = "normal")
ci.ash <- ashci(ash.res, level=0.9, betaindex = 1:1000, trace=FALSE)
sum(ci.ash[,1]<= theta & ci.ash[,2] >= theta)/1000

Here are the ashr intervals at all ranks

rccSims::plot_cis(rank, ci.ash, theta) + ggtitle("ashr Credible Intervals (Stephens 2016)")

and for just the parameters with estimates in the top 20\%

rccSims::plot_cis(rank, ci.ash, theta, prop=0.2, plot.truth=TRUE) + ggtitle("ashr Credible Intervals (Stephens 2016)")

We can look at the average coverage of the ashr credible intervals over 100 simulations. It is worth noting that we have conducted the simulations from more of a frequentist point of view --- the parameters were fixed at the beginning and in each simulation we simply generate new statistics. From a Bayesian perspective, it would be more appropriate to generate a new parameter vector from the prior each time. This might explain why we get overall slight undercoverage from the ashr intervals in this setting. ashr also takes substantially longer to run than the parametric bootstrap (on a normal laptop, the code below took nearly 7 minutes while the parametric bootstrap took only 30 seconds.) In this example, ashr does a good job controlling the RCC for the top parameters. This is, in part, because the true parameters are sparse. We found in the paper that when the parameters are not sparse, performance is much worse.

system.time(ash.coverage <- apply(z_many, MARGIN=2, FUN=function(zz){
  j <- order(abs(zz), decreasing = TRUE)
  ash.res <- ash(betahat = zz, sebetahat = rep(1, 1000), mixcompdist = "normal")
  ci <- ashci(ash.res, level=0.9, betaindex = 1:1000, trace=FALSE)
  covered <- (ci[,1] <= theta & ci[,2] >= theta)[j]
  return(covered)
}))
summary(colMeans(ash.coverage))

dat <- data.frame("Rank"=1:1000, "Coverage"=rowMeans(ash.coverage))
ggplot(dat) + geom_point(aes(x=Rank, y=Coverage)) + ylab("Average Coverage")  + 
  geom_hline(yintercept=0.9) + 
  ggtitle("Average Coverage of ashr Credible Intervals") + 
  theme_bw() + theme(panel.grid=element_blank())

Generating simulation results in the paper

All of the interval construction methods described in the previous section (except for the @Benjamini2005 intervals) are included in the example_sim function in the rccSims package. In Section 1.5, we look at four sets of true parameters:

All the parameters are equal to zero
All the parameters were genreated from a $N(0, 1)$ distribution
900 of the parameters are equal to 0 and 100 are equal to 3
900 of the parameters are equal to 0 and 100 are drawn from a $N(0, 1)$ distribution

First we generate the four vectors of parameters

example_params <- cbind(rep(0, 1000), 
                        rnorm(n=1000), 
                        rep(c(0, 3), c(900, 100)), 
                        c(rep(0, 900), rnorm(100)))
titles <- c("All Zero", "All N(0, 1)", "100 Effects=3", "100 Effects N(0, 1)")

This matrix is also included as a builtin data set in the rccSims package. We generate simulation results using the example_sim function. This function just repeatedly executes the steps in the previous sections and records the coverage and width of the intervals.

set.seed(6587900)
sim.list <- list()
for(i in 1:4){
  sim.list[[i]] <- example_sim(example_params[,i], n=100, use.abs=TRUE)
}

These results are included as a built-in data set to the rccSims package. The plot_coverage and plot_width functions in the rccSims package create plots of the results.

data("sim.list", package="rccSims")
covplots <- list()
widthplots <- list()
titles <- c("All Zero", "All N(0, 1)", "100 Effects=3", "100 Effects N(0, 1)")
for(i in 1:4){
  lp <- "none"
  covplots[[i]] <- plot_coverage(sim.list[[i]], proportion=0.2,
        cols=c("black",  "deeppink3",  "blue", "gold4", "forestgreen", "purple"),
        simnames=c("naive",  "par",    "oracle", "ash", "wfb", "selInf1"),
        ltys= c(2, 1, 3, 6, 4, 2), span=0.5, main=titles[i], y.range=c(-0.02, 1.02),
        legend.position = lp) + theme(plot.title=element_text(hjust=0.5))
  widthplots[[i]] <- plot_width(sim.list[[i]], proportion=0.2,
        cols=c("black",  "deeppink3",  "blue", "gold4", "forestgreen", "purple"),
        simnames=c("naive",  "par", "oracle", "ash", "wfb", "selInf1"),
        ltys= c(2, 1, 3, 6, 4, 2), span=0.5, main=titles[i],
        legend.position = lp)+ theme(plot.title=element_text(hjust=0.5))
}
legend <- rccSims:::make_sim_legend(legend.names = c("Marginal", "Parametric\nBootstrap", 
                                                 "Oracle", "ash", "WFB", "RTT"), 
              cols=c("black",  "deeppink3",  "blue", "gold4", "forestgreen", "purple"),
              ltys= c(2, 1, 3, 6, 4, 2))

covplots[[1]]
widthplots[[1]]

covplots[[2]]
widthplots[[2]]

covplots[[3]]
widthplots[[3]]

covplots[[4]]
widthplots[[4]]

library(gridExtra)
h1 <- grid.arrange(covplots[[1]], covplots[[2]], covplots[[3]], covplots[[4]], ncol=4)
h2 <- grid.arrange(widthplots[[1]], widthplots[[2]], widthplots[[3]], widthplots[[4]], ncol=4)
ggsave(h1, file="~/Dropbox/Confidence_Intervals/for_jcgs/img/example_cov.png", heigh=4, width=16, units="in", dpi=300)
ggsave(h2, file="~/Dropbox/Confidence_Intervals/for_jcgs/img/example_width.png", heigh=4, width=16, units="in", dpi=300)
ggsave(legend$plot, file="~/Dropbox/Confidence_Intervals/for_jcgs/img/example_legend.png", height=1, width=8, units="in", dpi=300)