In pdhoff/FABInference: FAB p-Values and Confidence Intervals

Summary

This is a simulation study in which adaptive FAB $p$-values are constructed for logistic regression coefficients. This document serves as the replication code for the example in Section 4.3 of the article "Smaller $p$-values via indirect information" (Hoff 2019).

Linking model

The linking model we consider here is that the logistic regression coefficients $\theta_1,\ldots, \theta_p$ are i.i.d. from a mixture of a point-mass distribution at zero and a normal distribution with some mean and variance.

Here is a function that estimates the parameters in the linking model. More generally, it obtains MLEs of $\mu$ and $\tau^2$ under the model [ \begin{align} y & \sim N_p( \theta , \sigma^2 I) \ \theta_1,\ldots, \theta_p &\sim i.i.d\ (1-\pi) \delta_0(\theta) + \text{dnorm}(\theta, \mu,\tau) \end{align} ]

fitMM<-function(y,s2){

  obj<-function(gam){
    pi<-gam[1] ; mu<-gam[2] ; t2<-gam[3]
    -sum( log( pi*dnorm(y,mu,sqrt(t2+s2)) + (1-pi)*dnorm(y,0,sqrt(s2))) )
  }

  mut2<-optim( c(.9,mean(y),var(y)),obj,method="L-BFGS-B",
               lower=c(.001,-Inf,.001),upper=c(.999,Inf,Inf))$par[2:3]
  names(mut2)<-c("mu","t2")
  mut2
}

For this example, for each $\theta_j$ we will plug-in an estimate $\hat\theta_{-j}$ of the vector $\theta_{-j}$ using data that is independent (asymptotically) of $\hat\theta_j$.

Here is the simulation:

set.seed(1)
p<-30
theta0<-rep(c(3,0),times=c(p/2,p/2))
nvals<-c(200,400,800,1600) 
nsim<-5000

histW0<-histF0<-histW1<-histF1<-list() 
sigTN<-sigNN<-NULL

par(mfrow=c(4,3),mar=c(3,3,1,1),mgp=c(1.75,.75,0)) 

for(k in 1:length(nvals)){

  n<-nvals[k] 

  PF<-PW<-NULL
  for(s in 1:nsim){ 

    X<-matrix(rnorm(n*p),n,p) 
    lp<-X%*%theta0/sqrt(n)
    y<-rbinom(n,1,1/(1+exp(-lp))) 
    fit<-glm(y~-1+X,family="binomial")

    theta<-fit$coef
    Sigma<-summary(fit)$cov.scaled
    s2<-diag(Sigma)  

    MUT2<-NULL
    for(j in 1:p){
      G<-MASS::Null(Sigma[,j])
      MUT2<-rbind(MUT2,fitMM(  (G%*%(t(G)%*%theta))[-j],s2[-j]) )
    }

    z<-theta/sqrt(s2)
    pW<-1-abs( pnorm(z)-pnorm(-z))
    pF<-1-abs( pnorm(z+2*sqrt(s2)*MUT2[,1]/MUT2[,2]) - pnorm(-z) ) 
    PF<-rbind(PF,pF) ; PW<-rbind(PW,pW)   
  } 

  histW0[[k]]<-hist(PW[,theta0==0],plot=FALSE) 
  histW1[[k]]<-hist(PW[,theta0!=0],plot=FALSE)  
  histF0[[k]]<-hist(PF[,theta0==0],plot=FALSE) 
  histF1[[k]]<-hist(PF[,theta0!=0],plot=FALSE)   

  hist(PF[,theta0!=0],col=rgb(.3,.3,.3,.5),prob=TRUE) 
  hist(PW[,theta0!=0],col=rgb(.8,.2,.5,.5),add=TRUE,prob=TRUE) 

  hist(PF[,theta0==0]) 
  hist(PW[,theta0==0]) 

 pW<-mean(PW[,theta0==0]<.05) ; pF<-mean(PF[,theta0==0]<.05) 
 sigTN<-cbind(sigTN,c(pW,pF))  

 pW<-mean(PW[,theta0!=0]<.05) ; pF<-mean(PF[,theta0!=0]<.05) 
 sigNN<-cbind(sigNN,c(pW,pF))  

}

Here we save the results:

colnames(sigTN)<-colnames(sigNN)<-nvals
rownames(sigTN)<-rownames(sigNN)<-c("pW","pF")
names(histW0)<-names(histW1)<-nvals
names(histF0)<-names(histF1)<-nvals  
save(sigTN,sigNN,histW0,histW1,histF0,histF1,file="resultsLogistic.RData")

Here are the fractions of true-null $p$-values below 0.05:

sigTN

Here are the fractions of non-null $p$-values below 0.05:

sigNN

Here are the plots from the article:

par(mfrow=c(4,3),mar=c(2.75,3.2,0,1),mgp=c(1.75,.75,0))
for(k in 1:4){

  xlab<-1*(k==4) + 1

  plot(histF1[[k]],col=rgb(.1,.4,.6,.5),main="",freq=FALSE,
       xlab=c("","non-null p-values")[xlab],ylab="",xaxt=c("n","s")[xlab] )
  plot(histW1[[k]],col=rgb(.9,.3,.3,.5),add=TRUE,freq=FALSE)
  mtext(paste0("n=",names(histF1)[k]),2,line=2,cex=.85)

  if(k==1){
    legend(.5,7,legend=c("Wald","FAB"),pch=15,col=c(rgb(.9,.3,.3,.5),rgb(.1,.4,.6,.5)),bty="n")
  }

  plot(histW0[[k]],col="gray",main="",freq=FALSE,
    xlab=c("","null Wald p-values")[xlab],ylab="",xaxt=c("n","s")[xlab])

  plot(histF0[[k]],col="gray",main="",freq=FALSE,
   xlab=c("","null FAB p-values")[xlab],ylab="",xaxt=c("n","s")[xlab])
}