knitr::opts_chunk$set(fig.width=2*3, fig.height=1.5*2, fig.align="center", fig.path='Figs/',echo=T, warning=F, message=F, cache=T, results='hold');
# Packages
library(BinaryModels);
C1 = require(foreach);
C2 = require(ggplot2) & require(plyr);

Probit Model

Data for $n=10^{3}$ subjects were simulated from the model:

$$ \eta_{i} = x_{i}'\beta \\ z_{i} \sim N(\eta_{i},1) \\ y_{i} = I[z_{i}>0] $$

Only the binary outcomes $y_{i}$, and not the latent variables $z_{i}$, are observed. The linear predictor $\eta_{i}$ depends on three independent covariates $(x_{1,i},x_{2,i},x_{3,i})$, drawn from the normal, $t_{3}$, and $\chi_{2}^{2}$ distributions, respectively. In each case, the distribution was centered and scaled to have mean zero and unit variance. The regression coefficient $\beta$ was chosen such that the approximate power of a Wald test to reject $H_{0}:\beta_{j}=0$ is $50\%$.
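To see why this calibration gives $50\%$ power, note that if the Wald statistic $\sqrt{n}\hat{\beta}_{j}$ is approximately $N(\sqrt{n}\beta_{j},1)$ (i.e. unit per-coefficient information, an assumption of this sketch), then at $\beta_{j}=z_{0.975}/\sqrt{n}$ the rejection probability is $\Phi(0)\approx 0.5$:

# Sanity check of the power calibration (sketch; assumes unit
# per-coefficient information).
n = 1e3;
gamma = qnorm(0.975)/sqrt(n);
# Two-sided rejection probability at beta_j = gamma
power = pnorm(sqrt(n)*gamma-qnorm(0.975))+pnorm(-sqrt(n)*gamma-qnorm(0.975));
round(power,3);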

The function fitProbit uses the parameter-expanded EM algorithm to estimate $\beta$, then attempts to refine the estimate using IRLS.
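fitProbit's internals are not reproduced here, but a minimal sketch of a plain (non-expanded) EM iteration for the probit model shows the structure: the E-step imputes each latent $z_{i}$ by its truncated-normal conditional mean given $y_{i}$, and the M-step is ordinary least squares of the imputed values on $X$. The function name emProbit is hypothetical.

# Sketch of plain EM for probit regression (illustration only: fitProbit's
# parameter expansion and IRLS refinement are not reproduced here).
emProbit = function(y,X,maxit=100,tol=1e-8){
  b = rep(0,ncol(X));
  XtXi = solve(crossprod(X));
  for(i in 1:maxit){
    eta = as.numeric(X %*% b);
    # E-step: mean of z ~ N(eta,1) truncated to z>0 if y=1, to z<=0 if y=0
    zhat = ifelse(y==1,eta+dnorm(eta)/pnorm(eta),eta-dnorm(eta)/pnorm(-eta));
    # M-step: ordinary least squares of the imputed z on X
    b1 = as.numeric(XtXi %*% crossprod(X,zhat));
    if(max(abs(b1-b))<tol){b = b1; break;};
    b = b1;
  }
  return(b);
}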

Simulation

For each of $R=10^{3}$ Monte Carlo data sets, the regression coefficient $\beta$ and its variance were estimated using fitProbit.

# Observations
n = 1e3;
# MC replicates
R = 1e3;
# Regression Coefficients
gamma = (qnorm(0.975))/sqrt(n);
beta = gamma*c(1,1,1);
# Simulation
set.seed(200);
Sim = foreach(r=1:R,.combine=rbind) %do% {
  # Draw covariates
  x1 = rnorm(n);
  x2 = rt(n,df=3)/sqrt(3);
  x3 = (rchisq(n,df=2)-2)/2;
  X = cbind(x1,x2,x3);
  # Linear predictor
  eta = X %*% beta;
  # Outcome
  y = simProbit(eta);
  # Fit probit model
  M = fitProbit(y=y,X=X);
  # Coefficients
  b = M$Beta;
  names(b) = paste0("b",seq(from=0,to=3));
  # Variance estimates
  v = diag(M$Var);
  names(v) = paste0("v",seq(from=0,to=3));
  # Output
  Out = c(b,v);
  return(Out);
}
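The loop above runs serially via %do%. Since foreach is already loaded, the replicates can be distributed across cores by registering a parallel backend and switching %do% to %dopar% (a sketch, assuming the doParallel package is installed):

# Optional: parallel execution of the Monte Carlo loop.
library(doParallel);
registerDoParallel(cores=4);
# Then replace %do% with %dopar% in the foreach call above, adding
# .packages="BinaryModels" so the workers can see simProbit and fitProbit.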

Estimation Bias

Estimation of the regression coefficients is unbiased: for each coefficient, the confidence interval for the Monte Carlo bias covers zero.

library(ggplot2);
# Assess bias
bhat = plyr::aaply(Sim[,1:4],.margins=2,.fun=mean);
bias = bhat - c(0,beta);
# Confidence intervals: +/- 2 standard errors of the Monte Carlo mean
v = plyr::aaply(Sim[,1:4],.margins=2,.fun=var);
L = bias - 2*sqrt(v/R);
U = bias + 2*sqrt(v/R);
# Plotting frame
df = data.frame("Coord"=colnames(Sim[,1:4]),"Bias"=bias,"L"=L,"U"=U);
q = ggplot(data=df) + theme_bw() + geom_linerange(aes(x=Coord,ymin=L,ymax=U)) + 
  geom_point(aes(x=Coord,y=Bias),color="royalblue");
q = q + labs("x"="Coefficient","y"="Bias") + coord_flip() + 
  ggtitle("Bias in Coefficient Estimation");
print(q);

Variance Accuracy

The variance $\sigma_{j}^{2}$ of $\beta_{j}$ was estimated using component $(I^{-1})_{jj}$ of the inverse Fisher information matrix. To assess the accuracy of this estimate, the mean of $\sigma_{j}^{2}$ across MC replicates was compared with the empirical variance of the point estimates:

$$ \rho_{j} = \frac{\text{Mean}_{r}\left(\sigma_{j}^{2,(r)}\right)}{\text{Var}_{r}\left(\beta_{j}^{(r)}\right)} $$

Confidence intervals for $\rho_{j}$ were constructed using its asymptotic variance on the log scale. Estimation of the variance of $\beta_{j}$ is accurate insofar as the confidence interval for each $\rho_{j}$ includes one.
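Concretely, writing $\bar{\beta}_{j}$ and $\bar{\sigma}_{j}^{2}$ for the Monte Carlo means of the point and variance estimates, the delta method (a sketch of the derivation, assuming the $R$ replicates are independent) gives:

$$ \text{Var}\left(\log\hat{\rho}_{j}\right) \approx \frac{\text{Mean}_{r}\left[\left(\sigma_{j}^{2,(r)}-\rho_{j}\left(\beta_{j}^{(r)}-\bar{\beta}_{j}\right)^{2}\right)^{2}\right]}{R\,\bar{\sigma}_{j}^{4}} $$

The function vLMVR below implements this formula.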

#' Function to calculate the variance of the log mean-variance ratio
#' @param a Point estimates of the parameter
#' @param b Variance estimates
vLMVR = function(a,b){
  # Monte Carlo replicates
  R = length(a);
  # Ratio of mean estimated variance to empirical variance
  rho = mean(b)/var(a);
  # Numerator: mean squared deviation between the variance estimates and
  # the rescaled squared deviations of the point estimates
  p = mean((rho*(a-mean(a))^2-b)^2);
  # Mean variance estimate
  q = mean(b);
  # Delta-method variance of log(rho)
  return(p/(R*q^2));
}
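# Sanity check of vLMVR (a sketch): point estimates ~ N(0,1) paired with
# variance estimates that are unbiased for 1, so the true ratio is rho = 1
# and the 95% CI for rho should typically cover one.
a0 = rnorm(1e3);
b0 = rchisq(1e3,df=10)/10;
tau0 = log(mean(b0)/var(a0));
se0 = sqrt(vLMVR(a=a0,b=b0));
exp(c("L"=tau0-2*se0,"U"=tau0+2*se0));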
# Mean variance estimate
est.v = plyr::aaply(Sim[,5:8],.margins=2,.fun=mean);
# Log ratio of estimated to empirical variance
tau = log(est.v/v);
# Variance estimates
tau.v = c(vLMVR(a=Sim[,1],b=Sim[,5]),vLMVR(a=Sim[,2],b=Sim[,6]),
          vLMVR(a=Sim[,3],b=Sim[,7]),vLMVR(a=Sim[,4],b=Sim[,8]));
# Confidence intervals
L = tau - 2*sqrt(tau.v);
U = tau + 2*sqrt(tau.v);
# Plotting frame
df = data.frame("Coord"=colnames(Sim[,1:4]),"Rho"=exp(tau),"L"=exp(L),"U"=exp(U));
q = ggplot(data=df) + theme_bw() + geom_linerange(aes(x=Coord,ymin=L,ymax=U)) + 
  geom_point(aes(x=Coord,y=Rho),color="coral");
q = q + geom_hline(yintercept=1,color="gray",linetype="dashed");
q = q + labs("x"="Coefficient","y"="Rho") + coord_flip() + 
  ggtitle("Ratio of Estimated to Empirical Variance");
print(q);

Robit Model

Data for $n=10^{3}$ subjects were simulated from the model:

$$ \tau_{i} \sim \Gamma(\nu/2,\nu/2) \\ z_{i} \sim N(\eta_{i},1/\tau_{i}) \\ y_{i} = I(z_{i}>0) $$

The linear predictor $\eta_{i}$ was constructed as for the probit model. In the robit model, the latent variables $z_{i}$ marginally follow a $t_{\nu}$ distribution; the simulation below takes $\nu=7$.
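simRobit is provided by BinaryModels; for reference, a minimal sketch of the same data-generating process via its scale-mixture representation (the name simRobitSketch is hypothetical):

# Minimal sketch of the robit data-generating process (illustration only;
# the simulation below uses the package's simRobit).
simRobitSketch = function(eta,nu){
  n = length(eta);
  # Latent precisions
  tau = rgamma(n,shape=nu/2,rate=nu/2);
  # Given tau, z ~ N(eta,1/tau); marginally, z - eta ~ t with nu df
  z = rnorm(n,mean=eta,sd=1/sqrt(tau));
  # Threshold to obtain the binary outcome
  return(as.numeric(z>0));
}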

The function fitRobit uses the parameter-expanded EM algorithm to estimate $\beta$.

Simulation

For each of $R=10^{3}$ Monte Carlo data sets, the regression coefficient $\beta$ and its variance were estimated using fitRobit, allowing for up to $10$ iterations of EM.

# Simulation
set.seed(201);
rSim = foreach(r=1:R,.combine=rbind) %do% {
  # Draw covariates
  x1 = rnorm(n);
  x2 = rt(n,df=3)/sqrt(3);
  x3 = (rchisq(n,df=2)-2)/2;
  X = cbind(x1,x2,x3);
  # Linear predictor
  eta = X %*% beta;
  # Outcome
  y = simRobit(eta=eta,nu=7);
  # Fit robit model
  M = fitRobit(y=y,X=X,m1=10);
  # Coefficients
  b = M$Beta;
  names(b) = paste0("b",seq(from=0,to=3));
  # Variance estimates
  v = diag(M$Var);
  names(v) = paste0("v",seq(from=0,to=3));
  # Output
  Out = c(b,v);
  return(Out);
}

Estimation Bias

Estimation of the regression coefficients is unbiased: for each coefficient, the confidence interval for the Monte Carlo bias covers zero.

library(ggplot2);
# Assess bias
bhat = plyr::aaply(rSim[,1:4],.margins=2,.fun=mean);
bias = bhat - c(0,beta);
# Confidence intervals: +/- 2 standard errors of the Monte Carlo mean
v = plyr::aaply(rSim[,1:4],.margins=2,.fun=var);
L = bias - 2*sqrt(v/R);
U = bias + 2*sqrt(v/R);
# Plotting frame
df = data.frame("Coord"=colnames(rSim[,1:4]),"Bias"=bias,"L"=L,"U"=U);
q = ggplot(data=df) + theme_bw() + geom_linerange(aes(x=Coord,ymin=L,ymax=U)) + 
  geom_point(aes(x=Coord,y=Bias),color="royalblue");
q = q + labs("x"="Coefficient","y"="Bias") + coord_flip() + 
  ggtitle("Bias in Coefficient Estimation");
print(q);

Variance Accuracy

As for the probit model, the mean of the estimated variances of each $\beta_{j}$ across MC replicates was compared with the empirical variance of the point estimates via the ratio $\rho_{j}$. Estimation of the variance of $\beta_{j}$ is accurate insofar as the confidence interval for each $\rho_{j}$ includes one.

# Mean variance estimate
est.v = plyr::aaply(rSim[,5:8],.margins=2,.fun=mean);
# Log ratio of estimated to empirical variance
tau = log(est.v/v);
# Variance estimates
tau.v = c(vLMVR(a=rSim[,1],b=rSim[,5]),vLMVR(a=rSim[,2],b=rSim[,6]),
          vLMVR(a=rSim[,3],b=rSim[,7]),vLMVR(a=rSim[,4],b=rSim[,8]));
# Confidence intervals
L = tau - 2*sqrt(tau.v);
U = tau + 2*sqrt(tau.v);
# Plotting frame
df = data.frame("Coord"=colnames(rSim[,1:4]),"Rho"=exp(tau),"L"=exp(L),"U"=exp(U));
q = ggplot(data=df) + theme_bw() + geom_linerange(aes(x=Coord,ymin=L,ymax=U)) + 
  geom_point(aes(x=Coord,y=Rho),color="coral");
q = q + geom_hline(yintercept=1,color="gray",linetype="dashed");
q = q + labs("x"="Coefficient","y"="Rho") + coord_flip() + 
  ggtitle("Ratio of Estimated to Empirical Variance");
print(q);

