# In zrmacc/BinaryModels: Binary Regression Models

```{r setup, include=FALSE}
knitr::opts_chunk$set(fig.width=2*3, fig.height=1.5*2, fig.align="center",
  fig.path="Figs/", echo=TRUE, warning=FALSE, message=FALSE, cache=TRUE,
  results="hold");
```

## Packages

```{r packages}
library(BinaryModels);
C1 = require(foreach);
C2 = require(ggplot2) & require(plyr);
```

## Contents

## Probit Model

Data for $n=10^{3}$ subjects were simulated from the model:

$$\eta_{i} = x_{i}'\beta \\ z_{i} \sim N(\eta_{i},1) \\ y_{i} = I[z_{i}>0]$$

Only the binary outcomes $y_{i}$, and not the latent variables $z_{i}$, are observed. The linear predictor $\eta_{i}$ depends on three independent covariates $(x_{1,i},x_{2,i},x_{3,i})$, drawn from the normal, $t_{3}$, and $\chi_{2}^{2}$ distributions. In each case, the distribution was centered and scaled to have mean zero and unit variance. The regression coefficient $\beta$ was chosen s.t. the approximate power of a Wald test to reject $H_{0}:\beta_{j}=0$ is $50\%$. The function `fitProbit` uses the parameter-expanded EM algorithm to estimate $\beta$, then attempts to refine the estimate using IRLS.

#### Simulation

For each of $R=10^{3}$ Monte Carlo data sets, the regression coefficient $\beta$ and its variance were estimated using `fitProbit`.

```{r probit-sim}
# Observations
n = 1e3;
# MC replicates
R = 1e3;
# Regression coefficients
gamma = (qnorm(0.975))/sqrt(n);
beta = gamma*c(1,1,1);
# Simulation
set.seed(200);
Sim = foreach(r=1:R,.combine=rbind) %do% {
  # Draw covariates
  x1 = rnorm(n);
  x2 = rt(n,df=3)/sqrt(3);
  x3 = (rchisq(n,df=2)-2)/2;
  X = cbind(x1,x2,x3);
  # Linear predictor
  eta = X %*% beta;
  # Outcome
  y = simProbit(eta);
  # Fit probit model
  M = fitProbit(y=y,X=X);
  # Coefficients
  b = M$Beta;
  names(b) = paste0("b",seq(from=0,to=3));
  # Variance estimates
  v = diag(M$Var);
  names(v) = paste0("v",seq(from=0,to=3));
  # Output
  Out = c(b,v);
  return(Out);
}
```

#### Estimation Bias

Estimation of the regression coefficients is unbiased.

```{r probit-bias}
library(ggplot2);
# Assess bias
bhat = plyr::aaply(Sim[,1:4],.margins=2,.fun=mean);
bias = bhat - c(0,beta);
# Confidence intervals
v = plyr::aaply(Sim[,1:4],.margins=2,.fun=var);
L = bias - 2*sqrt(v);
U = bias + 2*sqrt(v);
# Plotting frame
df = data.frame("Coord"=colnames(Sim[,1:4]),"Bias"=bias,"L"=L,"U"=U);
q = ggplot(data=df) + theme_bw() +
  geom_linerange(aes(x=Coord,ymin=L,ymax=U)) +
  geom_point(aes(x=Coord,y=Bias),color="royalblue");
q = q + labs("x"="Coefficient","y"="Bias") + coord_flip() +
  ggtitle("Bias in Coefficient Estimation");
print(q);
```

#### Variance Accuracy

The variance $\sigma_{j}^{2}$ of $\beta_{j}$ was estimated using component $I_{jj}$ of the Fisher information matrix. To assess the accuracy of this estimate, the mean of $\sigma_{j}^{2}$ across MC replicates was compared with the empirical variance of the point estimates:

$$\rho_{j} = \frac{\text{Mean}_{r}\left(\sigma_{j}^{2,(r)}\right)}{\text{Var}_{r}\left(\beta_{j}^{(r)}\right)}$$

Confidence intervals for $\rho_{j}$ were constructed using its asymptotic variance on the log scale. Estimation of the variance in $\beta_{j}$ is accurate in that the confidence interval for each $\rho_{j}$ includes one.
```{r vlmvr}
#' Function to calculate variance of log mean-variance ratio
#' @param a Point parameter estimates
#' @param b Variance estimates
vLMVR = function(a,b){
  # Rho
  rho = mean(b)/var(a);
  # Numerator
  p = mean((rho*(a-mean(a))^2-b)^2);
  # Denominator
  q = mean(b);
  # Output
  return(p/q);
}
```

```{r probit-variance}
# Mean variance estimate
est.v = plyr::aaply(Sim[,5:8],.margins=2,.fun=mean);
# Log ratio of estimated to empirical variance
tau = log(est.v/v);
# Variance estimates
tau.v = c(vLMVR(a=Sim[,1],b=Sim[,5]),vLMVR(a=Sim[,2],b=Sim[,6]),
          vLMVR(a=Sim[,3],b=Sim[,7]),vLMVR(a=Sim[,4],b=Sim[,8]));
# Confidence intervals
L = tau - 2*sqrt(tau.v);
U = tau + 2*sqrt(tau.v);
# Plotting frame
df = data.frame("Coord"=colnames(Sim[,1:4]),"Rho"=exp(tau),"L"=exp(L),"U"=exp(U));
q = ggplot(data=df) + theme_bw() +
  geom_linerange(aes(x=Coord,ymin=L,ymax=U)) +
  geom_point(aes(x=Coord,y=Rho),color="coral");
q = q + geom_hline(yintercept=1,color="gray",linetype="dashed");
q = q + labs("x"="Coefficient","y"="Rho") + coord_flip() +
  ggtitle("Ratio of Estimated to Empirical Variance");
print(q);
```

## Robit Model

Data for $n=10^{3}$ subjects were simulated from the model:

$$\tau_{i} \sim \Gamma(\nu/2,\nu/2) \\ z_{i} \sim N(\eta_{i},1/\tau_{i}) \\ y_{i} = I(z_{i}>0)$$

The linear predictor $\eta_{i}$ was constructed as for the probit model. In the robit model, the latent $z_{i}$ follow a marginal $t_{\nu}$ distribution. The function `fitRobit` uses the parameter-expanded EM algorithm to estimate $\beta$.

#### Simulation

For each of $R=10^{3}$ Monte Carlo data sets, the regression coefficient $\beta$ and its variance were estimated using `fitRobit`, allowing for up to $10$ iterations of EM.

```{r robit-sim}
# Simulation
set.seed(201);
rSim = foreach(r=1:R,.combine=rbind) %do% {
  # Draw covariates
  x1 = rnorm(n);
  x2 = rt(n,df=3)/sqrt(3);
  x3 = (rchisq(n,df=2)-2)/2;
  X = cbind(x1,x2,x3);
  # Linear predictor
  eta = X %*% beta;
  # Outcome
  y = simRobit(eta=eta,nu=7);
  # Fit robit model
  M = fitRobit(y=y,X=X,m1=10);
  # Coefficients
  b = M$Beta;
  names(b) = paste0("b",seq(from=0,to=3));
  # Variance estimates
  v = diag(M$Var);
  names(v) = paste0("v",seq(from=0,to=3));
  # Output
  Out = c(b,v);
  return(Out);
}
```

#### Estimation Bias

Estimation of the regression coefficients is unbiased.

```{r robit-bias}
library(ggplot2);
# Assess bias
bhat = plyr::aaply(rSim[,1:4],.margins=2,.fun=mean);
bias = bhat - c(0,beta);
# Confidence intervals
v = plyr::aaply(rSim[,1:4],.margins=2,.fun=var);
L = bias - 2*sqrt(v);
U = bias + 2*sqrt(v);
# Plotting frame
df = data.frame("Coord"=colnames(rSim[,1:4]),"Bias"=bias,"L"=L,"U"=U);
q = ggplot(data=df) + theme_bw() +
  geom_linerange(aes(x=Coord,ymin=L,ymax=U)) +
  geom_point(aes(x=Coord,y=Bias),color="royalblue");
q = q + labs("x"="Coefficient","y"="Bias") + coord_flip() +
  ggtitle("Bias in Coefficient Estimation");
print(q);
```

#### Variance Accuracy

For each coefficient $\beta_{j}$, the ratio $\rho_{j}$ of the mean estimated variance, across MC replicates, was compared with the empirical variance of the point estimates. Estimation of the variance in $\beta_{j}$ is accurate in so far as the confidence interval for each $\rho_{j}$ includes one.

# Variance accuracy for the robit fit: compares the mean estimated
# variance of each coefficient (columns v0..v3 of rSim) against the
# empirical variance of the point estimates (columns b0..b3).
# Requires `rSim`, the empirical variances `v`, and `vLMVR` from the
# preceding chunks.
# Mean variance estimate, averaged across MC replicates
est.v = plyr::aaply(rSim[,5:8],.margins=2,.fun=mean);
# Log ratio of estimated to empirical variance
tau = log(est.v/v);
# Asymptotic variance of each log ratio (vLMVR defined above)
tau.v = c(vLMVR(a=rSim[,1],b=rSim[,5]),vLMVR(a=rSim[,2],b=rSim[,6]),
vLMVR(a=rSim[,3],b=rSim[,7]),vLMVR(a=rSim[,4],b=rSim[,8]));
# Approximate 95% confidence intervals on the log scale
L = tau - 2*sqrt(tau.v);
U = tau + 2*sqrt(tau.v);
# Plotting frame; use rSim (not the probit matrix Sim) so this chunk is
# self-contained within the robit section — previously `colnames(Sim[,1:4])`
# referenced the probit simulation and would fail if that chunk had not run
df = data.frame("Coord"=colnames(rSim[,1:4]),"Rho"=exp(tau),"L"=exp(L),"U"=exp(U));
# Point estimates of rho with CI whiskers, flipped for readability
q = ggplot(data=df) + theme_bw() + geom_linerange(aes(x=Coord,ymin=L,ymax=U)) +
geom_point(aes(x=Coord,y=Rho),color="coral");
# Reference line at rho = 1 (estimated variance matches empirical variance)
q = q + geom_hline(yintercept=1,color="gray",linetype="dashed");
q = q + labs("x"="Coefficient","y"="Rho") + coord_flip() +
ggtitle("Ratio of Estimated to Empirical Variance");
print(q);


zrmacc/BinaryModels documentation built on Jan. 14, 2018, 9:09 a.m.