#' Bayesian regression with missing covariates
#'
#' \code{bayesmiss} generates JAGS model code and an R script to perform
#' Bayesian regression, allowing for missingness in covariates.
#'
#' \code{bayesmiss} faciliates running Bayesian regression models in which there
#' are missing values in some of the covariates. The function generates two files,
#' one a JAGS model file, and one a R script file. The JAGS model defines the
#' user's substantive model, and models as required to handle the missing covariates,
#' and any auxiliary variables if present. The R script file generated contains
#' commands to generate the required JAGS data and parameter objects, and code
#' required to fit the model. Note that \code{bayesmiss} does not actually run this
#' code. Instead, the user is advised to check the model specification and R
#' script to ensure it is as desired. The R code can then be run to fit the model.
#'
#' The method argument specifies what model type to use for each variable.
#' Currently the possible values are:
#' \itemize{
#' \item \code{"norm"} (normal linear model)
#' \item \code{"logit"} (logistic regression)
#' \item \code{"mlogit"} (multinomial logistic regression)
#' \item \code{"ologit"} (ordinal logistic regression)
#' \item \code{"pois"} (Poisson regression)
#' }
#' The element corresponding to the outcome of the
#' substantive model should be specified as desired according to the desired
#' type of substantive model. A model type must be specified for all auxiliary
#' variables. Entries corresponding to variables which are covariates in the
#' substantive model and which are fully observed should be specified as "".
#' Variables modelled using mlogit or ologit should be stored as numeric
#' (not factors), and coded 1:K, where K is the number of categories.
#'
#' The \code{order} argument is used to specify the order in which the joint
#' model is factorized. Elements corresponding to variables which are covariates
#' in the substantive model should be specified as 0. Substantive model covariates
#' which have missing values should come next. The substantive model outcome variable
#' should come next, followed by any auxiliary variables.
#'
#' Note that the MCMC options in the call to \code{jags} are just suggested defaults.
#' It is up to the user to ensure, via the usual diagnostics for MCMC, that a
#' sufficient number of iterations have been run to ensure convergence of the chains.
#' In particular, the code generated by \code{bayesmiss} does not specify initial
#' values for parameters. To assess convergence general advice is to choose different
#' overdispersed initial values for each chain.
#'
#' If it is desired to add interactions or non-linear covariate effects, first
#' run \code{bayesmiss} omitting these terms, and then modify the JAGS code file
#' and R code specifying the priors as needed.
#'
#' @param originaldata the data frame upon which the analysis is to be performed.
#'
#' @param smoutcome the name of the column corresponding to the outcome variable
#' of the substantive model.
#'
#' @param method a vector of strings specifying the model type to use for each of
#' the columns in \code{originaldata}.
#'
#' @param order a vector specifying the order in which the joint model should
#' be constructed. e.g. c(0,0,1,2) specifies that the joint model be factorized
#' as f(v3|v1,v2)f(v4|v1,v2,v3).
#'
#' @return \code{bayesmiss} generates two files in the current working directory:
#' \code{bayesmissmod.bug} is a JAGS model file for the constructed model, and \code{bayesmissRscript.r}
#' is an R file containing R code for generating the required JAGS parameters and data
#' objects, and a call to the \code{jags} function of the \code{R2jags} package for
#' fitting the model.
#'
#' @export
bayesmiss <- function(originaldata,smoutcome,method,order,factorcov=TRUE) {
n <- dim(originaldata)[1]
#create matrix of response indicators
r <- 1*(is.na(originaldata)==0)
if (ncol(originaldata)!=length(order)) stop("Argument to order must have the same length as the number of columns in the data frame.")
if (ncol(originaldata)!=length(method)) stop("Argument to method must have the same length as the number of columns in the data frame.")
smoutcomecol <- which(colnames(originaldata)==smoutcome)
if (length(smoutcomecol)==0) stop(paste("No variable found with name: ",smoutcome,sep=""))
#start constructing model code string
modelCode <- c(
"model {",
" for (i in 1:n) {")
#start constructing code for R script file
rScriptText <- "library(rjags,coda)"
rScriptText <- c(rScriptText,paste("jags.data <- as.list(",deparse(substitute(originaldata)),")", sep=""),
"jags.data$n <- n")
#models for variables with non-zero order, i.e.
#partially observed substantive model covariates, the outcome of the substantive model,
#and any auxiliary variables
numVars <- max(order)
prevVars <- which(order==0)
priorCode <- vector(mode="character",0)
if (numVars>0) {
for (var in 1:numVars) {
targetCol <- which(order==var)
varName <- colnames(originaldata)[targetCol]
covNames <- colnames(originaldata)[prevVars]
if ((method[targetCol]=="norm") | (method[targetCol]=="logit") | (method[targetCol]=="pois")) {
if (method[targetCol]=="norm") {
varDist <- paste(" ",varName,"[i] ~ dnorm(mu_",varName,"[i], tau_",varName,")", sep="")
linPred <- paste(" ","mu_",varName,"[i] <- beta_",varName,"[1]", sep="")
}
else if (method[targetCol]=="logit") {
varDist <- paste(" ",varName,"[i] ~ dbern(mu_",varName,"[i])", sep="")
linPred <- paste(" ","logit(mu_",varName,"[i]) <- beta_",varName,"[1]", sep="")
}
else if (method[targetCol]=="pois") {
varDist <- paste(" ",varName,"[i] ~ dpois(mu_",varName,"[i])", sep="")
linPred <- paste(" ","log(mu_",varName,"[i]) <- beta_",varName,"[1]", sep="")
}
parNum <- 1
if (length(covNames)>0) {
for (i in 1:length(covNames)) {
covColNum <- which(colnames(originaldata)==covNames[i])
if ((method[covColNum]=="ologit") | (method[covColNum]=="mlogit")) {
#include this covariate as factor variable here
for (levelNum in 2:max(originaldata[,covColNum],na.rm=TRUE)) {
parNum <- parNum + 1
linPred <- paste(linPred, " + beta_",varName,"[",parNum,"]*equals(", covNames[i], "[i],",levelNum,")", sep="")
}
}
else {
parNum <- parNum + 1
linPred <- paste(linPred, " + beta_",varName,"[",parNum,"]*", covNames[i], "[i]", sep="")
}
}
}
}
else if (method[targetCol]=="mlogit") {
varDist <- paste(" ",varName,"[i] ~ dcat(pi_",varName,"[i,])", sep="")
numCats <- max(originaldata[,targetCol],na.rm=TRUE)
linPred <- paste(" ","pi_",varName,"[i,1] <- 1", sep="")
for (catNum in 2:numCats) {
linPred <- paste(linPred, " - pi_",varName,"[i,",catNum,"]", sep="")
}
for (catNum in 2:numCats) {
linPred <- c(linPred, paste(" ","pi_",varName,"[i,",catNum,"] <- exp(xb_",varName,"[i,",catNum-1,"])/denom_",varName,"[i]", sep=""))
}
denomExpr <- paste(" ","denom_",varName,"[i] <- 1", sep="")
for (catNum in 2:numCats) {
denomExpr <- paste(denomExpr,"+exp(xb_",varName,"[i,",catNum-1,"])", sep="")
}
linPred <- c(linPred,denomExpr)
for (catNum in 2:numCats) {
xbExpr <- paste(" ","xb_",varName,"[i,",catNum-1,"] <- beta_",varName,"_",catNum-1,"[1]", sep="")
parNum <- 1
if (length(covNames)>0) {
for (i in 1:length(covNames)) {
covColNum <- which(colnames(originaldata)==covNames[i])
if ((method[covColNum]=="ologit") | (method[covColNum]=="mlogit")) {
#include this covariate as factor variable here
for (levelNum in 2:max(originaldata[,covColNum],na.rm=TRUE)) {
parNum <- parNum + 1
xbExpr <- paste(xbExpr, " + beta_",varName,"[",parNum,"]*equals(", covNames[i], "[i],",levelNum,")", sep="")
}
}
else {
parNum <- parNum + 1
xbExpr <- paste(xbExpr, " + beta_",varName,"[",parNum,"]*", covNames[i], "[i]", sep="")
}
}
}
linPred <- c(linPred, xbExpr)
}
}
else if (method[targetCol]=="ologit") {
varDist <- paste(" ",varName,"[i] ~ dcat(pi_",varName,"[i,])", sep="")
numCats <- max(originaldata[,targetCol],na.rm=TRUE)
linPred <- paste(" ","xb_",varName,"[i] <- 0", sep="")
parNum <- 0
if (length(covNames)>0) {
for (i in 1:length(covNames)) {
covColNum <- which(colnames(originaldata)==covNames[i])
if ((method[covColNum]=="ologit") | (method[covColNum]=="mlogit")) {
#include this covariate as factor variable here
for (levelNum in 2:max(originaldata[,covColNum],na.rm=TRUE)) {
parNum <- parNum + 1
linPred <- paste(linPred, " + beta_",varName,"[",parNum,"]*equals(", covNames[i], "[i],",levelNum,")", sep="")
}
}
else {
parNum <- parNum + 1
linPred <- paste(linPred, " + beta_",varName,"[",parNum,"]*", covNames[i], "[i]", sep="")
}
}
}
piExpr <- paste(" ","pi_",varName,"[i,1] <- 1", sep="")
for (catNum in 2:numCats) {
piExpr <- paste(piExpr, " - pi_",varName,"[i,",catNum,"]", sep="")
}
for (catNum in 2:(numCats-1)) {
piExpr <- c(piExpr, paste(" ","pi_",varName,"[i,",catNum,"] <- 1/(1+exp(-k_",varName,"_",catNum," + xb_",
varName,"[i])) - 1/(1+exp(-k_",varName,"_",catNum-1," + xb_",varName,"[i]))", sep=""))
}
piExpr <- c(piExpr, paste(" ","pi_",varName,"[i,",numCats,"] <- 1 - 1/(1+exp(-k_",varName,"_",numCats-1," + xb_",varName,"[i]))", sep=""))
linPred <- c(linPred, piExpr)
}
else if (method[targetCol]=="") stop("You must enter a method for variables with nonzero order values.")
else stop(paste("Method ",method[targetCol]," not recognised.",sep=""))
#append to modelCode
modelCode <- c(modelCode,"",varDist,linPred)
#append to priorCode
if (method[targetCol]=="mlogit") {
priorCode <- c(priorCode,"")
for (catNum in 2:numCats) {
priorCode <- c(priorCode,paste(" beta_",varName,"_",catNum-1," ~ dmnorm(beta_",varName,"_",catNum-1,"_mean,beta_",varName,"_",catNum-1,"_prec)", sep=""))
}
}
else if (method[targetCol]=="ologit") {
if (length(covNames)>0) {
priorCode <- c(priorCode,"",paste(" beta_",varName," ~ dmnorm(beta_",varName,"_mean,beta_",varName,"_prec)", sep=""))
}
#priors for cutpoints in ordinal logistic regression
priorCode <- c(priorCode, paste(" k_",varName,"_1 ~ dnorm(k_",varName,"_1_mean, k_",varName,"_1_prec)", sep=""))
for (catNum in 2:(numCats-1)) {
priorCode <- c(priorCode, paste(" k_",varName,"_",catNum," <- k_",varName,"_1 + kinc_",varName,"_",catNum-1, sep=""))
priorCode <- c(priorCode, paste(" kinc_",varName,"_",catNum-1," ~ dlnorm(kinc_",varName,"_",catNum-1,"_mean, kinc_",varName,"_",catNum-1,"_prec)",sep=""))
}
}
else {
priorCode <- c(priorCode,"",paste(" beta_",varName," ~ dmnorm(beta_",varName,"_mean,beta_",varName,"_prec)", sep=""))
if (method[targetCol]=="norm") {
priorCode <- c(priorCode, paste(" tau_",varName," ~ dgamma(tau_",varName,"_alpha, tau_",varName,"_beta)", sep=""))
}
}
#append priors to JAGS data list
if (method[targetCol]=="mlogit") {
for (catNum in 2:numCats) {
rScriptText <- c(rScriptText, paste("jags.data$beta_",varName,"_",catNum-1,"_mean <- rep(0, ",parNum,")", sep=""),
paste("jags.data$beta_",varName,"_",catNum-1,"_prec <- 0.0001*diag(",parNum,")", sep=""))
}
}
else if (method[targetCol]=="ologit") {
if (length(covNames)>0) {
rScriptText <- c(rScriptText, paste("jags.data$beta_",varName,"_mean <- rep(0, ",parNum,")", sep=""),
paste("jags.data$beta_",varName,"_prec <- 0.0001*diag(",parNum,")", sep=""))
}
rScriptText <- c(rScriptText, paste("jags.data$k_",varName,"_1_mean <- 0", sep=""))
rScriptText <- c(rScriptText, paste("jags.data$k_",varName,"_1_prec <- 0.0001", sep=""))
for (catNum in 2:(numCats-1)) {
rScriptText <- c(rScriptText, paste("jags.data$kinc_",varName,"_",catNum-1,"_mean <- 0", sep=""))
rScriptText <- c(rScriptText, paste("jags.data$kinc_",varName,"_",catNum-1,"_prec <- 0.0001", sep=""))
}
}
else {
rScriptText <- c(rScriptText, paste("jags.data$beta_",varName,"_mean <- rep(0, ",parNum,")", sep=""),
paste("jags.data$beta_",varName,"_prec <- 0.0001*diag(",parNum,")", sep=""))
}
if (method[targetCol]=="norm") {
rScriptText <- c(rScriptText, paste("jags.data$tau_",varName,"_alpha <- 0.5",sep=""),
paste("jags.data$tau_",varName,"_beta <- 0.5", sep=""))
}
#if this is the outcome variable in the substantive model, add the model parameters to jags.params to monitor
if (targetCol==smoutcomecol) {
jagsparamline <- "jags.params <- c("
if (method[targetCol]=="mlogit") {
jagsparamline <- paste(jagsparamline, "\"beta_",varName,"_1\"", sep="")
if (numCats>2) {
for (catNum in 3:numCats) {
jagsparamline <- paste(jagsparamline, ",\"beta_",varName,"_",catNum-1,"\"", sep="")
}
}
jagsparamline <- paste(jagsparamline, ")", sep="")
}
else if (method[targetCol]=="ologit") {
jagsparamline <- paste(jagsparamline, "\"beta_",varName,"\"", sep="")
jagsparamline <- paste(jagsparamline, ",\"k_",varName,"_1\"", sep="")
for (catNum in 2:(numCats-1)) {
jagsparamline <- paste(jagsparamline, ",\"kinc_",varName,"_",catNum-1,"\"", sep="")
}
jagsparamline <- paste(jagsparamline, ")", sep="")
}
else if (method[targetCol]=="norm") {
jagsparamline <- paste(jagsparamline, "\"beta_",varName,"\",\"tau_",varName,"\")", sep="")
}
else {
jagsparamline <- paste(jagsparamline, "\"beta_",varName,"\")", sep="")
}
rScriptText <- c(rScriptText, jagsparamline)
}
prevVars <- c(prevVars,targetCol)
}
}
#write model to file
modFileName <- "bayesmissmod.bug"
fileConn <- file(modFileName )
modelCode <- c(modelCode, " }","")
priorCode <- c(priorCode, "}")
writeLines(c(modelCode, priorCode), fileConn)
close(fileConn)
print(paste("Your JAGS model has been written to: ", modFileName, sep=""))
#run bayes analysis
rScriptFileName <- "bayesmissRScript.r"
fileConn <- file(rScriptFileName)
rScriptText <- c(rScriptText, paste("jagsmodel <- jags.model(data=jags.data, file=\"",modFileName,"\",n.chains=5)", sep=""))
rScriptText <- c(rScriptText, "burnin <- coda.samples(jagsmodel, variable.names=jags.params, n.iter=200)")
rScriptText <- c(rScriptText, "mainsample <- coda.samples(jagsmodel, variable.names=jags.params, n.iter=200)")
rScriptText <- c(rScriptText, "summary(mainsample)")
rScriptText <- c(rScriptText, "gelman.diag(mainsample)")
writeLines(rScriptText, fileConn)
close(fileConn)
print(paste("Your R script has been written to: ", rScriptFileName, sep=""))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.