R/bayesmiss.R

#' Bayesian regression with missing covariates
#'
#' \code{bayesmiss} generates JAGS model code and an R script to perform
#' Bayesian regression, allowing for missingness in covariates.
#'
#' \code{bayesmiss} faciliates running Bayesian regression models in which there
#' are missing values in some of the covariates. The function generates two files,
#' one a JAGS model file, and one a R script file. The JAGS model defines the
#' user's substantive model, and models as required to handle the missing covariates,
#' and any auxiliary variables if present. The R script file generated contains
#' commands to generate the required JAGS data and parameter objects, and code
#' required to fit the model. Note that \code{bayesmiss} does not actually run this
#' code. Instead, the user is advised to check the model specification and R
#' script to ensure it is as desired. The R code can then be run to fit the model.
#'
#' The method argument specifies what model type to use for each variable.
#' Currently the possible values are:
#' \itemize{
#'  \item \code{"norm"} (normal linear model)
#'  \item \code{"logit"} (logistic regression)
#'  \item \code{"mlogit"} (multinomial logistic regression)
#'  \item \code{"ologit"} (ordinal logistic regression)
#'  \item \code{"pois"} (Poisson regression)
#' }
#' The element corresponding to the outcome of the
#' substantive model should be specified as desired according to the desired
#' type of substantive model. A model type must be specified for all auxiliary
#' variables. Entries corresponding to variables which are covariates in the
#' substantive model and which are fully observed should be specified as "".
#' Variables modelled using mlogit or ologit should be stored as numeric
#' (not factors), and coded 1:K, where K is the number of categories.
#'
#' The \code{order} argument is used to specify the order in which the joint
#' model is factorized. Elements corresponding to variables which are covariates
#' in the substantive model should be specified as 0. Substantive model covariates
#' which have missing values should come next. The substantive model outcome variable
#' should come next, followed by any auxiliary variables.
#'
#' Note that the MCMC options in the call to \code{jags} are just suggested defaults.
#' It is up to the user to ensure, via the usual diagnostics for MCMC, that a
#' sufficient number of iterations have been run to ensure convergence of the chains.
#' In particular, the code generated by \code{bayesmiss} does not specify initial
#' values for parameters. To assess convergence general advice is to choose different
#' overdispersed initial values for each chain.
#'
#' If it is desired to add interactions or non-linear covariate effects, first
#' run \code{bayesmiss} omitting these terms, and then modify the JAGS code file
#' and R code specifying the priors as needed.
#'
#' @param originaldata the data frame upon which the analysis is to be performed.
#'
#' @param smoutcome the name of the column corresponding to the outcome variable
#' of the substantive model.
#'
#' @param method a vector of strings specifying the model type to use for each of
#' the columns in \code{originaldata}.
#'
#' @param order a vector specifying the order in which the joint model should
#' be constructed. e.g. c(0,0,1,2) specifies that the joint model be factorized
#' as f(v3|v1,v2)f(v4|v1,v2,v3).
#'
#' @return \code{bayesmiss} generates two files in the current working directory:
#' \code{bayesmissmod.bug} is a JAGS model file for the constructed model, and \code{bayesmissRscript.r}
#' is an R file containing R code for generating the required JAGS parameters and data
#' objects, and a call to the \code{jags} function of the \code{R2jags} package for
#' fitting the model.
#'
#' @export
bayesmiss <- function(originaldata,smoutcome,method,order,factorcov=TRUE) {
  n <- dim(originaldata)[1]
  #create matrix of response indicators
  r <- 1*(is.na(originaldata)==0)

  if (ncol(originaldata)!=length(order)) stop("Argument to order must have the same length as the number of columns in the data frame.")
  if (ncol(originaldata)!=length(method)) stop("Argument to method must have the same length as the number of columns in the data frame.")

  smoutcomecol <- which(colnames(originaldata)==smoutcome)
  if (length(smoutcomecol)==0) stop(paste("No variable found with name: ",smoutcome,sep=""))

  #start constructing model code string
  modelCode <- c(
    "model {",
    "   for (i in 1:n) {")

  #start constructing code for R script file
  rScriptText <- "library(rjags,coda)"
  rScriptText <- c(rScriptText,paste("jags.data <- as.list(",deparse(substitute(originaldata)),")", sep=""),
  "jags.data$n <- n")

  #models for variables with non-zero order, i.e.
  #partially observed substantive model covariates, the outcome of the substantive model,
  #and any auxiliary variables
  numVars <- max(order)
  prevVars <- which(order==0)
  priorCode <- vector(mode="character",0)

  if (numVars>0) {
    for (var in 1:numVars) {
      targetCol <- which(order==var)
      varName <- colnames(originaldata)[targetCol]
      covNames <- colnames(originaldata)[prevVars]

      if ((method[targetCol]=="norm") | (method[targetCol]=="logit") | (method[targetCol]=="pois")) {
        if (method[targetCol]=="norm") {
          varDist <- paste("      ",varName,"[i] ~ dnorm(mu_",varName,"[i], tau_",varName,")", sep="")
          linPred <- paste("      ","mu_",varName,"[i] <- beta_",varName,"[1]", sep="")
        }
        else if (method[targetCol]=="logit") {
          varDist <- paste("      ",varName,"[i] ~ dbern(mu_",varName,"[i])", sep="")
          linPred <- paste("      ","logit(mu_",varName,"[i]) <- beta_",varName,"[1]", sep="")
        }
        else if (method[targetCol]=="pois") {
          varDist <- paste("      ",varName,"[i] ~ dpois(mu_",varName,"[i])", sep="")
          linPred <- paste("      ","log(mu_",varName,"[i]) <- beta_",varName,"[1]", sep="")
        }
        parNum <- 1

        if (length(covNames)>0) {
          for (i in 1:length(covNames)) {
            covColNum <- which(colnames(originaldata)==covNames[i])
            if ((method[covColNum]=="ologit") | (method[covColNum]=="mlogit")) {
              #include this covariate as factor variable here
              for (levelNum in 2:max(originaldata[,covColNum],na.rm=TRUE)) {
                parNum <- parNum + 1
                linPred <- paste(linPred, " + beta_",varName,"[",parNum,"]*equals(", covNames[i], "[i],",levelNum,")", sep="")
              }
            }
            else {
              parNum <- parNum + 1
              linPred <- paste(linPred, " + beta_",varName,"[",parNum,"]*", covNames[i], "[i]", sep="")
            }
          }
        }

      }
      else if (method[targetCol]=="mlogit") {
        varDist <- paste("      ",varName,"[i] ~ dcat(pi_",varName,"[i,])", sep="")
        numCats <- max(originaldata[,targetCol],na.rm=TRUE)

        linPred <- paste("      ","pi_",varName,"[i,1] <- 1", sep="")
        for (catNum in 2:numCats) {
          linPred <- paste(linPred, " - pi_",varName,"[i,",catNum,"]", sep="")
        }
        for (catNum in 2:numCats) {
          linPred <- c(linPred, paste("      ","pi_",varName,"[i,",catNum,"] <- exp(xb_",varName,"[i,",catNum-1,"])/denom_",varName,"[i]", sep=""))
        }
        denomExpr <- paste("      ","denom_",varName,"[i] <- 1", sep="")
        for (catNum in 2:numCats) {
          denomExpr <- paste(denomExpr,"+exp(xb_",varName,"[i,",catNum-1,"])", sep="")
        }
        linPred <- c(linPred,denomExpr)
        for (catNum in 2:numCats) {
          xbExpr <- paste("      ","xb_",varName,"[i,",catNum-1,"] <- beta_",varName,"_",catNum-1,"[1]", sep="")
          parNum <- 1
          if (length(covNames)>0) {
            for (i in 1:length(covNames)) {
              covColNum <- which(colnames(originaldata)==covNames[i])
              if ((method[covColNum]=="ologit") | (method[covColNum]=="mlogit")) {
                #include this covariate as factor variable here
                for (levelNum in 2:max(originaldata[,covColNum],na.rm=TRUE)) {
                  parNum <- parNum + 1
                  xbExpr <- paste(xbExpr, " + beta_",varName,"[",parNum,"]*equals(", covNames[i], "[i],",levelNum,")", sep="")
                }
              }
              else {
                parNum <- parNum + 1
                xbExpr <- paste(xbExpr, " + beta_",varName,"[",parNum,"]*", covNames[i], "[i]", sep="")
              }
            }
          }
          linPred <- c(linPred, xbExpr)
        }
      }
      else if (method[targetCol]=="ologit") {
        varDist <- paste("      ",varName,"[i] ~ dcat(pi_",varName,"[i,])", sep="")
        numCats <- max(originaldata[,targetCol],na.rm=TRUE)

        linPred <- paste("      ","xb_",varName,"[i] <- 0", sep="")

        parNum <- 0
        if (length(covNames)>0) {
          for (i in 1:length(covNames)) {
            covColNum <- which(colnames(originaldata)==covNames[i])
            if ((method[covColNum]=="ologit") | (method[covColNum]=="mlogit")) {
              #include this covariate as factor variable here
              for (levelNum in 2:max(originaldata[,covColNum],na.rm=TRUE)) {
                parNum <- parNum + 1
                linPred <- paste(linPred, " + beta_",varName,"[",parNum,"]*equals(", covNames[i], "[i],",levelNum,")", sep="")
              }
            }
            else {
              parNum <- parNum + 1
              linPred <- paste(linPred, " + beta_",varName,"[",parNum,"]*", covNames[i], "[i]", sep="")
            }
          }
        }
        piExpr <- paste("      ","pi_",varName,"[i,1] <- 1", sep="")
        for (catNum in 2:numCats) {
          piExpr <- paste(piExpr, " - pi_",varName,"[i,",catNum,"]", sep="")
        }
        for (catNum in 2:(numCats-1)) {
          piExpr <- c(piExpr, paste("      ","pi_",varName,"[i,",catNum,"] <- 1/(1+exp(-k_",varName,"_",catNum," + xb_",
                                    varName,"[i])) - 1/(1+exp(-k_",varName,"_",catNum-1," + xb_",varName,"[i]))", sep=""))
        }
        piExpr <- c(piExpr, paste("      ","pi_",varName,"[i,",numCats,"] <- 1 - 1/(1+exp(-k_",varName,"_",numCats-1," + xb_",varName,"[i]))", sep=""))
        linPred <- c(linPred, piExpr)
      }
      else if (method[targetCol]=="") stop("You must enter a method for variables with nonzero order values.")
      else stop(paste("Method ",method[targetCol]," not recognised.",sep=""))

      #append to modelCode
      modelCode <- c(modelCode,"",varDist,linPred)

      #append to priorCode
      if (method[targetCol]=="mlogit") {
        priorCode <- c(priorCode,"")
        for (catNum in 2:numCats) {
          priorCode <- c(priorCode,paste("   beta_",varName,"_",catNum-1," ~ dmnorm(beta_",varName,"_",catNum-1,"_mean,beta_",varName,"_",catNum-1,"_prec)", sep=""))
        }
      }
      else if (method[targetCol]=="ologit") {
        if (length(covNames)>0) {
          priorCode <- c(priorCode,"",paste("   beta_",varName," ~ dmnorm(beta_",varName,"_mean,beta_",varName,"_prec)", sep=""))
        }
        #priors for cutpoints in ordinal logistic regression
        priorCode <- c(priorCode, paste("   k_",varName,"_1 ~ dnorm(k_",varName,"_1_mean, k_",varName,"_1_prec)", sep=""))
        for (catNum in 2:(numCats-1)) {
          priorCode <- c(priorCode, paste("   k_",varName,"_",catNum," <- k_",varName,"_1 + kinc_",varName,"_",catNum-1,  sep=""))
          priorCode <- c(priorCode, paste("   kinc_",varName,"_",catNum-1," ~ dlnorm(kinc_",varName,"_",catNum-1,"_mean, kinc_",varName,"_",catNum-1,"_prec)",sep=""))
        }
      }
      else {
        priorCode <- c(priorCode,"",paste("   beta_",varName," ~ dmnorm(beta_",varName,"_mean,beta_",varName,"_prec)", sep=""))
        if (method[targetCol]=="norm") {
          priorCode <- c(priorCode, paste("   tau_",varName," ~ dgamma(tau_",varName,"_alpha, tau_",varName,"_beta)", sep=""))
        }
      }

      #append priors to JAGS data list
      if (method[targetCol]=="mlogit") {
        for (catNum in 2:numCats) {
          rScriptText <- c(rScriptText, paste("jags.data$beta_",varName,"_",catNum-1,"_mean <- rep(0, ",parNum,")", sep=""),
                           paste("jags.data$beta_",varName,"_",catNum-1,"_prec <- 0.0001*diag(",parNum,")", sep=""))
        }
      }
      else if (method[targetCol]=="ologit") {
        if (length(covNames)>0) {
          rScriptText <- c(rScriptText, paste("jags.data$beta_",varName,"_mean <- rep(0, ",parNum,")", sep=""),
                           paste("jags.data$beta_",varName,"_prec <- 0.0001*diag(",parNum,")", sep=""))
        }
        rScriptText <- c(rScriptText, paste("jags.data$k_",varName,"_1_mean <- 0", sep=""))
        rScriptText <- c(rScriptText, paste("jags.data$k_",varName,"_1_prec <- 0.0001", sep=""))
        for (catNum in 2:(numCats-1)) {
          rScriptText <- c(rScriptText, paste("jags.data$kinc_",varName,"_",catNum-1,"_mean <- 0", sep=""))
          rScriptText <- c(rScriptText, paste("jags.data$kinc_",varName,"_",catNum-1,"_prec <- 0.0001", sep=""))
        }
      }
      else {
        rScriptText <- c(rScriptText, paste("jags.data$beta_",varName,"_mean <- rep(0, ",parNum,")", sep=""),
          paste("jags.data$beta_",varName,"_prec <- 0.0001*diag(",parNum,")", sep=""))
      }
      if (method[targetCol]=="norm") {
        rScriptText <- c(rScriptText, paste("jags.data$tau_",varName,"_alpha <- 0.5",sep=""),
                         paste("jags.data$tau_",varName,"_beta <- 0.5", sep=""))
      }

      #if this is the outcome variable in the substantive model, add the model parameters to jags.params to monitor
      if (targetCol==smoutcomecol) {
        jagsparamline <- "jags.params <- c("

        if (method[targetCol]=="mlogit") {
          jagsparamline <- paste(jagsparamline, "\"beta_",varName,"_1\"", sep="")
          if (numCats>2) {
            for (catNum in 3:numCats) {
              jagsparamline <- paste(jagsparamline, ",\"beta_",varName,"_",catNum-1,"\"", sep="")
            }
          }
          jagsparamline <- paste(jagsparamline, ")", sep="")
        }
        else if (method[targetCol]=="ologit") {
          jagsparamline <- paste(jagsparamline, "\"beta_",varName,"\"", sep="")
          jagsparamline <- paste(jagsparamline, ",\"k_",varName,"_1\"", sep="")
          for (catNum in 2:(numCats-1)) {
            jagsparamline <- paste(jagsparamline, ",\"kinc_",varName,"_",catNum-1,"\"", sep="")
          }
          jagsparamline <- paste(jagsparamline, ")", sep="")
        }
        else if (method[targetCol]=="norm") {
          jagsparamline <- paste(jagsparamline, "\"beta_",varName,"\",\"tau_",varName,"\")", sep="")
        }
        else {
          jagsparamline <- paste(jagsparamline, "\"beta_",varName,"\")", sep="")
        }
        rScriptText <- c(rScriptText, jagsparamline)
      }

      prevVars <- c(prevVars,targetCol)
    }
  }

  #write model to file
  modFileName <- "bayesmissmod.bug"
  fileConn <- file(modFileName )
  modelCode <- c(modelCode, "   }","")
  priorCode <- c(priorCode, "}")
  writeLines(c(modelCode, priorCode), fileConn)
  close(fileConn)

  print(paste("Your JAGS model has been written to: ", modFileName, sep=""))

  #run bayes analysis
  rScriptFileName <- "bayesmissRScript.r"
  fileConn <- file(rScriptFileName)

  rScriptText <- c(rScriptText, paste("jagsmodel <- jags.model(data=jags.data, file=\"",modFileName,"\",n.chains=5)", sep=""))
  rScriptText <- c(rScriptText, "burnin <- coda.samples(jagsmodel, variable.names=jags.params, n.iter=200)")
  rScriptText <- c(rScriptText, "mainsample <- coda.samples(jagsmodel, variable.names=jags.params, n.iter=200)")
  rScriptText <- c(rScriptText, "summary(mainsample)")
  rScriptText <- c(rScriptText, "gelman.diag(mainsample)")
  writeLines(rScriptText, fileConn)
  close(fileConn)

  print(paste("Your R script has been written to: ", rScriptFileName, sep=""))

}
jwb133/bayesmiss documentation built on May 20, 2019, 5:21 a.m.