mlmi: Maximum Likelihood Multiple Imputation

Documented in scoreBased

#' Score based variance estimation for multiple imputation
#'
#' This function implements the score based variance estimation approach described by von Hippel
#' and Bartlett (2021), which is based on earlier work by Wang and Robins (1998).
#'
#' @param imps A list of imputed datasets produced by one of the imputation functions
#' in \code{mlmi} or another package.
#' @param analysisFun A function to analyse the imputed datasets that when applied to
#' a dataset returns a list containing a vector \code{est}.
#' @param scoreFun A function whose first argument is a dataset and whose second argument is
#' a vector of parameter values. It should return a matrix of subject level scores
#' evaluated at the parameter value passed to it.
#' @param pd If \code{imps} was not generated by one of the imputation functions in
#' \code{mlmi}, this argument must be specified to indicate whether the imputations
#' were generated using posterior draws (TRUE) or not (FALSE).
#' @param dfComplete The complete data degrees of freedom. If \code{analysisFun} returns a vector
#' of parameter estimates, \code{dfComplete} should be a vector of the same length. If not
#' specified, it is assumed that the complete data degrees of freedom is effectively infinite (1e+05).
#' @param ... Other parameters that are to be passed through to \code{analysisFun}.
#' @return A list containing the overall parameter estimates, its corresponding covariance matrix, and
#' degrees of freedom for each parameter.
#'
#' @references Wang N., Robins J.M. (1998) Large-sample theory for parametric multiple imputation procedures.
#' Biometrika 85(4): 935-948. \doi{10.1093/biomet/85.4.935}.
#'
#' @references von Hippel P.T. and Bartlett J.W. Maximum likelihood multiple imputation: faster,
#' more efficient imputation without posterior draws. Statistical Science 2021; 36(3) 400-420 \doi{10.1214/20-STS793}.
#'
#' @example data-raw/sbExample.r
#'
#'
#' @export
scoreBased <- function(imps, analysisFun, scoreFun, pd=NULL, dfComplete=NULL, ...) {
  M <- length(imps)
  if ("pd" %in% names(attributes(imps)) == TRUE) {
    pd <- as.logical(attributes(imps)['pd'])
  } else {
    if (is.null(pd)==TRUE) {
      stop("Your imputed datasets doesn't have a pd attribute stored, so you must use specify the pd argument when calling wb.")
    }
  }

  #run analysis on first imputation to find out length of parameter vector
  result <- analysisFun(imps[[1]],...)
  numParms <- length(result$est)
  N <- dim(imps[[1]])[1]

  #analyse each imputed datasets
  ests <- array(0, dim=c(M,numParms))
  for (m in 1:M) {
    result <- analysisFun(imps[[m]],...)
    ests[m,] <- result$est
  }
  thetaHat <- colMeans(ests)
  Bhat <- var(ests)
  scores <- array(0, dim=c(M,N,numParms))
  Vcom_inv <- array(0, dim=c(numParms,numParms))
  for (m in 1:M) {
    scores[m,,] <- scoreFun(imps[[m]], thetaHat,...)
    Vcom_inv <- Vcom_inv + t(scores[m,,]) %*% scores[m,,]
  }
  Vcom_inv <- Vcom_inv / M
  scom_bar <- apply(scores, c(2,3), mean)
  Vmis_inv <- array(0, dim=c(numParms,numParms))
  for (m in 1:M) {
    Vmis_inv <- Vmis_inv + t(scores[m,,]-scom_bar) %*% (scores[m,,]-scom_bar)
  }
  Vmis_inv <- Vmis_inv/(M-1)
  VML_inv <- Vcom_inv - Vmis_inv
  Vcom <- solve(Vcom_inv)

  #ensure matrices are positive definite for next calculation to avoid numerical errors
  Vcom <- Matrix::nearPD(Vcom)$mat
  Vmis_inv <- Matrix::nearPD(Vmis_inv)$mat

  gammaHatMis <- Vmis_inv %*% Vcom
  gammaHatObs <- diag(numParms) - gammaHatMis
  gammaTildeMis <- H(gammaHatMis, (M-1)*N)

  Vcom <- as.matrix(Vcom)
  gammaTildeMis <- as.matrix(gammaTildeMis)

  gammaTildeObs <- diag(numParms) - gammaTildeMis
  VTildeML <- Vcom %*% solve(gammaTildeObs)
  totalVar <- VTildeML + Bhat/M

  if (is.null(dfComplete)) {
    #assume effectively infinite degrees of freedom for each parameter
    dfComplete <- rep(100000,numParms)
  }

  nuObs <- dfComplete*diag(gammaTildeObs)*(dfComplete+3)/(dfComplete+1)
  MIdf <-  diag(totalVar)^2 / (diag(VTildeML)^2/nuObs + (diag(Bhat)/M)^2/(M-1))
  #don't let the df go below 3
  MIdf <- pmax(3,MIdf)

  list(est=thetaHat, var=totalVar, df=MIdf)
}