#' Bootstrap then impute an incomplete dataset
#'
#' Bootstraps an incomplete dataset and then imputes each bootstrap a number
#' of times. The resulting list of bootstrapped then imputed datasets can
#' be analysed with \code{\link{bootImputeAnalyse}}.
#'
#' The \code{impfun} must be a function which when passed an incomplete datasets
#' and possibly additional arguments, returns a list of (e.g. 2) imputed datasets.
#' The number of imputed datasets that \code{impfun} returns should match the value
#' you specify for the argument \code{nImp}. Depending on what your imputation function
#' returns by default, you may need to write a small wrapper function that calls
#' the imputation procedure and returns the list of \code{nImp} datasets.See the
#' Example for an illustration with the \code{mice} package.
#'
#' To improve computation times, \code{bootImpute} now supports
#' multiple cores through the \code{nCores} argument which uses the \code{parallel}
#' package.
#'
#' @param obsdata The data frame to be imputed.
#' @param impfun A function which when passed an incomplete dataset will
#' return a list of imputed data frames.
#' @param nBoot The number of bootstrap samples to take. It is recommended
#' that you use a minimum of 200. If you specify \code{nCores>1}, \code{nBoot} must
#' be a multiple of the specified \code{nCores} value.
#' @param nImp The number of times to impute each bootstrap sample. Two
#' is recommended.
#' @param nCores The number of CPU cores to use. If specified greater than one,
#' \code{bootImpute} will impute using the number of cores specified.
#' @param seed Random number seed.
#' @param ... Other parameters that are to be passed through to \code{impfun},
#' which will often include the argument that tells \code{impfun} to generate
#' as many imputations as specified by the value passed to \code{nImp}.
#' @return A list of imputed datasets.
#'
#' @example data-raw/bootImputeExamples.r
#'
#' @export
bootImpute <- function(obsdata, impfun, nBoot=200, nImp=2, nCores=1, seed=NULL, ...) {
if (nBoot<200) {
warning("It is recommended to use at least 200 bootstraps.")
}
if ((nCores>1) & (is.null(seed))) {
stop("If you specify nCores>1 you must set a seed.")
}
n <- dim(obsdata)[1]
imps <- vector("list", nBoot*nImp)
count <- 1
#check impfun returns what we want
newImps <- impfun(obsdata, ...)
#check newImps is a list of right type and length
if (typeof(newImps)!="list") {
stop("Your imputation function must return a list of imputed datasets.")
}
if (length(newImps)!=nImp) {
stop("Your imputation function must return the same number of imputed datasets as the value you specify for nImp.")
}
#capture extra arguments into an object called args
args <- list(...)
if (nCores>1) {
#use multiple cores
if ((nBoot %% nCores)!=0) stop("nBoot must be a multiple of nCores.")
nBootPerCore <- nBoot/nCores
#the setup_strategy argument here is to temporarily deal with
#this issue which affects Macs: https://github.com/rstudio/rstudio/issues/6692
cl <- parallel::makeCluster(nCores, setup_strategy = "sequential")
#cl <- parallel::makeCluster(nCores)
if (!is.null(seed)) {
parallel::clusterSetRNGStream(cl, seed)
}
dots <- list(...)
if (length(dots)==0) {
#no extra arguments to pass to imputation function
parallel::clusterExport(cl, c("obsdata", "impfun", "nBootPerCore", "nImp"),
envir=environment())
parImps <- parallel::parLapply(cl, X=1:nCores, fun = function(no){
bootImpute::bootImpute(obsdata, impfun, nBoot=nBootPerCore, nImp=nImp, nCores=1)
})
} else {
#some extra arguments to pass to imputation function
parallel::clusterExport(cl, c("obsdata", "impfun", "nBootPerCore", "nImp", "dots"),
envir=environment())
parImps <- parallel::parLapply(cl, X=1:nCores, fun = function(no){
newarg <- c(list(obsdata=obsdata, impfun=impfun, nBoot=nBootPerCore, nImp=nImp, nCores=1), dots)
do.call(bootImpute::bootImpute, newarg)
})
}
parallel::stopCluster(cl)
imps <- do.call(c, parImps)
} else {
if (!is.null(seed)) {
set.seed(seed)
}
for (b in 1:nBoot) {
#take bootstrap sample
bsIndices <- sample(1:n, replace=TRUE)
#impute nImp times
newImps <- impfun(obsdata[bsIndices,], ...)
for (m in 1:nImp) {
imps[[count]] <- newImps[[m]]
count <- count + 1
}
}
}
attributes(imps) <- list(nBoot=nBoot, nImp=nImp)
#return list of imputations
imps
}
#' Analyse bootstrapped and imputed estimates
#'
#' Applies the user specified analysis function to each imputed dataset contained
#' in \code{imps}, then calculates estimates, confidence intervals and p-values
#' for each parameter, as proposed by von Hippel and Bartlett (2021).
#'
#' Multiple cores can be used by using the \code{nCores} argument, which may be
#' useful for reducing computation times.
#'
#' @param imps The list of imputed datasets returned by \code{\link{bootImpute}}
#' @param analysisfun A function which when applied to a single dataset returns
#' the estimate of the parameter(s) of interest. The dataset to be analysed
#' is passed to \code{analysisfun} as its first argument.
#' @param nCores The number of CPU cores to use. If specified greater than one,
#' \code{bootImputeAnalyse} will use the number of cores specified. The
#' number of bootstrap samples in \code{imps} should be divisible by \code{nCores}.
#' @param quiet Specify whether to print a table of estimates, confidence intervals
#' and p-values.
#' @param ... Other parameters that are to be passed through to \code{analysisfun}.
#' @return A vector containing the point estimate(s), variance estimates, and
#' degrees of freedom.
#'
#' @references von Hippel PT, Bartlett JW. Maximum likelihood multiple imputation: faster,
#' more efficient imputation without posterior draws. Statistical Science, 2021, 36(3):400-420.
#' \doi{10.1214/20-STS793}
#'
#' @import stats
#'
#' @example data-raw/bootImputeAnalyseExamples.r
#'
#'
#' @export
bootImputeAnalyse <- function(imps, analysisfun, nCores=1, quiet=FALSE, ...) {
nBoot <- attributes(imps)$nBoot
nImp <- attributes(imps)$nImp
#find out how many parameters are returned by analysisFun
firstResult <- analysisfun(imps[[1]],...)
nParms <- length(firstResult)
ests <- array(0, dim=c(nBoot,nImp,nParms))
if (nCores>1) {
#use multiple cores
if ((nBoot %% nCores)!=0) stop("nBoot must be divisible by nCores.")
nBootPerCore <- nBoot/nCores
#the setup_strategy argument here is to temporarily deal with
#this issue: https://github.com/rstudio/rstudio/issues/6692
cl <- parallel::makeCluster(nCores, setup_strategy = "sequential")
dots <- list(...)
if (length(dots)==0) {
#no extra arguments to pass to analysis function
parallel::clusterExport(cl, c("imps", "analysisfun", "nBootPerCore", "nImp", "nParms"),
envir=environment())
parEsts <- parallel::parLapply(cl, X=1:nCores, fun = function(no){
estArray <- array(0, dim=c(nBootPerCore, nImp, nParms))
bootStart <- (no-1)*nBootPerCore+1
impToAnalyse <- (bootStart-1)*nImp + 1
for (i in 1:nBootPerCore) {
for (j in 1:nImp) {
estArray[i,j,] <- analysisfun(imps[[impToAnalyse]])
impToAnalyse <- impToAnalyse + 1
}
}
estArray
})
} else {
#some extra arguments to pass to analysis function
parallel::clusterExport(cl, c("imps", "analysisfun", "nBootPerCore", "nImp", "nParms", "dots"),
envir=environment())
parEsts <- parallel::parLapply(cl, X=1:nCores, fun = function(no){
estArray <- array(0, dim=c(nBootPerCore, nImp, nParms))
bootStart <- (no-1)*nBootPerCore+1
impToAnalyse <- (bootStart-1)*nImp + 1
for (i in 1:nBootPerCore) {
for (j in 1:nImp) {
newarg <- c(list(data=imps[[impToAnalyse]]), dots)
estArray[i,j,] <- do.call(analysisfun, newarg)
impToAnalyse <- impToAnalyse + 1
}
}
estArray
})
}
parallel::stopCluster(cl)
for (i in 1:nCores) {
ests[((i-1)*nBootPerCore+1):(i*nBootPerCore),,] <- parEsts[[i]]
}
} else {
#use single core
count <- 1
for (b in 1:nBoot) {
for (m in 1:nImp) {
ests[b,m,] <- analysisfun(imps[[count]],...)
count <- count + 1
}
}
}
#fit one way model, separately for each parameter
est <- array(0, dim=nParms)
var <- array(0, dim=nParms)
ci <- array(0, dim=c(nParms,2))
df <- array(0, dim=nParms)
for (i in 1:nParms) {
SSW <- sum((ests[,,i]-rowMeans(ests[,,i]))^2)
SSB <- nImp*sum((rowMeans(ests[,,i])-mean(ests[,,i]))^2)
MSW <- SSW/(nBoot*(nImp-1))
MSB <- SSB/(nBoot-1)
resVar <- MSW
randIntVar <- (MSB-MSW)/nImp
if (randIntVar <= 0) {
warning(paste("Parameter ",i," has an estimated between bootstrap variance of zero. You should re-run with a larger nBoot value.",sep=""))
randIntVar <- 0
resVar <- (SSW+SSB)/(nBoot*nImp-1)
}
est[i] <- mean(ests[,,i])
var[i] <- (1+1/nBoot)*randIntVar + resVar/(nBoot*nImp)
df[i] <- (var[i]^2)/((((nBoot+1)/(nBoot*nImp))^2*MSB^2 / (nBoot-1)) + MSW^2/(nBoot*nImp^2*(nImp-1)))
#prevent df from going below 3
df[i] <- max(3,df[i])
ci[i,] <- c(est[i]-stats::qt(0.975,df[i])*var[i]^0.5, est[i]+stats::qt(0.975,df[i])*var[i]^0.5)
}
if (quiet==FALSE) {
resTable <- array(0, dim=c(nParms,5))
resTable[,1] <- est
resTable[,2] <- var^0.5
resTable[,3] <- ci[,1]
resTable[,4] <- ci[,2]
resTable[,5] <- 2*stats::pt(abs(est/var^0.5), df=df,lower.tail = FALSE)
colnames(resTable) <- c("Estimate", "Std. error", "95% CI lower", "95% CI upper", "p")
rownames(resTable) <- names(firstResult)
print(resTable)
}
list(ests=est, var=var, ci=ci, df=df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.