R/pool.R
In rbmi: Reference Based Multiple Imputation

Documented in as.data.frame.pool get_ests_bmlmi get_pool_components parametric_ci pool pool_bootstrap_normal pool_bootstrap_percentile pool_internal pool_internal.bmlmi pool_internal.bootstrap pool_internal.jackknife pool_internal.rubin print.pool pval_percentile rubin_df rubin_rules transpose_results

#' Pool analysis results obtained from the imputed datasets
#'
#' @param results an analysis object created by [analyse()].
#'
#' @param conf.level confidence level of the returned confidence interval.
#' Must be a single number between 0 and 1. Default is 0.95.
#'
#' @param alternative a character string specifying the alternative hypothesis,
#' must be one of `"two.sided"` (default), `"greater"` or `"less"`.
#'
#' @param type a character string of either `"percentile"` (default) or
#' `"normal"`. Determines what method should be used to calculate the bootstrap confidence
#' intervals. See details.
#' Only used if `method_condmean(type = "bootstrap")` was specified
#' in the original call to [draws()].
#'
#' @param x a `pool` object generated by [pool()].
#'
#' @param ... not used.
#'
#' @details
#'
#' The calculation used to generate the point estimate, standard errors and
#' confidence interval depends upon the method specified in the original
#' call to [draws()]; In particular:
#'
#' - `method_approxbayes()` & `method_bayes()` both use Rubin's rules to pool estimates
#'  and variances across multiple imputed datasets, and the Barnard-Rubin rule to pool
#'  degree's of freedom; see Little & Rubin (2002).
#' - `method_condmean(type = "bootstrap")` uses percentile or normal approximation;
#' see Efron & Tibshirani (1994). Note that for the percentile bootstrap, no standard error is
#' calculated, i.e. the standard errors will be `NA` in the object / `data.frame`.
#' - `method_condmean(type = "jackknife")` uses the standard jackknife variance formula;
#' see Efron & Tibshirani (1994).
#' - `method_bmlmi` uses pooling procedure for Bootstrapped Maximum Likelihood MI (BMLMI).
#' See Von Hippel & Bartlett (2021).
#'
#' @references
#' Bradley Efron and Robert J Tibshirani. An introduction to the bootstrap. CRC
#' press, 1994. \[Section 11\]
#'
#' Roderick J. A. Little and Donald B. Rubin. Statistical Analysis with Missing
#' Data, Second Edition. John Wiley & Sons, Hoboken, New Jersey, 2002. \[Section 5.4\]
#'
#' Von Hippel, Paul T and Bartlett, Jonathan W.
#' Maximum likelihood multiple imputation: Faster imputations and consistent standard
#' errors without posterior draws. 2021.

#' @export
pool <- function(
    results,
    conf.level = 0.95,
    alternative = c("two.sided", "less", "greater"),
    type = c("percentile", "normal")
) {

    assert_that(
        has_class(results, "analysis")
    )
    validate(results)

    alternative <- match.arg(alternative)
    type <- match.arg(type)

    assert_that(
        is.numeric(conf.level),
        length(conf.level) == 1,
        0 < conf.level & conf.level < 1,
        msg = "`conf.level` must be between 0 and 1"
    )

    pool_type <- class(results$results)[[1]]

    results_transpose <- transpose_results(
        results$results,
        get_pool_components(pool_type)
    )

    pars <- lapply(
        results_transpose,
        function(x, ...) pool_internal(as_class(x, pool_type), ...),
        conf.level = conf.level,
        alternative = alternative,
        type = type,
        D = results$method$D
    )

    if (pool_type == "bootstrap") {
        method <- sprintf("%s (%s)", pool_type, type)
    } else {
        method <- pool_type
    }

    ret <- list(
        pars = pars,
        conf.level = conf.level,
        alternative = alternative,
        N = length(results$results),
        method = method
    )
    class(ret) <- "pool"
    return(ret)
}


#' Expected Pool Components
#'
#' Returns the elements expected to be contained in the analyse object
#' depending on what analysis method was specified.
#'
#' @param x Character name of the analysis method, must one of
#' either `"rubin"`, `"jackknife"`, "`bootstrap"` or `"bmlmi"`.
get_pool_components <- function(x) {
    switch(x,
        "rubin" = c("est", "df", "se"),
        "jackknife" = c("est"),
        "bootstrap" = c("est"),
        "bmlmi" = c("est")
    )
}


#' Internal Pool Methods
#'
#' Dispatches pool methods based upon results object class. See
#' [pool()] for details.
#'
#' @inheritParams pool
#' @param results a list of results i.e. the `x$results` element of an
#' `analyse` object created by [analyse()]).
#' @param D numeric representing the number of imputations between each bootstrap sample in the BMLMI method.
#'
#' @name pool_internal
#' @export
pool_internal <- function(results, conf.level, alternative, type, D) {
    UseMethod("pool_internal")
}


#' @importFrom stats qnorm pnorm
#' @rdname pool_internal
#' @export
pool_internal.jackknife <- function(results, conf.level, alternative, type, D) {
    alpha <- 1 - conf.level
    ests <- results$est
    est_point <- ests[1]
    ests_jack <- ests[-1] # First estimate is full dataset
    mean_jack <- mean(ests_jack)
    N_jack <- length(ests_jack)
    se_jack <- sqrt(((N_jack - 1) / N_jack) * sum((ests_jack - mean_jack)^2))
    ret <- parametric_ci(est_point, se_jack, alpha, alternative, qnorm, pnorm)
    return(ret)
}



#' @rdname pool_internal
#' @export
pool_internal.bootstrap <- function(
    results,
    conf.level,
    alternative,
    type = c("percentile", "normal"),
    D
) {
    type <- match.arg(type)
    bootfun <- switch(
        type,
        percentile = pool_bootstrap_percentile,
        normal = pool_bootstrap_normal
    )

    ret <- bootfun(results$est, conf.level, alternative)
    return(ret)
}


#' @importFrom stats qt pt var
#' @rdname pool_internal
#' @export
pool_internal.bmlmi <- function(
    results,
    conf.level,
    alternative,
    type,
    D
) {

    ests <- results$est
    alpha <- 1 - conf.level

    pooled_est <- get_ests_bmlmi(ests, D)

    ret <- parametric_ci(
        point = pooled_est$est_point,
        se = sqrt(pooled_est$est_var),
        alpha = alpha,
        alternative = alternative,
        qfun = qt,
        pfun = pt,
        df = pooled_est$df
    )

    return(ret)
}


#' @title Von Hippel and Bartlett pooling of BMLMI method
#'
#' @description Compute pooled point estimates, standard error and degrees of freedom
#' according to the Von Hippel and Bartlett formula for Bootstrapped Maximum Likelihood
#' Multiple Imputation (BMLMI).
#'
#' @param ests numeric vector containing estimates from the analysis of the imputed datasets.
#' @param D numeric representing the number of imputations between each bootstrap sample in the BMLMI method.
#'
#' @return a list containing point estimate, standard error and degrees of freedom.
#'
#' @details `ests` must be provided in the following order: the firsts D elements are related to analyses from
#' random imputation of one bootstrap sample. The second set of D elements (i.e. from D+1 to 2*D)
#' are related to the second bootstrap sample and so on.
#'
#' @references
#'   Von Hippel, Paul T and Bartlett, Jonathan W8.
#'   Maximum likelihood multiple imputation: Faster imputations and consistent standard errors
#' without posterior draws. 2021
get_ests_bmlmi <- function(ests, D) {

    l <- length(ests)

    assert_that(
        l %% D == 0,
        msg = "length of `ests` must be a multiple of `D`"
    )

    assert_that(
        D > 1,
        msg = "`D` must be a numeric larger than 1"
    )

    B <- l / D

    ests <- matrix(ests, nrow = B, ncol = D, byrow = TRUE)

    est_point <- mean(ests)

    SSW <- sum((ests - rowMeans(ests))^2)
    SSB <- D * sum((rowMeans(ests) - est_point)^2)
    MSW <- SSW / (B * (D - 1))
    MSB <- SSB / (B - 1)
    resVar <- MSW
    randIntVar <- (MSB - MSW) / D

    est_var <- (1 + 1 / B) * randIntVar + resVar / (B * D)
    df <- (est_var^2) / ((((B + 1) / (B * D))^2 * MSB^2 / (B - 1)) + MSW^2 / (B * D^2 * (D - 1)))
    df <- max(3, df)

    ret_obj <- list(
        est_point = est_point,
        est_var = est_var,
        df = df
    )
    return(ret_obj)
}

#' @importFrom stats qt pt
#' @rdname pool_internal
#' @export
pool_internal.rubin <- function(results, conf.level, alternative, type, D) {
    ests <- results$est
    ses <- results$se
    dfs <- results$df
    alpha <- 1 - conf.level

    v_com <- unique(dfs)

    assert_that(
        length(v_com) == 1,
        msg = "Degrees of freedom should be consistent across all samples"
    )

    res_rubin <- rubin_rules(
        ests = ests,
        ses = ses,
        v_com = v_com
    )

    ret <- parametric_ci(
        point = res_rubin$est_point,
        se = sqrt(res_rubin$var_t),
        alpha = alpha,
        alternative = alternative,
        qfun = qt,
        pfun = pt,
        df = res_rubin$df
    )

    return(ret)
}


#' @title Barnard and Rubin degrees of freedom adjustment
#'
#' @description Compute degrees of freedom according to the Barnard-Rubin formula.
#'
#' @param v_com Positive number representing the degrees of freedom in the complete-data analysis.
#' @param var_b Between-variance of point estimate across multiply imputed datasets.
#' @param var_t Total-variance of point estimate according to Rubin's rules.
#' @param M Number of imputations.
#'
#' @return Degrees of freedom according to Barnard-Rubin formula. See Barnard-Rubin (1999).
#'
#' @details The computation takes into account limit cases where there is no missing data
#'   (i.e. the between-variance `var_b` is zero) or where the complete-data degrees of freedom is
#'   set to `Inf`. Moreover, if `v_com` is given as `NA`, the function returns `Inf`.
#'
#' @references
#'   Barnard, J. and Rubin, D.B. (1999).
#'   Small sample degrees of freedom with multiple imputation. Biometrika, 86, 948-955.
rubin_df <- function(v_com, var_b, var_t, M) {

    if (is.na(v_com) || (is.infinite(v_com) && var_b == 0)) {
        df <- Inf
    } else {
        lambda <- (1 + 1 / M) * var_b / var_t

        if (!is.infinite(v_com)) {
            v_obs <- ((v_com + 1) / (v_com + 3)) * v_com * (1 - lambda)
        }

        if (lambda != 0) {
            v_old <- (M - 1)  / lambda^2
            df <- if (is.infinite(v_com)) {
                v_old
            } else {
                (v_old * v_obs) / (v_old + v_obs)
            }
        } else {
            df <- v_obs
        }
    }

    return(df)
}


#' @title Combine estimates using Rubin's rules
#'
#' @description Pool together the results from `M` complete-data analyses according to Rubin's rules. See details.
#'
#' @param ests Numeric vector containing the point estimates from the complete-data analyses.
#' @param ses Numeric vector containing the standard errors from the complete-data analyses.
#' @param v_com Positive number representing the degrees of freedom in the complete-data analysis.
#'
#' @return
#' A list containing:
#'
#' - `est_point`: the pooled point estimate according to Little-Rubin (2002).
#' - `var_t`: total variance according to Little-Rubin (2002).
#' - `df`: degrees of freedom according to Barnard-Rubin (1999).
#'
#' @details `rubin_rules` applies Rubin's rules (Rubin, 1987) for pooling together
#' the results from a multiple imputation procedure. The pooled point estimate `est_point` is
#' is the average across the point estimates from the complete-data analyses (given by the input argument `ests`).
#' The total variance `var_t` is the sum of two terms representing the within-variance
#' and the between-variance (see Little-Rubin (2002)). The function
#' also returns `df`, the estimated pooled degrees of freedom according to Barnard-Rubin (1999)
#' that can be used for inference based on the t-distribution.
#'
#' @seealso [rubin_df()] for the degrees of freedom estimation.
#'
#' @references
#' Barnard, J. and Rubin, D.B. (1999).
#' Small sample degrees of freedom with multiple imputation. Biometrika, 86, 948-955
#'
#' Roderick J. A. Little and Donald B. Rubin. Statistical Analysis with Missing
#' Data, Second Edition. John Wiley & Sons, Hoboken, New Jersey, 2002. \[Section 5.4\]
#' @importFrom stats var
rubin_rules <- function(ests, ses, v_com) {

    M <- length(ests)
    est_point <- mean(ests)

    if (all(is.na(ses))) {
        return(
            list(
                est_point = est_point,
                var_t = NA,
                df = NA
            )
        )
    }

    var_w <- mean(ses^2)
    var_b <- var(ests)
    var_t <- var_w + var_b + var_b / M

    df <- rubin_df(
        v_com = v_com,
        var_b = var_b,
        var_t = var_t,
        M = M
    )

    ret_obj <- list(
        est_point = est_point,
        var_t = var_t,
        df = df
    )

    return(ret_obj)
}







#' Bootstrap Pooling via Percentiles
#' @description
#' Get point estimate, confidence interval and p-value using
#' percentiles. Note that quantile "type=6" is used,
#' see [stats::quantile()] for details.
#' @param est a numeric vector of point estimates from each bootstrap sample.
#' @inheritParams pool
#' @details
#' The point estimate is taken to be the first element of `est`. The remaining
#' n-1 values of `est` are then used to generate the confidence intervals.
pool_bootstrap_percentile <- function(est, conf.level, alternative) {
    est_orig <- est[1]
    est <- est[-1]

    if (length(est) == 0) { # if n_samples = 0 we cannot estimate pvalue and CIs
        ret <- list(
            est = est_orig,  # First estimate should be original dataset
            ci = c(NA, NA),
            se = NA,
            pvalue = NA
        )
        return(ret)
    }

    alpha <- 1 - conf.level

    pvals <- pval_percentile(est = est)
    names(pvals) <- NULL

    index <- switch(
        alternative,
        two.sided = c(1, 2),
        greater = 1,
        less = 2
    )

    quant_2_side <- quantile(
        est,
        probs = c(alpha / 2, 1 - alpha / 2),
        type = 6,
        names = FALSE
    )
    quant_1_side <- quantile(
        est,
        probs = c(alpha, 1 - alpha),
        type = 6,
        names = FALSE
    )

    ci <- switch(
        alternative,
        two.sided = quant_2_side,
        greater = c(-Inf, quant_1_side[2]),
        less = c(quant_1_side[1], Inf)
    )

    ret <- list(
        est = est_orig,  # First estimate should be original dataset
        ci = ci,
        se = NA,
        pvalue = min(min(pvals[index]) * length(pvals[index]), 1)
    )
    return(ret)
}

#' @title P-value of percentile bootstrap
#'
#' @description Determines the (not necessarily unique) quantile (type=6) of "est" which gives a value of 0
#' From this, derive the p-value corresponding to the percentile bootstrap via inversion.
#'
#' @param est a numeric vector of point estimates from each bootstrap sample.
#'
#' @details The p-value for H_0: theta=0 vs H_A: theta>0 is the value `alpha` for which `q_alpha = 0`.
#' If there is at least one estimate equal to zero it returns the largest `alpha` such that `q_alpha = 0`.
#' If all bootstrap estimates are > 0 it returns 0; if all bootstrap estimates are < 0 it returns 1. Analogous
#' reasoning is applied for the p-value for H_0: theta=0 vs H_A: theta<0.
#'
#' @return A named numeric vector of length 2 containing the p-value for H_0: theta=0 vs H_A: theta>0
#' (`"pval_greater"`) and the p-value for H_0: theta=0 vs H_A: theta<0 (`"pval_less"`).
#'
#' @importFrom stats uniroot
pval_percentile <- function(est) {

    if (all(est == 0)) {
        ret <- c(1, 1)
    } else if (min(est) > 0) {
        ret <- c(0, 1)
    } else if (max(est) < 0) {
        ret <- c(1, 0)
    } else if (any(est == 0)) {
        # see ?quantile "type=6" section for explanation
        quant <- rank(est, ties.method = "first") / (length(est) + 1)
        ret <- c(
            min(quant[est == 0]),
            1 - max(quant[est == 0])
        )
    } else {
        x <- uniroot(
            function(q) quantile(est, probs = q, type = 6),
            lower = 0,
            upper = 1
        )$root
        ret <- c(x, 1 - x)
    }
    names(ret) <- c("pval_greater", "pval_less")
    return(ret)
}

#' Bootstrap Pooling via normal approximation
#' @description
#' Get point estimate, confidence interval and p-value using
#' the normal approximation.
#' @param est a numeric vector of point estimates from each bootstrap sample.
#' @inheritParams pool
#' @details
#' The point estimate is taken to be the first element of est. The remaining
#' n-1 values of est are then used to generate the confidence intervals.
#' @importFrom stats sd qnorm pnorm quantile
pool_bootstrap_normal <- function(est, conf.level, alternative) {
    est_orig <- est[1] # First estimate should be original dataset
    est <- est[-1]
    alpha <- 1 - conf.level
    se <- sd(est)
    ret <- parametric_ci(est_orig, se, alpha, alternative, qnorm, pnorm)
    return(ret)
}



#' Calculate parametric confidence intervals
#'
#' @description
#' Calculates confidence intervals based upon a parametric
#' distribution.
#'
#' @param point The point estimate.
#'
#' @param se The standard error of the point estimate. If using a non-"normal"
#' distribution this should be set to 1.
#'
#' @param alpha The type 1 error rate, should be a value between 0 and 1.
#'
#' @param alternative a character string specifying the alternative hypothesis,
#' must be one of "two.sided" (default), "greater" or "less".
#'
#' @param qfun The quantile function for the assumed distribution i.e. `qnorm`.
#'
#' @param pfun The CDF function for the assumed distribution i.e. `pnorm`.
#'
#' @param ... additional arguments passed on `qfun` and `pfun` i.e. `df = 102`.
#'
parametric_ci <- function(point, se, alpha, alternative, qfun, pfun, ...) {
    ci <- switch(
        alternative,
        two.sided = c(-1, 1) * qfun(1 - alpha / 2, ...),
        greater =  c(-Inf, 1) *  qfun(1 - alpha, ...),
        less = c(-1, Inf) * qfun(1 - alpha, ...)
    ) * se + point

    pvals <- c(
        pfun(point / se, lower.tail = TRUE, ...),
        pfun(point / se, lower.tail = FALSE, ...)
    )

    index <- switch(
        alternative,
        two.sided = c(1, 2),
        greater = 2,
        less = 1
    )

    ret <- list(
        est = point,
        ci = ci,
        se = se,
        pvalue = min(pvals[index]) * length(pvals[index])
    )
    return(ret)
}





#' Transpose results object
#'
#' Transposes a Results object (as created by [analyse()]) in order to group
#' the same estimates together into vectors.
#'
#' @param results A list of results.
#' @param components a character vector of components to extract
#' (i.e. `"est", "se"`).
#'
#' @details
#'
#' Essentially this function takes an object of the format:
#'
#' ```
#' x <- list(
#'     list(
#'         "trt1" = list(
#'             est = 1,
#'             se  = 2
#'         ),
#'         "trt2" = list(
#'             est = 3,
#'             se  = 4
#'         )
#'     ),
#'     list(
#'         "trt1" = list(
#'             est = 5,
#'             se  = 6
#'         ),
#'         "trt2" = list(
#'             est = 7,
#'             se  = 8
#'         )
#'     )
#' )
#' ```
#'
#' and produces:
#'
#' ```
#' list(
#'     trt1 = list(
#'         est = c(1,5),
#'         se = c(2,6)
#'     ),
#'     trt2 = list(
#'         est = c(3,7),
#'         se = c(4,8)
#'     )
#' )
#' ```
transpose_results <- function(results, components) {
    elements <- names(results[[1]])
    results_transpose <- list()
    for (element in elements) {
        results_transpose[[element]] <- list()
        for (comp in components) {
            results_transpose[[element]][[comp]] <- vapply(
                results,
                function(x) x[[element]][[comp]],
                numeric(1)
            )
        }
    }
    return(results_transpose)
}


#' @rdname pool
#' @export
as.data.frame.pool <- function(x, ...) {
    data.frame(
        parameter = names(x$pars),
        est = vapply(x$pars, function(x) x$est, numeric(1)),
        se = vapply(x$pars, function(x) x$se, numeric(1)),
        lci = vapply(x$pars, function(x) x$ci[[1]], numeric(1)),
        uci = vapply(x$pars, function(x) x$ci[[2]], numeric(1)),
        pval = vapply(x$pars, function(x) x$pvalue, numeric(1)),
        stringsAsFactors = FALSE,
        row.names = NULL
    )
}


#' @rdname pool
#' @export
print.pool <- function(x, ...) {

    n_string <- ife(
        x$method %in% c("rubin", "bmlmi"),
        as.character(x$N),
        sprintf("1 + %s", x$N - 1)
    )

    string <- c(
        "",
        "Pool Object",
        "-----------",
        sprintf("Number of Results Combined: %s", n_string),
        sprintf("Method: %s", x$method),
        sprintf("Confidence Level: %s", x$conf.level),
        sprintf("Alternative: %s", x$alternative),
        "",
        "Results:",
        as_ascii_table(as.data.frame(x), pcol = "pval"),
        ""
    )

    cat(string, sep = "\n")
    return(invisible(x))
}