R/inference.R

Defines functions bws_wrapper glm_wrapper qgcomp_lin_wrapper fin_wrapper bma_wrapper bkmr_wrapper fit.mpower_InferenceModel fit new_InferenceModel InferenceModel

Documented in bkmr_wrapper bma_wrapper bws_wrapper fin_wrapper fit glm_wrapper InferenceModel qgcomp_lin_wrapper

#' Statistical model that returns significance criterion
#'
#' This function creates a wrapper function for a statistical model and its
#' applicable significance criterion. It finds relationships between a matrix
#' of predictors and a vector of outcomes using the statistical model, and
#' determines if the relationships are 'significant' according to pre-specified
#' criterion for that model.
#'
#' @param model A string of the name of a built-in statistical model or a function
#'   implements a statistical model and returns a list of significance
#'   criteria for each predictor. Built-in options include 'bma', 'bkmr', 'mixselect',
#'   'bws', 'qgc', 'fin', 'glm'.
#' @param name A string, name of the statistical model. Default to string input
#'   of model.
#' @param ... Additional keyword arguments for the statistical model
#' @return An InferenceModel object.
#' \item{model}{a function that takes matrices of predictors and outcomes and
#' returns a list of significance criteria.}
#' \item{model_name}{a string.}
#' @examples
#' imod <- mpower::InferenceModel(model = 'glm', family = 'gaussian',
#' formula = y ~ Poverty*(poly(Age, 2) + HHIncome + HomeOwn + Education))
#' @export
InferenceModel <- function(model, name = NULL, ...) {
    mod <- list(model_name = model, args = list(...))
    if (is.function(model)) {
        mod[["model"]] <- model
        mod[["model_name"]] <- name
    } else if (is.character(model)) {
        if (model == "bma") {
            if (!requireNamespace("BMA")) {
                stop("Package 'BMA' not installed.")
            }
            mod[["model"]] <- bma_wrapper
        } else if (model == "bkmr") {
            if (!requireNamespace("bkmr")) {
                stop("Package 'bkmr' not installed.")
            }
            mod[["model"]] <- bkmr_wrapper
        } else if (model == "bws") {
            if (!requireNamespace("bws")) {
                stop("Package 'bws' not installed.")
            }
            message("Estimating the power of detecting the overall effect based on posterior credible interval. Running sim_power() with cores > 1 is recommended.")
            mod[["model"]] <- bws_wrapper
        } else if (model == "qgc") {
            if (!requireNamespace("qgcomp")) {
                stop("Package 'qgcomp' not installed.")
            }
            mod[["model"]] <- qgcomp_lin_wrapper
        } else if (model == "fin") {
            if (!requireNamespace("infinitefactor")) {
                stop("Package 'infinitefactor' not installed.")
            }
            mod[["model"]] <- fin_wrapper
        } else if (model == "glm") {
            message("Estimating the power of conditional t-test on each regression coefficient.")
            mod[["model"]] <- glm_wrapper
        } else {
            stop(paste(model, "not implemented"))
        }
    } else {
        stop("Invalid input for `model`")
    }

    new_InferenceModel(mod)
}

new_InferenceModel <- function(l = list()) {
    stopifnot(is.list(l))
    structure(l, class = "mpower_InferenceModel")
}

#' Fits the model to given data and gets a list of significance criteria
#' @param mod An InferenceModel object
#' @param x A matrix of predictors
#' @param y A vector of outcome
#' @return A list of some of the following significance criteria:
#' \item{beta}{The smaller
#' posterior probability of being to one side of zero for linear term, given
#' either the main effect or interaction is non-zero. Applicable to 'bma',
#' 'bws', 'fin', and 'mixselect' model.}
#' \item{interact_beta}{Same as linear_beta but
#' for pair-wise interactions. Applicable to 'fin' model.}
#' \item{pip}{Posterior inclusion probability (PIP) of either a linear or
#' non-linear effect. Applicable to 'bma', 'bkmr', and 'mixselect' model.}
#' \item{group_pip}{PIP of either a linear or non-linear effect. Applicable to
#' 'bkmr' model.}
#' \item{linear_pip}{PIP of a linear effect. Applicable to 'mixselect' model.}
#' \item{gp_pip}{PIP of a non-linear effect. Applicable to 'mixselect' model.}
#' \item{pval}{The p-value of the combined effect, the same for all
#' predictors. Applicable to 'glm', and 'qgc' model.}
#' \item{time}{elapsed time to fit the model.}
#' @export
fit <- function(mod, x, y) {
    UseMethod("fit")
}

#' @export
fit.mpower_InferenceModel <- function(mod, x, y) {
    if (length(mod$args) == 0) {
        return(mod$model(y = y, x = x))
    }
    return(mod$model(y = y, x = x, args = mod$args))
}

#' Fits a BKMR model with significance criteria: PIP and group-wise PIP
#' @param x A matrix of predictors
#' @param y A vector of outcome
#' @param args A list of arguments, see R function `bkmr::kmbayes()`
#' @return A list of vectors whose values are between 0 and 1
#' \item{pip}{PIP for component-wise selection or conditional (with-in group)
#' PIP for hierarchical variable selection.}
#' \item{group_pip}{PIP for group-specific selection.}
#' \item{time}{elapsed time to fit the model.}
#' @section Reference:
#'
#'   Bobb JF, Henn BC, Valeri L, Coull BA (2018). “Statistical software for
#'   analyzing the health effects of multiple concurrent exposures via Bayesian
#'   kernel machine regression.”Environ-mental
#'   Health,17(67).doi:10.1186/s12940-018-0413-y.
#' @export
bkmr_wrapper <- function(y, x, args = list()) {
    args[["varsel"]] <- TRUE
    x <- stats::model.matrix(~. - 1, x)
    s <- Sys.time()
    km_fit <- do.call(bkmr::kmbayes, c(list(y = y, Z = x), args))
    km_time <- Sys.time() - s
    km_pip <- bkmr::ExtractPIPs(km_fit)  # by default keeps the second half of all samples
    # km_pip <- km_pip[, (ncol(km_pip)/2 + 1):ncol(km_pip)]
    pip <- km_pip$PIP
    if (is.null(pip))
        pip <- km_pip$condPIP
    names(pip) <- colnames(x)
    group_pip <- km_pip$groupPIP
    if (!is.null(group_pip))
        names(group_pip) <- colnames(x)
    return(list(pip = pip, group_pip = group_pip, time = km_time))
}

#' Fits a linear model with Bayesian model selection with significance criteria:
#' PIP and posterior probability of nonzero coefficients being on one side of
#' zero.
#' @param x A matrix of predictors
#' @param y A vector of outcome
#' @param args A list of arguments see R function `BMA::bic.glm()`.
#' @return A list of vectors whose values are between 0 and 1
#' \item{beta}{The smaller posterior probability of the coefficients being to
#' one side of zero: min(Pr(beta >0), Pr(beta<0)).}
#' \item{pip}{PIP of the
#' effect not being zero.}
#' \item{time}{elapsed time to fit the model.}
#' @section Reference:
#'
#'   Raftery A, Hoeting J, Volinsky C, Painter I, Yeung KY (2021).BMA: Bayesian
#'   model averaging. R package version 3.18.15.
#' @export
bma_wrapper <- function(y, x, args = list()) {
    x <- stats::model.matrix(~., x)[, -1]  # full rank matrix with dummy variables
    s <- Sys.time()
    bma_out <- do.call(BMA::bic.glm, c(list(x = x, y = y), args))
    bma_time <- Sys.time() - s
    p <- ncol(x)  # Model-averaged coef, the first one is the intercept term
    # Posterior probability of being less than zero.
    bma_zero <- stats::pnorm(0, bma_out$postmean[2:(p + 1)], bma_out$postsd[2:(p +
        1)])
    bma_zero <- vapply(bma_zero, function(x) min(x, 1 - x), numeric(1))
    # Sum inclusion probability postprob of all models with a main effec Xi
    bma_pip <- bma_out$probne0
    names(bma_pip) <- names(bma_zero) <- colnames(x)
    return(list(beta = bma_zero, pip = bma_pip, time = bma_time))
}

#' Fits a Bayesian factor model with interactions
#' @param x A matrix of predictors
#' @param y A vector of outcome
#' @param args A list of arguments see R function
#'   `infinitefactor::interactionDL()` in 'infinitefactor' package.
#' @return A list of vectors whose values are between 0 and 1
#' \item{beta}{The smallest posterior probability of the coefficients being to
#' one side of zero for either main effect or interaction: min(Pr(beta >0),
#' Pr(beta<0)).}
#' \item{linear_beta}{The smaller of posterior probability of
#' the main effects being to one side of zero.}
#' \item{interact_beta}{Same as
#' linear_beta but for pair-wise interactions.}
#' \item{time}{elapsed time to fit the model.}
#' @section Reference:
#'
#'   Ferrari F, Dunson DB (2020). “Bayesian factor analysis for inference on
#'   interactions.”Journal of the American Statistical Association, 0(0), 1–12.
#' @export
fin_wrapper <- function(y, x, args = list(nrun = 2000)) {
    y <- matrix(y, length(y), 1)
    x <- stats::model.matrix(~. - 1, x)
    if (!is.null(args$z))
        warning("Inclusion of linear effects for covariates not implemented in package infinitefactor for FIN")
    s <- Sys.time()
    fin_out <- do.call(infinitefactor::interactionDL, c(list(y = y, X = x), args))
    fin_time <- Sys.time() - s
    # Posterior probability of effects being to one side of zero
    fin_beta <- apply(fin_out$mainEffectSamps > 0, 1, mean, na.rm = T)
    fin_beta <- vapply(fin_beta, function(x) min(x, 1 - x), numeric(1))
    fin_int <- apply(apply(fin_out$interactionSamps, 3, c) > 0, 1, mean, na.rm = T)
    fin_int <- vapply(fin_int, function(x) min(x, 1 - x), numeric(1))
    p <- ncol(x)
    names(fin_int) <- paste0(rep(1:p, each = p), rep(1:p, p))
    fin_int <- fin_int[c(paste0(1:p, 1:p), utils::combn(1:p, m = 2, FUN = paste0,
        collapse = ""))]
    # Significance of a variable: either main effect or interaction is
    # significant
    fin_zero <- rep(NA, p)
    for (i in 1:p) {
        id <- which(grepl(i, colnames(fin_int)))
        fin_zero[i] <- min(fin_beta[i], fin_int[id])
    }
    names(fin_zero) <- names(fin_beta) <- colnames(x)
    return(list(beta = fin_zero, linear_beta = fin_beta, interact_beta = fin_int,
        time = fin_time))
}

#' Fits a linear Quantile G-Computation model with no interactions
#' @param x A matrix of predictors
#' @param y A vector of outcome
#' @param args A list of arguments see R function `qgcomp::qgcomp.noboot()`.
#' @return A list
#' \item{pval}{The p-value of the combined effect, the
#' same for all predictors.}
#' \item{pos_weights}{Positive weights. See 'qgcomp' package.}
#' \item{neg_weights}{Negative weights. See 'qgcomp' package.}
#' \item{time}{elapsed time to fit the model.}
#' @section Reference:
#'
#'   Keil AP, Buckley JP, O’Brien KM, Ferguson KK, Zhao S, White AJ (2020). “A
#'   Quantile-based g-computation  approach  to  addressing  the  effects of
#'   exposure  mixtures.”Environmental Health Perspectives, 128(4).
#' @export
qgcomp_lin_wrapper <- function(y, x, args = list()) {
    dat <- data.frame(y = y, x = x)
    colnames(dat) <- c("y", colnames(x))
    s <- Sys.time()
    qc.fit <- do.call(qgcomp::qgcomp.noboot, c(list(y ~ ., data = dat), args))
    qgc_time <- Sys.time() - s
    p <- ncol(x)
    qgc_pval <- qc.fit$pval[2]  #rep(qc.fit$pval[2], p)
    names(qgc_pval) <- "overall"
    return(list(pval = qgc_pval, pos_weights = qc.fit$pos.weights, neg_weights = qc.fit$neg.weights,
        time = qgc_time))
}

#' Fits a generalized linear model
#' @param x A matrix of predictors
#' @param y A vector of outcome
#' @param args A list of arguments see R `glm` function.
#' @return A list
#' \item{pval}{The p-value of the linear main effect.}
#' \item{time}{elapsed time to fit the model.}
#' @export
glm_wrapper <- function(y, x, args = list()) {
    if (is.null(args[["formula"]])) {
        args[["formula"]] <- y ~ .
    }
    dat <- data.frame(y = y, x = x) %>%
        set_colnames(c("y", colnames(x)))
    s <- Sys.time()
    fit <- do.call(stats::glm, c(list(data = dat), args))
    time <- Sys.time() - s
    var_names <- colnames(fit$model)[-1]
    p <- length(var_names)
    all_pval <- stats::summary.glm(fit)$coef[, 4]
    all_int_pval <- all_pval[grepl("\\:", names(all_pval))]
    all_main_pval <- all_pval[!grepl("\\:", names(all_pval))]
    pval <- int_pval <- main_pval <- rep(NA, p)
    names(pval) <- names(int_pval) <- names(main_pval) <- var_names
    # Save the smallest p-value of all terms with the variable name
    for (var in var_names) {
        int_pval[var] <- min(all_int_pval[grepl(var, names(all_int_pval))])
        main_pval[var] <- min(all_main_pval[grepl(var, names(all_main_pval))])
        pval[var] <- min(int_pval[var], main_pval[var])
    }
    return(list(pval = pval, main_pval = main_pval, int_pval = int_pval, time = time))
}

#' Fits a Bayesian weighted sums
#' @param x A matrix of predictors
#' @param y A vector of outcome
#' @param args A list of arguments see R `bws::bws()`` function.
#' @return A list
#' \item{beta}{The smaller posterior probability of
#' the combined overall effect being to one side of zero: min(Pr(beta >0),
#' Pr(beta<0)). The same for all predictor.}
#' \item{weights}{The 95\% CI of the
#' contribution of each predictor to the overall effect. Between 0 and 1.}
#' \item{time}{elapsed time to fit the model.}
#' @section Reference:
#'
#'   Hamra GB, MacLehose RF, Croen L, Kauffman EM, Newschaffer C (2021).
#'   “Bayesian weighted sums: a flexible approach to estimate summed mixture
#'   effects.” International Journal of Environmental Research and Public
#'   Health, 18(4), 1373.
#' @export
bws_wrapper <- function(y, x, args = list(iter = 2000)) {
    message("Sampling bws")
    s <- Sys.time()
    fit <- do.call(bws::bws, c(list(y = y, X = x), args))
    bws_time <- Sys.time() - s
    samps <- rstan::extract(fit)
    beta_zero <- mean(samps$theta1 > 0)
    beta_zero <- min(beta_zero, 1 - beta_zero)
    # beta_zero <- rep(beta_zero, ncol(x))
    weights <- apply(samps$w, 2, mean)
    names(beta_zero) <- "overall"
    names(weights) <- colnames(x)
    return(list(beta = beta_zero, weights = weights, time = bws_time))
}
phuchonguyen/mpower documentation built on Oct. 2, 2022, 7:57 p.m.