R/eif_moderated.R
In biotmle: Targeted Learning with Moderated Statistics for Biomarker Discovery

Documented in modtest_ic

#' Moderated Statistical Tests for Influence Functions
#'
#' Performs variance shrinkage via application of an empirical Bayes procedure
#' (of LIMMA) on the observed data after a transformation moving the data to
#' influence function space, based on the average treatment effect parameter.
#'
#' @param biotmle \code{biotmle} object as generated by \code{biomarkertmle}
#' @param adjust the multiple testing correction to be applied to p-values that
#'  are generated from the moderated tests. The recommended (default) method
#'  is that of Benjamini and Hochberg. See \link[limma]{topTable} for a list of
#'  appropriate methods.
#' @param pval_type The reference distribution to be used for computing the
#'  p-value. Those based on the normal approximation tend to provide misleading
#'  inference when working with moderately sized (finite) samples. Use of the
#'  logistic distribution has been found to empirically improve performance in
#'  settings where multiple hypothesis testing is a concern.
#' @param ... Other arguments to be passed directly to \code{limma::topTable}.
#'
#' @importFrom methods is
#' @importFrom dplyr "%>%"
#' @importFrom stats plogis p.adjust
#' @importFrom limma lmFit eBayes topTable
#' @importFrom tibble as_tibble rownames_to_column
#' @importFrom assertthat assert_that
#'
#' @return \code{biotmle} object containing output from \code{limma::lmFit} and
#'  \code{limma::topTable}
#'
#' @export modtest_ic
#'
#' @examples
#' library(dplyr)
#' library(biotmleData)
#' library(SuperLearner)
#' library(SummarizedExperiment)
#' data(illuminaData)
#'
#' colData(illuminaData) <- colData(illuminaData) %>%
#'   data.frame() %>%
#'   dplyr::mutate(age = as.numeric(age > median(age))) %>%
#'   DataFrame()
#' benz_idx <- which(names(colData(illuminaData)) %in% "benzene")
#'
#' biomarkerTMLEout <- biomarkertmle(
#'   se = illuminaData[1:2, ],
#'   varInt = benz_idx,
#'   parallel = FALSE,
#'   g_lib = c("SL.mean", "SL.glm"),
#'   Q_lib = c("SL.bayesglm", "SL.glm")
#' )
#'
#' limmaTMLEout <- modtest_ic(biotmle = biomarkerTMLEout)
modtest_ic <- function(biotmle,
                       adjust = "BH",
                       pval_type = c("normal", "logistic"),
                       ...) {
  # check for input type and set argument defaults
  assertthat::assert_that(is(biotmle, "bioTMLE"))
  pval_type <- match.arg(pval_type)

  # extract influence function estimates and number of observations
  biomarkertmle_out <- as.matrix(biotmle@tmleOut)
  n_obs <- nrow(colData(biotmle))

  # build design matrix for forcing limma to only perform shrinkage
  fit <- limma::lmFit(
    object = biomarkertmle_out,
    design = rep(1, n_obs)
  )
  fit <- limma::eBayes(fit = fit)

  # compute inference
  tt <- limma::topTable(
    fit = fit,
    coef = 1,
    number = Inf,
    adjust.method = adjust,
    sort.by = "none",
    ...
  ) %>%
    tibble::rownames_to_column(var = "ID") %>%
    tibble::as_tibble() %>%
    mutate(
      # remove log-fold change and overwrite average expression with ATE
      logFC = NULL,
      AveExpr = biotmle@ateOut,
      var_bayes = fit$s2.post / n_obs,
    )

  # use logistic distribution as reference for p-values by default
  if (pval_type == "logistic") {
    p_val <- 2 * stats::plogis(-abs(tt$t), location = 0, scale = sqrt(3) / pi)
    p_val_adj <- stats::p.adjust(p_val, method = adjust)
    tt$P.Value <- p_val
    tt$adj.P.Val <- p_val_adj
  }

  # store in slot of output object
  biotmle@topTable <- tt
  return(biotmle)
}