R/test_genesets.R

Defines functions test_genesets

Documented in test_genesets

#' Perform geneset enrichment testing using any supported method
#'
#' @details
#' After application of the enrichment testing algorithm (e.g. GOAT, ORA or GSEA), multiple testing correction is applied to obtain adjusted p-values using `padjust_genesets`.
#' That function will first apply the specified pvalue adjustment procedure in the `padj_method` parameter within each 'source' in the genesets table. Second, it applies Bonferroni adjustment to all p-values according to the number of different geneset sources that were tested (or set `padj_sources = FALSE` to disable).
#'
#' For example, if the input is a genesets table that contains GO_CC, GO_BP and GO_MF genesets, first multiple testing correction is applied within each source (e.g. using FDR if so desired) and afterwards a Bonferroni correction is applied based on 3 repeated analyses.
#'
#' Note that this is more rigorous than typical GO tools; hypothetically, one could split all GO_CC pathways into 1000 different databases/'sources' and then run enrichment testing. Consequently, the multiple testing burden is reduced if one doesn't adjust p-values for the number of 'sources' as we do here.
#'
#' @examples \donttest{
#'#' # note; this example downloads data when first run, and typically takes ~60seconds
#'
#' ## Basic example for a complete GOAT workflow
#' # Downloads test data to your computer and stores it at current working directory
#' # Refer to the GitHub documentation for elaborate documentation and a worked example
#'
#' # store the downloaded files in the following directory. Here, the temporary file
#' # directory is used. Alternatively, consider storing this data in a more permanent location.
#' # e.g. output_dir="~/data/go" on unix systems or output_dir="C:/data/go" on Windows
#' output_dir = tempdir()
#'
#' # download an example gene list
#' datasets = download_goat_manuscript_data(output_dir)
#' genelist = datasets$`Wingo 2020:mass-spec:PMID32424284`
#'
#' # download GO genesets
#' genesets_asis = download_genesets_goatrepo(output_dir)
#'
#' # filter genesets for sufficient overlap with the genelist, then apply GOAT
#' genesets_filtered = filter_genesets(genesets_asis, genelist)
#' result = test_genesets(genesets_filtered, genelist, method = "goat",
#'   score_type = "effectsize", padj_method = "bonferroni", padj_cutoff = 0.05)
#'
#' # print first 10 rows of the result table
#' print(result |> select(source, name, ngenes, pvalue_adjust) |> utils::head(n=10))
#' }
#' @param genesets tibble with genesets, must contain columns 'source', 'source_version', 'id', 'name', 'genes', 'ngenes', 'ngenes_signif'
#' @param genelist tibble with genes, must contain column 'gene' and 'test'. gene = character column, which are matched against list column 'genes' in genesets tibble. test = boolean column (you can set all to FALSE if not performing Fisher-exact or hypergeometric test downstream)
#' @param method method for overrepresentation analysis. Options: "goat", "hypergeometric", "fisherexact", "fisherexact_ease", "gsea", "idea"
#' @param padj_method first step of multiple testing correction; method for p-value adjustment, passed to `stats::p.adjust()` via `padjust_genesets()`, e.g. set "BH" to compute FDR adjusted p-values (default) or "bonferroni" for a more stringent procedure
#' @param padj_sources second step of multiple testing correction; apply Bonferroni adjustment to all p-values according to the number of geneset sources that were tested. Boolean parameter, set TRUE to enable (default) or FALSE to disable
#' @param padj_cutoff cutoff for adjusted p-value, `signif` column is set to TRUE for all values lesser-equals
#' @param padj_min_signifgenes if a value larger than zero is provided, this will perform additional post-hoc filtering; after p-value adjustment, set the `pvalue_adjust` to `NA` and `signif` to `FALSE` for all genesets with fewer than `padj_min_signifgenes` 'input genes that were significant' (`ngenes_signif` column in genesets table).
#' So this does not affect the accuracy of estimated p-values, in contrast to prefiltering genesets prior to p-value computation or adjusting p-values
#' @param ... further parameters are passed to the respective stats method
#' @return the input `genesets`, with results stored in columns 'pvalue', 'pvalue_adjust' and 'signif'
#' @export
test_genesets = function(genesets, genelist, method, padj_method = "BH", padj_sources = TRUE, padj_cutoff = 0.01, padj_min_signifgenes = 0L, ...) {
  algorithm = ngenes_signif = pvalue_adjust = pvalue = NULL # fix invisible bindings R package NOTE
  genesets = validate_genesets(genesets)
  genelist = validate_genelist(genelist)

  stopifnot("method parameter should be a single string, specifying one of the supported functions (see function documentation)" = length(method) == 1 && is.character(method))
  stopifnot("padj_method parameter should be a valid option as listed in `stats::p.adjust.methods`" = length(padj_method) == 1 && is.character(padj_method) && padj_method %in% stats::p.adjust.methods)
  stopifnot("padj_sources parameter should be a single boolean value" = length(padj_sources) == 1 && padj_sources %in% c(TRUE, FALSE))
  stopifnot("padj_cutoff parameter should be a single positive numeric value" = length(padj_cutoff) == 1 && is.numeric(padj_cutoff) && is.finite(padj_cutoff) && padj_cutoff >= 0)
  stopifnot("padj_min_signifgenes parameter should be a single positive integer (0 or higher)" = length(padj_min_signifgenes) == 1 && is.numeric(padj_min_signifgenes) && is.finite(padj_min_signifgenes) && padj_min_signifgenes >= 0)
  padj_min_signifgenes = as.integer(padj_min_signifgenes) # if supplied as 'numeric', round down and store as integer type
  result = NULL

  if(method == "goat" || method == "goat_precomputed") {
    result = test_genesets_goat_precomputed(genesets, genelist, ...)
  }
  if(method == "goat_fitfunction") {
    result = test_genesets_goat_fitfunction(genesets, genelist, ...)
  }
  if(method == "goat_bootstrap") {
    result = test_genesets_goat_bootstrap(genesets, genelist, ...)
  }
  if(method == "fisherexact") {
    result = test_genesets_fisherexact(genesets, genelist, ...)
  }
  if(method == "fisherexact_ease") {
    result = test_genesets_fisherexact(genesets, genelist, use_ease = TRUE, ...)
  }
  if(method == "hypergeometric") {
    result = test_genesets_hypergeometric(genesets, genelist, ...)
  }
  if(method == "gsea") {
    result = test_genesets_gsea(genesets, genelist, ...)
  }

  # fallthrough; guess if the user provided some custom function
  if(is.null(result)) {
    f = tryCatch(match.fun(method, descend = FALSE), error = function(...) NULL)
    if(!is.function(f)) {
      if(grepl("::", method, fixed = TRUE)) {
        f = tryCatch(utils::getFromNamespace(gsub(".*::", "", method), gsub("::.*", "", method), envir = .GlobalEnv), error = function(...) NULL)
      } else {
        f = tryCatch(utils::getFromNamespace(algorithm, "goat", envir = .GlobalEnv), error = function(...) NULL)
      }
    }
    if(is.function(f)) {
      result = f(genesets = genesets, genelist = genelist, ...)
      stopifnot("custom geneset-test-function should return the input 'genesets' table with an additional 'pvalue' numeric column included" =
                  is.data.frame(result) && "pvalue" %in% colnames(result) && is.numeric(result$pvalue))
    }
  }


  # fallthrough check for valid method parameter
  stopifnot("unknown 'method' parameter" = !is.null(result))


  # finally, adjust p-values and flag significant
  result = result |>
    padjust_genesets(method = padj_method, cutoff = padj_cutoff, correct_sources = padj_sources) |>
    mutate(
      signif = signif & ngenes_signif >= padj_min_signifgenes,
      pvalue_adjust = ifelse(ngenes_signif < padj_min_signifgenes, NA, pvalue_adjust)
    ) |>
    score_geneset_oddsratio(genelist = genelist) |>
    arrange(pvalue)

  # settings as string
  settings... = parameters_prettyprint_length1(...)
  settings = sprintf("test_genesets(method='%s', padj_method='%s', padj_sources=%s, padj_cutoff=%s, padj_min_signifgenes=%s%s)",
                     method, padj_method, padj_sources, padj_cutoff, padj_min_signifgenes, ifelse(settings... == "", "", paste0(", ", settings...)))
  attr(result, "settings") <- c(attr(genesets, "settings"), settings)
  return(result)
}

Try the goat package in your browser

Any scripts or data that you put into this service are public.

goat documentation built on April 3, 2025, 6:05 p.m.