R/pathEnrich.R

Defines functions summary.pathEnrich print.pathEnrich pathEnrich

Documented in pathEnrich print.pathEnrich summary.pathEnrich

#' pathEnrich
#' @description This function takes the list generated in \code{\link{get_kegg}} as well as a vector
#' of NCBI (ENTREZ) geneIDs, and identifies significantly enriched KEGG pathways using
#' a Fisher's Exact Test. Unadjusted p-values as well as FDR corrected p-values are
#' calculated.
#'
#' @param gk_obj list. Object genrated from \code{get_kegg}, or a list containing the
#' output generated from a past \code{get_kegg} call. Names of the list must match those
#' defined in \code{get_kegg}. If the user wishes to use an older version of data
#' generated by \code{get_kegg}, they must first load that data and put it in a named
#' list that matches the names given in the list generated by \code{get_kegg}.
#' @param gene_list Vector. Vector of NCBI (ENTREZ) geneIDs.
#' @param N Numeric. The number of genes from the gene list that must be present in a KEGG pathway in order for that pathway to be retained and tested.
#' @param method Character. Character string telling \code{diffEnrich} which method to
#' use for multiple testing correction. Available methods are those provided by
#' \code{\link{p.adjust}}, and the default is "BH", or False Discovery Rate (FDR).
#' @param cutoff Numeric. The p-value threshold to be used as the cutoff when determining statistical significance, and used to filter list of significant pathways.
#'
#' @return A list object of class \code{pathEnrich} that contains 6 items:
#'
#' \describe{
#' \item{species}{The species used in enrichment}
#' \item{padj}{The method used to correct for multiple testing}
#' \item{sig_paths}{The KEGG pathways the reached statistical significance after multiple testing correction.}
#' \item{cutoff}{The p-value threshold to be used as the cutoff when determining statistical significance, and used to filter final results data set.}
#' \item{N}{The number of genes from the gene list that must be present in a KEGG pathway in order for that pathway to be retained and tested.}
#' \item{enrich_table}{A data frame that summarizes the results of the pathway analysis and contains the following variables:}
#' }
#'
#' \describe{
#'   \item{KEGG_PATHWAY_ID}{KEGG Pathway Identifier}
#'   \item{KEGG_PATHWAY_description}{Description of KEGG Pathway (provided by KEGG)}
#'   \item{KEGG_PATHWAY_cnt}{Number of Genes in KEGG Pathway}
#'   \item{KEGG_PATHWAY_in_list}{Number of Genes from gene list in KEGG Pathway}
#'   \item{KEGG_DATABASE_cnt}{Number of Genes in KEGG Database}
#'   \item{KEGG_DATABASE_in_list}{Number of Genes from gene list in KEGG Database}
#'   \item{expected}{Expected number of genes from list to be in KEGG pathway by chance (i.e., not enriched)}
#'   \item{enrich_p}{P-value for enrichment of list genes related to KEGG pathway}
#'   \item{p_adj}{False Discovery Rate (Benjamini and Hochberg) to account for multiple testing across KEGG pathways}
#'   \item{fold_enrichment}{KEGG_PATHWAY_in_list/expected}
#' }
#'
#' @details This function may not always use the complete list of genes provided by the user.
#' Specifically, it will only use the genes from the list provided that are also in
#' the most current species list pulled from the KEGG REST API, or from the older data KEGG
#' loaded by the user. The 'cutoff' only filters the list of pathways provided in the 'sig_paths'
#' list item. It is not used to filter the 'enrich_table' list object. S3 generic functions for \code{print} and \code{summary} are
#' provided. The \code{print} function prints the results table as a \code{tibble}, and the
#' \code{summary} function returns the number of pathways that reached statistical significance,
#' as well as their descriptions, the number of genes used from the KEGG data base, the KEGG species, and the
#' method used for multiple testing correction, and the p-value cutoff required for reaching statistical significance.
#'
#' @export
#' @importFrom  stats fisher.test
#' @importFrom  stats p.adjust
#' @importFrom rlang .data
#' @import dplyr
#' @import utils
#' @examples
#'
#' list1_pe <- pathEnrich(gk_obj = kegg, gene_list = geneLists$list1)
#' \dontrun{
#' list2_pe <- pathEnrich(gk_obj = kegg, gene_list = geneLists$list2, method = 'none', N = 4)
#' }
#'
pathEnrich <- function(gk_obj, gene_list, method = 'BH', cutoff = 0.05, N = 2){
  ## argument check
  if(missing(gk_obj)){stop("Argument missing: gk_obj")}
  if(missing(gene_list)){stop("Argument missing: gene_list. Please provide list of ncbi geneIDs")}
  if(N < 0 | is.character(N)){stop("N must be a positive number of type float or integer.")}

  ## Prepare gene list
  #gene_list <- gene_list[!is.na(gene_list)]
  gene_list <- as.integer(gene_list)
  N <- as.numeric(N)
  # define p.adjust method
  p.adj <- method

  ## prepare kegg data
  ncbi_to_kegg <- gk_obj[["ncbi_to_kegg"]]
  colnames(ncbi_to_kegg) <- c("ncbi_id", "gene")
  ncbi_to_kegg$Entry <- gsub("ncbi-geneid:","",fixed = T, ncbi_to_kegg$ncbi_id)
  kegg_to_pathway <- gk_obj[["kegg_to_pathway"]]
  colnames(kegg_to_pathway) <- c("gene", "pathway")
  ncbi_to_pathway <- merge(ncbi_to_kegg, kegg_to_pathway, by = "gene")
  ncbi_to_pathway <- ncbi_to_pathway %>%
    dplyr::select(.data$Entry, .data$pathway) %>%
    dplyr::filter(!duplicated(ncbi_to_pathway))
  pathway_to_species <- gk_obj[["pathway_to_species"]]

  ## set up enrichment
  all_KEGG <- unique(ncbi_to_pathway$Entry)
  sig_KEGG <- unique(gene_list)
  all_KEGG_cnt <- ncbi_to_pathway %>%
    dplyr::filter(.data$Entry %in% all_KEGG) %>%
    dplyr::group_by(.data$pathway) %>%
    dplyr::summarize(KEGG_cnt = length(.data$Entry))
  sig_KEGG_cnt <- ncbi_to_pathway %>%
    dplyr::filter(.data$Entry %in% sig_KEGG) %>%
    dplyr::group_by(.data$pathway) %>%
    dplyr::summarize(KEGG_in_list = length(.data$Entry))

  ## Set up enrichment table
  enrich_table <- merge(all_KEGG_cnt, sig_KEGG_cnt, by = "pathway", all = TRUE)
  enrich_table$KEGG_in_list[is.na(enrich_table$KEGG_in_list)] = 0
  enrich_table <- enrich_table %>%
    dplyr::filter(.data$KEGG_in_list >= N) %>%
    dplyr::mutate(numTested = length(all_KEGG),
                  numSig = length(all_KEGG[which(all_KEGG %in% sig_KEGG)]),
                  expected = (.data$numSig/.data$numTested)*.data$KEGG_cnt)
  enrich_table <- merge(pathway_to_species, enrich_table, by.x = "V1", by.y = "pathway")

  ## Perform Fisher's test
  enrich_table$enrich_p <- NA
  for(i in 1:nrow(enrich_table)){
    a = enrich_table[i, "KEGG_in_list"]
    b = enrich_table[i, "KEGG_cnt"] - enrich_table[i, "KEGG_in_list"]
    c = enrich_table[i, "numSig"] - enrich_table[i, "KEGG_in_list"]
    d = enrich_table[i, "numTested"] - a - b - c
    enrich_table$enrich_p[i] = stats::fisher.test(matrix(c(a,b,c,d), nrow = 2), alternative = "greater")$p.value
  }

  ## clean and return
  enrich_table <- enrich_table[order(enrich_table$enrich_p), ]
  enrich_table$fdr <- stats::p.adjust(enrich_table$enrich_p, method = method)
  enrich_table$V1 <- gsub("path:", "", enrich_table$V1, fixed = TRUE)

  ## Calulate fold enrichment column
  enrich_table$fold_enrichment <- enrich_table$KEGG_in_list/enrich_table$expected
  ## Add menaingful column names for the C1 and C2
  colnames(enrich_table) <- c("KEGG_PATHWAY_ID", "KEGG_PATHWAY_description", "KEGG_PATHWAY_cnt", "KEGG_PATHWAY_in_list",
                                      "KEGG_DATABASE_cnt", "KEG_DATABASE_in_list", "expected", "enrich_p",
                                      "p_adj", "fold_enrichment")
  ## define species
  species <- unlist(strsplit(enrich_table$KEGG_PATHWAY_description[1], split = " - ", fixed = TRUE))[2]
  ## Extract sicgnificant pathways
  sig_paths <- enrich_table %>%
    dplyr::filter(.data$p_adj < cutoff) %>%
    pull(.data$KEGG_PATHWAY_description)
  sig_paths <- unlist(lapply(strsplit(sig_paths, split = " - ", fixed = TRUE), function(x) x[1]))
  ## build results list
  res <- list("species" = species,
    "padj" = p.adj,
    "cutoff" = cutoff,
    "N" = N,
    "enrich_table" = enrich_table,
    "sig_pathways" = sig_paths)
  ## define class attr
  class(res) <- c("pathEnrich")
  return(res)
}

#' @name print.pathEnrich
#' @rdname pathEnrich
#' @method print pathEnrich
#' @param x object of class \code{pathEnrich}
#' @param \dots Unused
#' @export
print.pathEnrich <- function(x, ...){
  dplyr::as_tibble(x$enrich_table)
}

#' @name summary.pathEnrich
#' @rdname pathEnrich
#' @method summary pathEnrich
#' @param object object of class \code{pathEnrich}
#' @export
summary.pathEnrich <- function(object, ...){
  ## summary part 1
  l1 <- paste0(dim(object$enrich_table)[1], ' KEGG pathways were tested. \n')
  l2 <- paste0('Only KEGG pathways that contained at least ', object$N, ' genes from gene_list were tested. \n')
  l3 <- paste0("KEGG pathway species: ", object$species, "\n")
  l4 <- paste0(object$enrich_table$KEGG_DATABASE_cnt[1], ' genes from gene_list were in the KEGG data pull. \n')
  l5 <- paste0("p-value adjustment method: ", object$padj, "\n")
  l6 <- paste0(length(object$sig_pathways), " pathways reached statistical significance after multiple testing correction at a cutoff of ", object$cutoff, ". \n")
  cat(
    l1, l2, l3, l4, l5, l6, "\n")
  ## summary part 2
  paths <- paste(object$sig_pathways, collapse = "\n")
  cat("Significant pathways: \n", paths)
}

Try the diffEnrich package in your browser

Any scripts or data that you put into this service are public.

diffEnrich documentation built on June 28, 2022, 1:08 a.m.