Nothing
#' pathEnrich
#' @description This function takes the list generated in \code{\link{get_kegg}} as well as a vector
#' of NCBI (ENTREZ) geneIDs, and identifies significantly enriched KEGG pathways using
#' a Fisher's Exact Test. Unadjusted p-values as well as FDR corrected p-values are
#' calculated.
#'
#' @param gk_obj list. Object genrated from \code{get_kegg}, or a list containing the
#' output generated from a past \code{get_kegg} call. Names of the list must match those
#' defined in \code{get_kegg}. If the user wishes to use an older version of data
#' generated by \code{get_kegg}, they must first load that data and put it in a named
#' list that matches the names given in the list generated by \code{get_kegg}.
#' @param gene_list Vector. Vector of NCBI (ENTREZ) geneIDs.
#' @param N Numeric. The number of genes from the gene list that must be present in a KEGG pathway in order for that pathway to be retained and tested.
#' @param method Character. Character string telling \code{diffEnrich} which method to
#' use for multiple testing correction. Available methods are those provided by
#' \code{\link{p.adjust}}, and the default is "BH", or False Discovery Rate (FDR).
#' @param cutoff Numeric. The p-value threshold to be used as the cutoff when determining statistical significance, and used to filter list of significant pathways.
#'
#' @return A list object of class \code{pathEnrich} that contains 6 items:
#'
#' \describe{
#' \item{species}{The species used in enrichment}
#' \item{padj}{The method used to correct for multiple testing}
#' \item{sig_paths}{The KEGG pathways the reached statistical significance after multiple testing correction.}
#' \item{cutoff}{The p-value threshold to be used as the cutoff when determining statistical significance, and used to filter final results data set.}
#' \item{N}{The number of genes from the gene list that must be present in a KEGG pathway in order for that pathway to be retained and tested.}
#' \item{enrich_table}{A data frame that summarizes the results of the pathway analysis and contains the following variables:}
#' }
#'
#' \describe{
#' \item{KEGG_PATHWAY_ID}{KEGG Pathway Identifier}
#' \item{KEGG_PATHWAY_description}{Description of KEGG Pathway (provided by KEGG)}
#' \item{KEGG_PATHWAY_cnt}{Number of Genes in KEGG Pathway}
#' \item{KEGG_PATHWAY_in_list}{Number of Genes from gene list in KEGG Pathway}
#' \item{KEGG_DATABASE_cnt}{Number of Genes in KEGG Database}
#' \item{KEGG_DATABASE_in_list}{Number of Genes from gene list in KEGG Database}
#' \item{expected}{Expected number of genes from list to be in KEGG pathway by chance (i.e., not enriched)}
#' \item{enrich_p}{P-value for enrichment of list genes related to KEGG pathway}
#' \item{p_adj}{False Discovery Rate (Benjamini and Hochberg) to account for multiple testing across KEGG pathways}
#' \item{fold_enrichment}{KEGG_PATHWAY_in_list/expected}
#' }
#'
#' @details This function may not always use the complete list of genes provided by the user.
#' Specifically, it will only use the genes from the list provided that are also in
#' the most current species list pulled from the KEGG REST API, or from the older data KEGG
#' loaded by the user. The 'cutoff' only filters the list of pathways provided in the 'sig_paths'
#' list item. It is not used to filter the 'enrich_table' list object. S3 generic functions for \code{print} and \code{summary} are
#' provided. The \code{print} function prints the results table as a \code{tibble}, and the
#' \code{summary} function returns the number of pathways that reached statistical significance,
#' as well as their descriptions, the number of genes used from the KEGG data base, the KEGG species, and the
#' method used for multiple testing correction, and the p-value cutoff required for reaching statistical significance.
#'
#' @export
#' @importFrom stats fisher.test
#' @importFrom stats p.adjust
#' @importFrom rlang .data
#' @import dplyr
#' @import utils
#' @examples
#'
#' list1_pe <- pathEnrich(gk_obj = kegg, gene_list = geneLists$list1)
#' \dontrun{
#' list2_pe <- pathEnrich(gk_obj = kegg, gene_list = geneLists$list2, method = 'none', N = 4)
#' }
#'
pathEnrich <- function(gk_obj, gene_list, method = 'BH', cutoff = 0.05, N = 2){
## argument check
if(missing(gk_obj)){stop("Argument missing: gk_obj")}
if(missing(gene_list)){stop("Argument missing: gene_list. Please provide list of ncbi geneIDs")}
if(N < 0 | is.character(N)){stop("N must be a positive number of type float or integer.")}
## Prepare gene list
#gene_list <- gene_list[!is.na(gene_list)]
gene_list <- as.integer(gene_list)
N <- as.numeric(N)
# define p.adjust method
p.adj <- method
## prepare kegg data
ncbi_to_kegg <- gk_obj[["ncbi_to_kegg"]]
colnames(ncbi_to_kegg) <- c("ncbi_id", "gene")
ncbi_to_kegg$Entry <- gsub("ncbi-geneid:","",fixed = T, ncbi_to_kegg$ncbi_id)
kegg_to_pathway <- gk_obj[["kegg_to_pathway"]]
colnames(kegg_to_pathway) <- c("gene", "pathway")
ncbi_to_pathway <- merge(ncbi_to_kegg, kegg_to_pathway, by = "gene")
ncbi_to_pathway <- ncbi_to_pathway %>%
dplyr::select(.data$Entry, .data$pathway) %>%
dplyr::filter(!duplicated(ncbi_to_pathway))
pathway_to_species <- gk_obj[["pathway_to_species"]]
## set up enrichment
all_KEGG <- unique(ncbi_to_pathway$Entry)
sig_KEGG <- unique(gene_list)
all_KEGG_cnt <- ncbi_to_pathway %>%
dplyr::filter(.data$Entry %in% all_KEGG) %>%
dplyr::group_by(.data$pathway) %>%
dplyr::summarize(KEGG_cnt = length(.data$Entry))
sig_KEGG_cnt <- ncbi_to_pathway %>%
dplyr::filter(.data$Entry %in% sig_KEGG) %>%
dplyr::group_by(.data$pathway) %>%
dplyr::summarize(KEGG_in_list = length(.data$Entry))
## Set up enrichment table
enrich_table <- merge(all_KEGG_cnt, sig_KEGG_cnt, by = "pathway", all = TRUE)
enrich_table$KEGG_in_list[is.na(enrich_table$KEGG_in_list)] = 0
enrich_table <- enrich_table %>%
dplyr::filter(.data$KEGG_in_list >= N) %>%
dplyr::mutate(numTested = length(all_KEGG),
numSig = length(all_KEGG[which(all_KEGG %in% sig_KEGG)]),
expected = (.data$numSig/.data$numTested)*.data$KEGG_cnt)
enrich_table <- merge(pathway_to_species, enrich_table, by.x = "V1", by.y = "pathway")
## Perform Fisher's test
enrich_table$enrich_p <- NA
for(i in 1:nrow(enrich_table)){
a = enrich_table[i, "KEGG_in_list"]
b = enrich_table[i, "KEGG_cnt"] - enrich_table[i, "KEGG_in_list"]
c = enrich_table[i, "numSig"] - enrich_table[i, "KEGG_in_list"]
d = enrich_table[i, "numTested"] - a - b - c
enrich_table$enrich_p[i] = stats::fisher.test(matrix(c(a,b,c,d), nrow = 2), alternative = "greater")$p.value
}
## clean and return
enrich_table <- enrich_table[order(enrich_table$enrich_p), ]
enrich_table$fdr <- stats::p.adjust(enrich_table$enrich_p, method = method)
enrich_table$V1 <- gsub("path:", "", enrich_table$V1, fixed = TRUE)
## Calulate fold enrichment column
enrich_table$fold_enrichment <- enrich_table$KEGG_in_list/enrich_table$expected
## Add menaingful column names for the C1 and C2
colnames(enrich_table) <- c("KEGG_PATHWAY_ID", "KEGG_PATHWAY_description", "KEGG_PATHWAY_cnt", "KEGG_PATHWAY_in_list",
"KEGG_DATABASE_cnt", "KEG_DATABASE_in_list", "expected", "enrich_p",
"p_adj", "fold_enrichment")
## define species
species <- unlist(strsplit(enrich_table$KEGG_PATHWAY_description[1], split = " - ", fixed = TRUE))[2]
## Extract sicgnificant pathways
sig_paths <- enrich_table %>%
dplyr::filter(.data$p_adj < cutoff) %>%
pull(.data$KEGG_PATHWAY_description)
sig_paths <- unlist(lapply(strsplit(sig_paths, split = " - ", fixed = TRUE), function(x) x[1]))
## build results list
res <- list("species" = species,
"padj" = p.adj,
"cutoff" = cutoff,
"N" = N,
"enrich_table" = enrich_table,
"sig_pathways" = sig_paths)
## define class attr
class(res) <- c("pathEnrich")
return(res)
}
#' @name print.pathEnrich
#' @rdname pathEnrich
#' @method print pathEnrich
#' @param x object of class \code{pathEnrich}
#' @param \dots Unused
#' @export
print.pathEnrich <- function(x, ...){
dplyr::as_tibble(x$enrich_table)
}
#' @name summary.pathEnrich
#' @rdname pathEnrich
#' @method summary pathEnrich
#' @param object object of class \code{pathEnrich}
#' @export
summary.pathEnrich <- function(object, ...){
## summary part 1
l1 <- paste0(dim(object$enrich_table)[1], ' KEGG pathways were tested. \n')
l2 <- paste0('Only KEGG pathways that contained at least ', object$N, ' genes from gene_list were tested. \n')
l3 <- paste0("KEGG pathway species: ", object$species, "\n")
l4 <- paste0(object$enrich_table$KEGG_DATABASE_cnt[1], ' genes from gene_list were in the KEGG data pull. \n')
l5 <- paste0("p-value adjustment method: ", object$padj, "\n")
l6 <- paste0(length(object$sig_pathways), " pathways reached statistical significance after multiple testing correction at a cutoff of ", object$cutoff, ". \n")
cat(
l1, l2, l3, l4, l5, l6, "\n")
## summary part 2
paths <- paste(object$sig_pathways, collapse = "\n")
cat("Significant pathways: \n", paths)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.