#' Wrapper for gene enrichment analysis using clusterProfiler
#'
#' Applying over-representation analysis and gene set enrichment analysis to supplied gene list
#'
#' This function uses genelist and optionally background gene list as input to perform enrichment analysis
#' using the \code{clusterProfiler} and \code{DOSE} packages. By default, the function performs two kinds
#' of analysis for all categories given in \code{enrichmentCat}.
#' \itemize{
#' \item over-representation analysis: \code{DOSE} implements hypergeometric model to assess whether the
#' number of selected genes annotated with a term or pathway is larger than expected by chance.
#' For this approach, usually a cut off has been applied to select genes of interest e.g. from differential
#' expression analysis. All selected genes are treated with equal priority.
#' \item gene set enrichment analysis (GSEA): for GSEA all genes under investigation can be used as input.
#' They are ranked based on a corresponding quantitative value given in \code{sortcolumn}. Given a priori
#' defined set of genes S (e.g., genes sharing the same GO term), the goal of GSEA is to determine whether the
#' members of S are randomly distributed throughout the ranked gene list (L) or primarily found at the top or bottom.
#' This approach can handle a situation where the extent of differential expression is small, but evidenced
#' in coordinated way in a set of related genes. GSEA aggregates the per gene statistics across genes within
#' a gene set, therefore making it possible to detect situations where all genes in a predefined set change
#' in a small but coordinated way. If no quantitative data is provided (\code{sortcolumn = NULL}), GSEA is skipped.
#' }
#' The names/IDs are converted to ENTREZ IDs (if necessary) prior to enrichment using the annotation package
#' for the species denoted in \code{org}. A background list with genes under investigation (e.g. expression array content)
#' can be provided in \code{backgroundlist} to be used as background for over-representation analysis.
#' Alternatively, all genes of the designated organism can be obtained from the respective
#' annotation package (For GSEA, all genes of the designated organism are used anyway).
#' Optionally, quantitative data in \code{sortcolumn} can be used for sorting and filtering (using \code{sortcolumn.threshold}
#' or \code{maxInputGenes}) the input gene list for over-representation analysis, if not filtered prior to that.
#' The function generation visualisations for each enrichment result like cnetplots enrichment-maps and dotplots.
#' Enriched KEGG pathway maps (if any) are annotated for input genes using the \code{pathview} package.
#' If quantitative data in \code{sortcolumn} are not fold changes, fold changes may be given additionally in \code{FCcolumn}
#' for annotation purposes in cnetplots as well as for KEGG pathway mapping, otherwise the data in \code{sortcolumn}
#' is used for this. Be aware that the legend of the cnetplots will be "Fold Change" anyway.
#'
#'
#' @param genes the input data object can be given in several formats.
#' \itemize{
#' \item vector with gene names/IDs: all genes are used for over-representation analysis. The gene names/IDs
#' must be in the format given in \code{id.type}.
#' \item dataframe with gene names/IDs and quantitative values: genes can be filtered for the quantitative value
#' prior to over-representation analysis via the unfiltered list can be used as background list.
#' Additionally to over-representation analysis, a GSEA is perfomed using the full gene list ranked for the
#' quantitative value. Columns of the dataframe must be named according to parameters \code{id.column} and \code{sortcolumn}.
#' \item character with path to dataframe: as above. The dataframe is loaded from the given path.
#' \item list of items: Each item is processed separately as indicated above.
#' }
#' @param newheader optional character vector with new header information for \code{genes} dataframe. Only relevant
#' if 'genes' is a dataframe (or character string with filepath to a table) with wrong or missing header.
#' NULL otherwise.
#' @param backgroundlist the background list of gene names/IDs for enrichment analysis can be given in several formats.
#' \itemize{
#' \item vector with gene names/IDs: The gene names/IDs must be in the format given in \code{id.type}.
#' \item dataframe with gene names/IDs: an \code{id.column} is needed as in \code{genes} with type of IDs given in \code{id.type}.
#' \item "genome": all ENTREZ IDs from the annotation package of the respective organism (denoted in \code{org}) are used as background.
#' \item NULL: full name/ID list from \code{genes} is used as background (i.e. that \code{genes} need to
#' contain all genes under investigation without pre-filtering).
#' }
#' @param newheaderBackground optional character vector with new header information for \code{backgroundlist}.
#' @param projectfolder character with directory for output files (will be generated if not existing).
#' @param projectname optional character prefix for output file names.
#' @param analysis_type character vector giving the type of analysis to be performed. Eiter "over-representation",
#' "GSEA" or both.
#' @param enrichmentCat character vector with categories to be enriched (\code{GO}: gene ontology (MF, BP, CC),
#' \code{KEGG}: KEGG pathways, \code{Reactome}: Reactome pathways, \code{DO}: Disease ontology).
#' Disease ontology is for human only.
#' @param maxInputGenes (numeric) max number of top diff regulated elements used for over-representation analysis (or NULL).
#' @param id.type character with identifier type from annotation package (\code{"ENTREZID"} or \code{"SYMBOL"})
#' Gene symbols will be converted to EntrezIDs prior to enrichment analysis.
#' @param id.column character with column name for identifier variable in \code{genes}.
#' @param sortcolumn character with column name of quantitative data in \code{genes} used for ordering.
#' If \code{Null}, ranking of genes is omitted and GSEA is not possible.
#' @param highValueHighPriority (logical) priority order of values in \code{sortcolumn}.
#' \code{TRUE}: high values have highest priority (e.g. fold changes).
#' \code{FALSE}: low values have highest priority (e.g. p-values);
#' If \code{FALSE}, values in \code{sortcolumn} should be transformed prior to GSEA (see \code{fun.transform}).
#' @param sortcolumn.threshold numeric threshold for \code{sortcolumn} to be included in over-representation analysis.
#' \code{If highValueHighPriority=F, value < sortcolumn.threshold
#' else value > sortcolumn.threshold}
#' @param fun.transform GSEA needs an input gene list with priority in decreasing order (high values have highest priority).
#' Since quatitative values given in \code{sortcolumn} may have priority in increasing order (e.g. p-values), these values must
#' be transformed by a custom function to generate priority in decreasing order prior to GSEA. A suitable function definition
#' can be given in \code{fun.transform}, e.g. \code{function(x) {-log10(x)}} for p-values or \code{abs} for absolute
#' values of foldchange.
#' @param FCcolumn (character) optional column name of foldchanges in \code{genes} if \code{sortcolumn} is used for a different data column.
#' Used only for annotation cnetplot of enrichment results. Omitted if NULL
#' @param threshold_FC (numeric) Fold change threshold for filtering (threshold interpreted for log2 transformed foldchange values!)
#' Only relevant for over-representation analysis if an unfiltered gene list is given in \code{genes}
#' to allow for GSEA in parallel.
#' @param returnSymbolsInResultObject logical. if \code{TRUE} gene symbols are returned in result objects. ENTREZIDs otherwise.
#' @param pAdjustMethod method for adjusting for multiple testing.
#' One of "holm", "hochberg", "hommel", "bonferroni", "BH", "BY", "fdr", "none"
#' @param org character with name of organism ("human", "mouse", "rat").
#' @param enrich.p.valueCutoff numeric p-value threshold for returned enrichment terms.
#' @param enrich.q.valueCutoff numeric q-value threshold for returned enrichment terms.
#' @param nPerm permutation numbers for gene set enrichment analysis
#' @param minGSSize minimal size of genes annotated by Ontology term for testing.
#' @param maxGSSize maximal size of genes annotated for testing
#' @param figure.res numeric resolution for output png.
#'
#'
#' @return List of enrichment-objects defined in \code{DOSE}-package (\code{enrichResult}-object for overrepresentation
#' analysis and \code{gseaResult}-objects for gene set enrichment analysis).
#' Enrichment tables and plots are stored in the project folder as side effects.
#'
#' @author Frank Ruehle
#'
#' @export
#'
#' @note There are three key elements of the GSEA method (taken from
#' https://bioconductor.org/packages/release/bioc/vignettes/DOSE/inst/doc/GSEA.html):
#' \itemize{
#' \item Calculation of an Enrichment Score: The enrichment score (ES) represent the degree to which a set S is
#' over-represented at the top or bottom of the ranked list L. The score is calculated by walking down the list L,
#' increasing a running-sum statistic when we encounter a gene in S and decreasing when it is not. The magnitude
#' increment depends on the gene statistics (e.g., correlation of the gene with phenotype). The ES is the of the
#' maximum deviation from zero encountered in the random walk; it corresponds to a weighted Kolmogorov-Smirnov-like statistic
#' \item Esimation of Significance Level of ES: The p-value of the ES is calculated using permutation test. Specifically,
#' we permute the gene labels of the gene list L and recompute the ES of the gene set for the permutated data, which
#' generate a null distribution for the ES. The p-value of the observed ES is then calculated relative to this null distribution.
#' \item Adjustment for Multiple Hypothesis Testing: When the entire gene sets were evaluated, \code{DOSE} adjust the estimated
#' significance level to account for multiple hypothesis testing and also q-values were calculated for FDR control.
#' }
## Usage
wrapClusterProfiler <- function (genes,
newheader = NULL,
backgroundlist=NULL,
newheaderBackground = NULL,
projectfolder= "clusterProfiler",
projectname="",
analysis_type = c("over-representation", "GSEA"),
enrichmentCat = c("GO", "KEGG", "Reactome", "DO"),
maxInputGenes = 100,
id.type = "ENTREZID",
id.column = "ENTREZID",
sortcolumn ="adj.P.Val",
highValueHighPriority = FALSE,
sortcolumn.threshold = 0.05,
fun.transform = function(x) {identity(x)},
FCcolumn = "logFC",
threshold_FC= log2(1.5),
returnSymbolsInResultObject =TRUE,
org = "human",
pAdjustMethod = "BH",
enrich.p.valueCutoff = 0.05,
enrich.q.valueCutoff = 0.05,
nPerm = 1000,
minGSSize = 10,
maxGSSize = 500,
figure.res = 300)
{
## additional plotting parameter
emapplot_showCategory = 20
cnetplot_showCategory = 5
dotplot_showCategory = 20
plotGOgraph_firstSigNodes = 10
## create output directory
if (!file.exists(file.path(projectfolder))) {dir.create(file.path(projectfolder), recursive=T)}
if(!is.null(projectname)) { # add underline to projectname if given
if(projectname!="") {
projectname <- paste0(projectname, "_")
}}
# define annotation database for organism
annotationdb <- switch(org, human = "org.Hs.eg.db",
mouse = "org.Mm.eg.db",
rat= "org.Rn.eg.db")
org_alt_name <- switch(org, human = "hsa", # org parameter differs in GO and KEGG functions in clusterProfiler
mouse = "mmu",
rat= "rno")
## install/load required packages from CRAN and Bioconductor
pkg.bioc <- c("clusterProfiler", "DOSE", "enrichplot", "ReactomePA", "topGO", "pathview", annotationdb)
pkg.cran <- c("plyr")
pks2detach <- attach_package(pkg.cran=pkg.cran, pkg.bioc=pkg.bioc)
# determine analysis type(s)
do_overrep <- ifelse("over-representation" %in% analysis_type, T, F)
do_gsea <- ifelse("GSEA" %in% analysis_type, T, F)
# warnings
if(!any(c(do_overrep, do_gsea))) {stop("\n\nSelect analysis_type\n")}
if(!is.null(id.column)) {
if(id.column!="ENTREZID" && "ENTREZID" %in% names(genes)) {
cat("CAUTION: Column ENTREZIDs found. Consider these data for id.column.")
}
}
if(!is.null(sortcolumn) & highValueHighPriority == FALSE) {
cat(paste("\nCAUTION: You indicated high prority for low values in 'sortcolumn',",
"but GSEA will assess high values with higher priority than",
"low values. Make sure that you have given a suitable function",
"in 'fun.transform' for transforming your data",
"(e.g. -log10(x) for p-values).\n"))
}
### process data input
if(class(genes) != "list") { # make list if genes is single element
genes <- list(genes=genes)
}
# initialise help variable to avoid multiple annotation of background list
vector.processed <- FALSE
# initialise result list
enrichresult <- list()
enrichresult_kegg_entrez <- list() # needed for KEGG pathway maps if returnSymbolsInResultObject =TRUE
for(ge in names(genes)) {
cat(paste("\nProcessing", ge, "\n"))
# read file if 'genes' is character string with file path
if(is.character(genes[[ge]]) & length(genes[[ge]])==1) {
cat("\n\nReading gene list:", genes[[ge]], "\n")
genes[[ge]] <- read.table(genes[[ge]], header=is.null(newheader), sep="\t", na.strings = c("", " ", "NA")) # if no newheader, header must be in file
}
if(is.data.frame(genes[[ge]])) {
if (!is.null(newheader)) {# if 'newheader' is defined, it is used as names(DEgenes.unfilt)
cat("\nNew header added to input file:", newheader, "\n")
colnames(genes[[ge]]) <- newheader
}
columns2use <- c(id.column, sortcolumn, FCcolumn)
columnsfound <- columns2use[columns2use %in% names(genes[[ge]])]
if (!all(columns2use %in% names(genes[[ge]]))) {
warning("column(s) ", columns2use[!(columns2use %in% names(genes[[ge]]))], " not found in data header.")
}
genes[[ge]] <- genes[[ge]][,columnsfound, drop=F] # reduce dataframe to columns of interest
}
if(is.vector(genes[[ge]], mode="character")) {
if(!is.null(names(genes[[ge]]))) { # if named vector with IDs as element names and quantitative information as elements
genes[[ge]] <- data.frame(names(genes[[ge]]), genes[[ge]])
names(genes[[ge]]) <- c(id.column, sortcolumn)
} else { # if unnamed vector with IDs but no quantitative information (no sortcolumn)
id.column <- id.type
genes[[ge]] <- data.frame(genes[[ge]], stringsAsFactors = F) # convert single vector to dataframe
names(genes[[ge]]) <- c(id.column)
}
}
cat(paste("\n", nrow(genes[[ge]]), "gene entries loaded.\n"))
# now 'genes[[ge]]' is a data.frame with 1, 2 or 3 columns (id.column, sortcolumn, FCcolumn)
genes[[ge]] <- genes[[ge]][!is.na(genes[[ge]][,id.column]), , drop=F] # remove rows without entry in id.column
if (!is.null(sortcolumn)) {
if (sortcolumn %in% names(genes[[ge]])) {
cat(paste("\nSorting input dataframe for", sortcolumn, ", decreasing =", highValueHighPriority, "\n"))
genes[[ge]] <- genes[[ge]][order(genes[[ge]][,sortcolumn], decreasing=highValueHighPriority),] # sorting genes[[ge]] for quantitative variable
# if multiple probes per gene, only the first gene entry is used
genes[[ge]] <- genes[[ge]][!duplicated(genes[[ge]][,id.column]),]
}
}
## convert IDs to ENTREZ IDs if necessary
if(id.type!="ENTREZID") {
entrezids <- basicAnno(data=genes[[ge]], Symbol.column = id.column, Entrez.column = NULL, org=org)
cat(paste(nrow(entrezids), id.type, "(", length(unique(genes[[ge]][,id.column])), "unique) mapped to", sum(!is.na(entrezids$ENTREZID)), "ENTREZIDs\n"))
cat(paste("For dublicated", id.type, "only the first element is used (after sorting if applicable)\n"))
genes[[ge]] <- plyr::join(genes[[ge]], entrezids, by=id.column, type="right", match="first")
# joined dataframe contains all entries even without available ENTREZIDs and keeps order of 'genes[[ge]]'
} else {names(genes[[ge]])[names(genes[[ge]])==id.column] <- "ENTREZID"}
# 'genes[[ge]]' now definitively provides a column 'ENTREZID'
# remove entries without 'ENTREZID'
genes[[ge]] <- genes[[ge]][!is.na(genes[[ge]][,"ENTREZID"]), , drop=F]
genes[[ge]] <- genes[[ge]][genes[[ge]][,"ENTREZID"] != "", , drop=F]
# if multiple probes per gene, only the first gene entry is used
# (after sorting, if sortcolumn available).
genes[[ge]] <- genes[[ge]][!duplicated(genes[[ge]][,"ENTREZID"]), , drop=F]
cat(paste("\n", nrow(genes[[ge]]), "genes with unique Entrez IDs found.\n"))
### Filtering: Threshold applied to gene list: 'maxInputGenes' as well as 'sortcolumn.threshold' if applicable
filtgenes <- filterGeneLists(genes[[ge]],
newheader=NULL,
filtercat1 = sortcolumn,
filtercat1.decreasing = highValueHighPriority,
filtercat1.function = identity,
filtercat1.threshold= sortcolumn.threshold,
filtercat2 = FCcolumn,
filtercat2.decreasing = TRUE,
filtercat2.function = abs,
filtercat2.threshold= threshold_FC)
filtgenes[,"ENTREZID"] <- as.character(filtgenes[,"ENTREZID"]) # convert numeric EntrezIDs to characters
if(do_overrep) {
cat(paste("\nTop", min(nrow(filtgenes), maxInputGenes), "unique genes selected for overrepresentation analysis:\n"))
print(head(filtgenes$ENTREZID))
cat("\n")
}
filtgenes <- filtgenes[1:min(nrow(filtgenes), maxInputGenes), , drop=F]
## Processing list for Gene Set Enrichment Analysis (GSEA). Named Vector needed in decreasing order.
if(do_gsea) {
if (!is.null(sortcolumn)) { # quantitative data available?
# Requirements for input gene list according to https://github.com/GuangchuangYu/DOSE/wiki/how-to-prepare-your-own-geneList
# - numeric vector: fold change or other type of numerical variable
# - named vector: every number has a name, the corresponding gene ID
# - sorted vector: number should be sorted in decreasing order
gseagenes <- genes[[ge]][,sortcolumn]
names(gseagenes) <- genes[[ge]][,"ENTREZID"]
# for increasing sortcolumn (e.g. p-values). Low values transformed to high values
gseagenes <- fun.transform(gseagenes)
gseagenes <- sort(gseagenes, decreasing=T) # Vector in decreasing order.
cat(paste("\n", length(gseagenes), "unique genes for gene set enrichment analysis:\n"))
print(head(names(gseagenes)))
cat("\n")
}
}
###### read background list if available. Otherwise unfiltered ID list from 'genes[[ge]]' used as background Vector.
if(!is.null(backgroundlist)) {
cat("\nProcessing background list")
if (all(backgroundlist=="genome")) {
# all ENTREZIDs from the annotation db of the respective genome (given in org) used as background.
backgroundlist <- keys(get(annotationdb), keytype="ENTREZID")
cat(paste("\nUsing", length(backgroundlist), "ENTREZ IDs from", annotationdb, "as background.\n"))
} else {
# read file if 'backgroundlist' is character string with file path
if(is.character(backgroundlist) && length(backgroundlist)==1) {
cat("\n\nReading background list:", backgroundlist, "\n")
backgroundlist <- read.table(backgroundlist, header=is.null(newheaderBackground), sep="\t", na.strings = c("", " ", "NA")) # if no newheaderBackground, header must be in file
}
if((is.vector(backgroundlist, mode=c("character")) | is.vector(backgroundlist, mode=c("numeric"))) && !vector.processed) {
backgroundlist <- as.data.frame(backgroundlist, stringsAsFactors =F) # convert vector to dataframe for processing in next chunk
colnames(backgroundlist) <- id.column
}
if(is.data.frame(backgroundlist) ) {
if(!is.null(newheaderBackground)) { # if 'newheaderBackground' is defined, it is used as header for backgroundlist
cat("\nNew header added to background list:", newheaderBackground, "\n")
colnames(backgroundlist) <- newheaderBackground
}
# convert IDs to ENTREZ IDs if necessary
if(id.type!="ENTREZID" && !("ENTREZID" %in% colnames(backgroundlist))) {
backgroundlist <- basicAnno(backgroundlist, Symbol.column = id.column, Entrez.column = NULL, org=org)
}
entrez.column.bg <- ifelse(id.type=="ENTREZID", id.column, "ENTREZID")
backgroundlist <- backgroundlist[!is.na(backgroundlist[,entrez.column.bg]),entrez.column.bg, drop=F] # remove rows without entry in id.column
backgroundlist <- backgroundlist[backgroundlist[,entrez.column.bg] !="", entrez.column.bg, drop=F] # remove rows with entry "" in id.column
backgroundlist <- as.character(unique(backgroundlist[,entrez.column.bg]))
vector.processed <- TRUE # don't process this vector in 2nd run of loop
} else {
if(!vector.processed) {
cat("\nBackground list used as given without any modification (assuming Entrez IDs):")
print(head(backgroundlist))
cat("\n")
}
}
} # end loading and processing of a supplied backgroundlist
cat(paste("\n", length(backgroundlist), "ENTREZ IDs in background list:\n"))
print(head(backgroundlist))
cat("\n")
} else { # if backgroundlist is NULL, all EntrezIDs from 'genes[[ge]]' used as background instead.
backgroundlist <- as.character(unique(genes[[ge]][,"ENTREZID"]))
cat(paste("\n", length(backgroundlist), "ENTREZ IDs used from input object as background:\n"))
print(head(backgroundlist))
cat("\n")
}
#################### Start Enrichment analysis
# remove categories not supported by clusterProfiler
enrichmentCat.found <- enrichmentCat %in% c("GO", "KEGG", "Reactome", "DO")
enrichmentCat <- enrichmentCat[enrichmentCat.found]
if(("DO" %in% enrichmentCat) & org !="human") {
cat("\nDiesease ontology (DO) available for human only!")
enrichmentCat <- enrichmentCat[enrichmentCat!="DO"]
if(length(enrichmentCat)==0) {stop("\nNo valid categories selected!")}
}
cat(paste("\nCategories used for enrichment analysis with clusterProfiler:", paste(enrichmentCat, collapse=" "), "\n\n"))
# initialise result list
enrichresult[[ge]] <- list()
enrichresult_kegg_entrez[[ge]] <- list() # needed for KEGG pathway maps if returnSymbolsInResultObject =TRUE
##### Gene Ontology Enrichment
## Over representation analysis
if ("GO" %in% enrichmentCat) {
for (go in c("MF", "BP", "CC")) {
if(do_overrep) {
if(nrow(filtgenes)>1) {
enrichresult[[ge]][[paste0("Overrep_",go)]] <- clusterProfiler::enrichGO(gene = filtgenes[,"ENTREZID"],
universe = backgroundlist,
OrgDb = annotationdb,
ont = go,
minGSSize = minGSSize,
maxGSSize = maxGSSize,
pAdjustMethod = pAdjustMethod,
pvalueCutoff = enrich.p.valueCutoff,
qvalueCutoff = enrich.q.valueCutoff,
readable = returnSymbolsInResultObject) # if readable =T, EntrezIDs displayed as gene Symbols
cat(paste("Overrepresentation analysis for", go, "GO-terms results in", nrow(as.data.frame(enrichresult[[ge]][[paste0("Overrep_",go)]])), "enriched terms.\n"))
}
}
# GO Gene Set Enrichment Analysis
if(do_gsea) {
if (!is.null(sortcolumn)) { # quantitative data available?
cat(paste("GeneSet Enrichment analysis for", go, "GO-terms results in "))
enrichresult[[ge]][[paste0("GSEA_",go)]] <- clusterProfiler::gseGO(geneList = gseagenes,
OrgDb = annotationdb,
ont = go,
nPerm = nPerm,
minGSSize = minGSSize,
maxGSSize = maxGSSize,
pvalueCutoff = enrich.p.valueCutoff,
pAdjustMethod = pAdjustMethod,
verbose = FALSE)
if(returnSymbolsInResultObject) { # replace ENTREZ IDs by symbols
if(nrow(enrichresult[[ge]][[paste0("GSEA_",go)]]) >=1) {
enrichresult[[ge]][[paste0("GSEA_",go)]] <- setReadable(enrichresult[[ge]][[paste0("GSEA_",go)]], OrgDb=annotationdb, keytype = "ENTREZID")
}
}
cat(paste(nrow(as.data.frame(enrichresult[[ge]][[paste0("GSEA_",go)]])), "enriched terms.\n"))
}
}
} # end go-loop
} # end if GO
##### KEGG pathway enrichment
# KEGG over-representation test
if ("KEGG" %in% enrichmentCat) {
if(do_overrep) {
if(nrow(filtgenes)>1) {
enrichresult[[ge]][[paste0("Overrep_","KEGG")]] <- clusterProfiler::enrichKEGG(gene = filtgenes[,"ENTREZID"],
organism = org_alt_name,
universe = backgroundlist,
pvalueCutoff = enrich.p.valueCutoff,
qvalueCutoff = enrich.q.valueCutoff,
pAdjustMethod = pAdjustMethod,
minGSSize = minGSSize,
maxGSSize = maxGSSize,
use_internal_data = FALSE)
if(returnSymbolsInResultObject) { # replace ENTREZ IDs by symbols
if(nrow(enrichresult[[ge]][[paste0("Overrep_","KEGG")]]) >=1) {
enrichresult_kegg_entrez[[ge]][[paste0("Overrep_","KEGG")]] <- enrichresult[[ge]][[paste0("Overrep_","KEGG")]] # needed for KEGG pathway maps if returnSymbolsInResultObject =TRUE
enrichresult[[ge]][[paste0("Overrep_","KEGG")]] <- setReadable(enrichresult[[ge]][[paste0("Overrep_","KEGG")]], OrgDb=annotationdb, keytype = "ENTREZID")
}
}
cat(paste("Overrepresentation analysis for KEGG pathways results in", nrow(as.data.frame(enrichresult[[ge]][[paste0("Overrep_","KEGG")]])), "enriched pathways\n"))
}
}
# KEGG Gene Set Enrichment Analysis
if(do_gsea) {
if (!is.null(sortcolumn)) { # quantitative data available?
cat(paste("GeneSet Enrichment analysis for KEGG pathways results in "))
enrichresult[[ge]][[paste0("GSEA_","KEGG")]] <- clusterProfiler::gseKEGG(geneList = gseagenes,
organism = org_alt_name,
nPerm = nPerm,
minGSSize = minGSSize,
maxGSSize = maxGSSize,
pvalueCutoff = enrich.p.valueCutoff,
pAdjustMethod = pAdjustMethod,
verbose = FALSE,
use_internal_data = FALSE)
if(returnSymbolsInResultObject) { # replace ENTREZ IDs by symbols
if(nrow(enrichresult[[ge]][[paste0("GSEA_","KEGG")]]) >=1) {
enrichresult_kegg_entrez[[ge]][[paste0("GSEA_","KEGG")]] <- enrichresult[[ge]][[paste0("GSEA_","KEGG")]] # needed for KEGG pathway maps if returnSymbolsInResultObject =TRUE
enrichresult[[ge]][[paste0("GSEA_","KEGG")]] <- setReadable(enrichresult[[ge]][[paste0("GSEA_","KEGG")]], OrgDb=annotationdb, keytype = "ENTREZID")
}
}
cat(paste(nrow(as.data.frame(enrichresult[[ge]][[paste0("GSEA_","KEGG")]])), "enriched pathways\n"))
}
}
}# end if KEGG
##### Reactome pathway enrichment
# Reactome over-representation test
if ("Reactome" %in% enrichmentCat) {
if(do_overrep) {
if(nrow(filtgenes)>1) {
enrichresult[[ge]][[paste0("Overrep_","Reactome")]] <- ReactomePA::enrichPathway(gene = filtgenes[,"ENTREZID"],
organism = org,
universe = backgroundlist,
pvalueCutoff = enrich.p.valueCutoff,
qvalueCutoff = enrich.q.valueCutoff,
pAdjustMethod = pAdjustMethod,
minGSSize = minGSSize,
maxGSSize = maxGSSize,
readable = returnSymbolsInResultObject)
cat(paste("Overrepresentation analysis for Reactome pathways results in", nrow(as.data.frame(enrichresult[[ge]][[paste0("Overrep_","Reactome")]])), "enriched pathways\n"))
}
}
# Reactome Gene Set Enrichment Analysis
if(do_gsea) {
if (!is.null(sortcolumn)) { # quantitative data available?
cat(paste("GeneSet Enrichment analysis for Reactome pathways results in "))
enrichresult[[ge]][[paste0("GSEA_","Reactome")]] <- ReactomePA::gsePathway(geneList = gseagenes,
organism = org,
nPerm = nPerm,
minGSSize = minGSSize,
maxGSSize = maxGSSize,
pvalueCutoff = enrich.p.valueCutoff,
pAdjustMethod = pAdjustMethod,
verbose = FALSE)
if(returnSymbolsInResultObject) { # replace ENTREZ IDs by symbols
if(nrow(enrichresult[[ge]][[paste0("GSEA_","Reactome")]]) >=1) {
enrichresult[[ge]][[paste0("GSEA_","Reactome")]] <- setReadable(enrichresult[[ge]][[paste0("GSEA_","Reactome")]], OrgDb=annotationdb, keytype = "ENTREZID")
}
}
cat(paste(nrow(as.data.frame(enrichresult[[ge]][[paste0("GSEA_","Reactome")]])), "enriched pathways\n"))
}
}
} # end if Reactome
##### Disease Ontology enrichment (for human only!)
# DO over-representation test
if ("DO" %in% enrichmentCat) {
if(do_overrep) {
if(nrow(filtgenes)>1) {
enrichresult[[ge]][[paste0("Overrep_","DO")]] <- DOSE::enrichDO(gene = filtgenes[,"ENTREZID"],
ont="DO",
universe = backgroundlist,
pvalueCutoff = enrich.p.valueCutoff,
qvalueCutoff = enrich.q.valueCutoff,
pAdjustMethod = pAdjustMethod,
minGSSize = minGSSize,
maxGSSize = maxGSSize,
readable = returnSymbolsInResultObject)
cat(paste("Overrepresentation analysis for Disease Ontology results in", nrow(as.data.frame(enrichresult[[ge]][[paste0("Overrep_","DO")]])), "enriched terms\n"))
}
}
# DO Gene Set Enrichment Analysis
if(do_gsea) {
if (!is.null(sortcolumn)) { # quantitative data available?
cat(paste("GeneSet Enrichment analysis for Disease Ontology results in "))
enrichresult[[ge]][[paste0("GSEA_","DO")]] <- DOSE::gseDO(geneList = gseagenes,
nPerm = nPerm,
minGSSize = minGSSize,
maxGSSize = maxGSSize,
pvalueCutoff = enrich.p.valueCutoff,
pAdjustMethod = pAdjustMethod,
verbose = FALSE)
if(returnSymbolsInResultObject) { # replace ENTREZ IDs by symbols
if(nrow(enrichresult[[ge]][[paste0("GSEA_","DO")]]) >=1) {
enrichresult[[ge]][[paste0("GSEA_","DO")]] <- setReadable(enrichresult[[ge]][[paste0("GSEA_","DO")]], OrgDb=annotationdb, keytype = "ENTREZID")
}
}
cat(paste(nrow(as.data.frame(enrichresult[[ge]][[paste0("GSEA_","DO")]])), "enriched terms.\n"))
}
}
} # end if DO
# Creating result tables and plot
cat(paste("\nCreating result tables and plots in", file.path(projectfolder), "\n"))
for(usedcat in names(enrichresult[[ge]])) {
if(nrow(as.data.frame(enrichresult[[ge]][[usedcat]])) >=1) {
cat(paste("\nProcessing", usedcat))
## create output directory
subfolder <- ifelse(ge == "genes", projectfolder, paste0(projectfolder, "/", ge))
projectnamesuffix <- ifelse(ge == "genes", "", paste0(ge, "_"))
if (!file.exists(file.path(subfolder))) {dir.create(file.path(subfolder), recursive=T)}
## Result table
write.table(as.data.frame(enrichresult[[ge]][[usedcat]]), quote=F, row.names=F, sep="\t",
file=file.path(subfolder, paste0(projectname, projectnamesuffix, usedcat, "_resulttable.txt")))
## Enrichment map
png(file.path(subfolder, paste0(projectname, projectnamesuffix, usedcat, "_emapPlot_top_", emapplot_showCategory, "_categories.png")),
width = 300, height = 300, units = "mm", res=figure.res)
try(print(enrichplot::emapplot(enrichresult[[ge]][[usedcat]], showCategory = emapplot_showCategory,
color = "p.adjust")) # "p.adjust" refers to enrichment p value of result object
, silent=T)
dev.off()
## cnet plot (Remark: legend will be 'Fold change' even if other quantitative data are included)
cnetplot_anno_column <- "ENTREZID" # by default, gene names are ENTREZIDs, because the enrichment is done with Entrez IDs
if(returnSymbolsInResultObject) { ### outdated: grepl("Overrep", usedcat) & grepl("MF|BP|CC|Reactome", usedcat) for functions with 'readable' parameter
# if returnSymbolsInResultObject==TRUE genes symbols are returned instead of ENTREZIDs
# either via parameter readable = TRUE in the enrichment function or via function setReadable.
# Then, SYMBOLs must be used to refer to quantitaive information in cnetplots (foldchange or other data).
if(id.type == "SYMBOL") {
cnetplot_anno_column <- id.column
} else {
if(any(grepl("symbol", names(genes[[ge]]), ignore.case = T))) {
# if dataframe containes unused symbol annotation, else annotate with org object
# Anyhow, additional symbol columns are not kept in the current gene[[ge]],
# so they have to be obtained from org object anyway.
temp_symbol_column <- grep("symbol", names(genes[[ge]]), value=T, ignore.case = T)[1]
if(!(any(is.na(genes[[ge]][,temp_symbol_column])) | any(genes[[ge]][,temp_symbol_column]==""))) {
# check if found symbol column in complete
cnetplot_anno_column <- temp_symbol_column
}
} else { # if SYMBOLs must be derived from ENTREZIDs
genes[[ge]] <- basicAnno(data=genes[[ge]], Symbol.column = NULL, Entrez.column = "ENTREZID", org=org)
cnetplot_anno_column <- "SYMBOL"
}
}
}
# annotation of cnet plots (either by FCcolumn or sortcolumn)
if(!is.null(FCcolumn)) {
cnetplot_annotation <- genes[[ge]][,FCcolumn]
names(cnetplot_annotation) <- genes[[ge]][,cnetplot_anno_column] #####
} else {
if(!is.null(sortcolumn)) {
cnetplot_annotation <- genes[[ge]][,sortcolumn]
names(cnetplot_annotation) <- genes[[ge]][,cnetplot_anno_column] ####
} else {cnetplot_annotation <- NULL}
}
# start plot
png(file.path(subfolder, paste0(projectname, projectnamesuffix, usedcat, "_cnetPlot_top_", cnetplot_showCategory, "_categories.png")),
width = 300, height = 300, units = "mm", res=figure.res)
try(print(enrichplot::cnetplot(enrichresult[[ge]][[usedcat]], showCategory = cnetplot_showCategory, # categorySize="pvalue",
foldChange= cnetplot_annotation))
, silent=T)
dev.off()
## Dot plot of enriched terms
png(file.path(subfolder, paste0(projectname, projectnamesuffix, usedcat, "_dotPlot_top_", dotplot_showCategory, "_categories.png")),
width = 300, height = 300, units = "mm", res=figure.res)
try(print(enrichplot::dotplot(enrichresult[[ge]][[usedcat]], showCategory = dotplot_showCategory, color = "p.adjust",
title = usedcat))
, silent=T)
dev.off()
## enriched GO induced graph
if(grepl("CC|MF|BP", usedcat)) {
png(file.path(subfolder, paste0(projectname, projectnamesuffix, usedcat, "_GOgraph_first_", plotGOgraph_firstSigNodes, "_nodes.png")),
width = 300, height = 300, units = "mm", res=figure.res)
try(clusterProfiler::plotGOgraph(enrichresult[[ge]][[usedcat]], firstSigNodes = plotGOgraph_firstSigNodes, useFullNames = TRUE), silent=T)
dev.off()
}
# Pathview maps for KEGG
if(grepl("KEGG", usedcat)) {
pathway.dir <- file.path(subfolder, paste0(usedcat, "_pathway_maps"))
if (!file.exists(pathway.dir)) {dir.create(pathway.dir, recursive=T)}
workingdir <- getwd()
setwd(pathway.dir)
for(k in 1:nrow(enrichresult_kegg_entrez[[ge]][[usedcat]])) {
if(grepl("gsea", usedcat, ignore.case =T)) { # column name with gene IDs differs for GSEA and overrep analysis
temp_geneid_column <- "core_enrichment"} else {
temp_geneid_column <- "geneID"
}
kegggenes <- enrichresult_kegg_entrez[[ge]][[usedcat]][k, temp_geneid_column] # Entrez IDs
kegggenes <- unlist(strsplit(kegggenes, split="/", fixed=T))
# 'genes[[ge]]' is a data.frame with 1, 2 or 3 columns (id.column, sortcolumn, FCcolumn)
kegggenes <- genes[[ge]][genes[[ge]][,"ENTREZID"] %in% kegggenes, ]
if(!is.null(FCcolumn)) { # foldchanges available
tmp.entrezids <- kegggenes$ENTREZID
kegggenes <- kegggenes[,FCcolumn]
names(kegggenes) <- tmp.entrezids
kegglimit <- c(min(kegggenes), max(kegggenes)) # kegglimit <- max(abs(kegggenes))
plot.col.key <- T
} else {
if(!is.null(sortcolumn)) { # no foldchanges available values in sortcolumn to be plotted
tmp.entrezids <- kegggenes$ENTREZID
kegggenes <- kegggenes[,sortcolumn]
names(kegggenes) <- tmp.entrezids
kegglimit <- c(min(kegggenes), max(kegggenes))
plot.col.key <- T
} else { # neither foldchanges nor sortcolumn values available. Just highlight included genes.
kegggenes <- as.character(kegggenes[,"ENTREZID"])
kegglimit <- 1
plot.col.key <- F
}
}
try(
pathview::pathview(
# numeric vector named with ENTREZ IDs
gene.data = kegggenes,
pathway.id = enrichresult_kegg_entrez[[ge]][[usedcat]][k, "ID"],
species = org_alt_name,
gene.idtype = "entrez", # may also be "SYMBOL" here, but entrez is more reliable and organism independant
limit= list(gene= kegglimit, cpd=1),
plot.col.key = plot.col.key)
,silent=F)
}
setwd(workingdir)
}
## remark: the warning produced by kegggraph or pathview do no harm. See https://support.bioconductor.org/p/96798/
}
}
cat("\n")
} # end ge loop
# Detaching libraries not needed any more
detach_package(unique(pks2detach))
return(enrichresult)
} # end of function definition
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.