Nothing
#' @title Pathway analysis and visualization
#' @description This is the function to do pathway enrichment analysis (and visualization) with rWikipathways (also KEGG, REACTOME & Hallmark) from a summary statistics table generated by
#' differential expression analysis like \code{limma} or \code{DESeq2}.
#'
#' @param data A summary statistics table (data.frame) or \code{data.list} generated by DE analysis software like limma or DEseq2, where rownames are gene id.
#' @param comp.names A character vector containing the comparison names corresponding to the same order of the \code{data.list}. Default = NULL.
#' @param gene.id.type The gene id format in \code{data} should be one of: ACCNUM, ALIAS, ENSEMBL, ENSEMBLPROT,
#' ENSEMBLTRANS, ENTREZID, ENZYME, EVIDENCE, EVIDENCEALL, GENENAME, GO, GOALL, IPI, MAP, OMIM,
#' ONTOLOGY, ONTOLOGYALL, PATH, PFAM, PMID, PROSITE, REFSEQ, SYMBOL, UCSCKG, UNIGENE, UNIPROT.
#' @param FC.cutoff The fold change cutoff (numeric) selected to subset summary statistics table. Default = 1.5.
#' @param FDR.cutoff The FDR cutoff selected (numeric) to subset summary statistics table. Default = 0.05.
#' @param FCflag The column name (character) of fold change information, assuming the FC is log2 transformed. Default = "logFC".
#' @param FDRflag The column name (character) of adjusted p value or FDR. Default = "adj.P.Val".
#'
#' @param Fisher.cutoff The FDR cutoff selected (numeric) for the pathway enrichment analysis' Fisher's exact test with all determined
#' Differentially Expressed (DE) genes by \code{FC.cutoff} and \code{FDR.cutoff}.
#' @param Fisher.up.cutoff The FDR cutoff selected (numeric) for the pathway enrichment analysis' Fisher's exact test with the upregulated gene set.
#' @param Fisher.down.cutoff The FDR cutoff selected (numeric) for the pathway enrichment analysis' Fisher's exact test with the downregulated gene set.
#' @param plot.save.to The address to save the plot from simplified cutoff combination with FDR of 0.01, 0.05, 0.1, and 0.2.
#' @param pathway.db The databse to be used for encrichment analysis. Can be one of the following, "rWikiPathways", "KEGG", "REACTOME", "Hallmark","rWikiPathways_aug_2020".
#' @param customized.pathways the customized pathways in the format of two column dataframe (column name as "gs_name" and "entrez_gene") to be used in analysis.
#' @param ... pass on variables
#' @return The function returns a list of 5 objects:
#' \item{1}{result table from directional pathway enrichment analysis}
#' \item{2}{result table from non-directional pathway enrichment analysis}
#' \item{3}{plot from directional pathway enrichment analysis}
#' \item{4}{plot from non-directional pathway enrichment analysis}
#' \item{5}{plot combining both directional and non-directional plot}
#'
#' @importFrom ggplot2 ggsave ggplot coord_flip geom_bar geom_hline annotate scale_fill_identity xlab ylab theme theme_minimal scale_fill_identity element_blank
#' @importFrom tidyr separate
#' @importFrom stringr word
#' @importFrom clusterProfiler bitr enricher read.gmt
#' @importFrom dplyr %>% select pull as_tibble filter mutate left_join bind_rows
#' @importFrom rWikiPathways downloadPathwayArchive
#' @import org.Hs.eg.db
#' @importFrom ggpubr as_ggplot
#' @importFrom gridExtra arrangeGrob
#' @importFrom purrr set_names map2
#' @importFrom rlang .data
#'
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'
#' @details The function takes the summary statistics table and use user selected parameter based on check.cutoff to do pathway enrichment analysis
#'
#'
#' @examples result <- plot_pathway(data = Sample_summary_statistics_table,
#' gene.id.type = "ENSEMBL",
#' FC.cutoff = 1.5,
#' p.cutoff = 0.05,
#' pathway.db = "rWikiPathways_aug_2020"
#' )
#' @export plot_pathway
#'
#'
plot_pathway <- function(data = ~df,
comp.names = NULL,
gene.id.type = "ENSEMBL",
FC.cutoff = 1.2,
FDR.cutoff = 0.05,
FCflag = "logFC",
FDRflag = "adj.P.Val",
Fisher.cutoff = 0.1,
Fisher.up.cutoff = 0.1,
Fisher.down.cutoff = 0.1,
plot.save.to = NULL,
pathway.db = "rWikiPathways",
customized.pathways = NULL,
...){
# Gene set enrichment test (Fisher) in FC positive and FC negative group separately.
# the p.adjust value will be compared from + and - group and choose the smaller one and assign the +/- value.
message(paste0("\n\n Running plot pathway. Acceptable data types are list() & data.frame() \n\n"))
message(paste0("\n\n Current program setting only accepts rownames as ", gene.id.type,", use parameter `gene.id.type` to adjust if needed. \n\n"))
validate.single.table.isnotlist(data)
validate.comp.names(comp.names,data)
validate.pathways.db(pathway.db, customized.pathways)
if(inherits(data, "list")){
message("\n Executing list of data \n")
out.d.sig <- map(data, cal.pathway.scores, pathway.db, gene.id.type, FCflag, FDRflag, FC.cutoff, FDR.cutoff, OUT.Directional = TRUE , IS.list = TRUE, customized.pathways) %>%
set_names(comp.names) %>%
bind_rows(, .id = "Comparisons.ID") %>%
mutate(Description = factor(.data$Description,levels = unique(.data$Description)))
out.nd.sig <- map(data, cal.pathway.scores, pathway.db, gene.id.type, FCflag, FDRflag, FC.cutoff, FDR.cutoff, OUT.Directional = FALSE , IS.list = TRUE, customized.pathways) %>%
set_names(comp.names) %>%
bind_rows(, .id = "Comparisons.ID") %>%
mutate(Description = factor(.data$Description,levels = unique(.data$Description)))
}else{
if(!all(c(FCflag, FDRflag) %in% names(data))){stop(paste0("Please make sure the column names: ",FCflag," and ",FDRflag, " is specified in input dataframe."))}
path.res <- cal.pathway.scores(data, pathway.db, gene.id.type, FCflag, FDRflag, FC.cutoff, FDR.cutoff, OUT.Directional = NULL , IS.list = FALSE, customized.pathways)
out.d.sig <- path.res[[1]]
out.nd.sig <- path.res[[2]]
nd.res <- path.res[[3]]
backup.d.sig <- path.res[[1]]
}
##visualize directionally changed pathways
if(is.null(out.d.sig)){
message(paste0("\n No directional enriched pathway can be calculated.\n"))
p.d <- NULL
}else if(nrow(out.d.sig) == 0){
message(paste0("\n No directional enriched pathway passed threshold of FDR < ",Fisher.down.cutoff," in down-regulated pathways or ",Fisher.up.cutoff," for up-regulated pathways.\n"))
p.d <- NULL
}else{
out.d.path <- out.d.sig %>%
filter(.data$directional.p.adjust >= -Fisher.down.cutoff & .data$directional.p.adjust <= Fisher.up.cutoff) %>%
pull(.data$Description) %>%
unique
out.d.sig <- out.d.sig %>%
filter(.data$Description %in% out.d.path)
#retain original dataset before transofrmation
sav.out.d.sig <- out.d.sig
#case where there is a false signal in one treatment but none in the other
if (nrow(out.d.sig) == 0 & !inherits(data,"list")) {
message("\n The cutoff values for Fisher.down.cutoff & Fisher.up.cutoff resulted in 0 values being selected, consider choosing looser cutoff values \n")
p.d <- NULL
} else {
if (inherits(data,"list")){
check_0 = unique(out.d.sig$ID)
if (nrow(out.d.sig) == 0) {
#if the second cutoff removes all the rows
message("\n The cutoff values for Fisher.down.cutoff & Fisher.up.cutoff resulted in 0 values being selected, consider choosing looser cutoff values \n")
out.d.sig <- secondCutoffErr(out.d.sig ,comp.names, TypeQ = 1)
}
else if (!(length(check_0) == 1 & check_0[1] == "")){
if (sum(out.d.sig$ID == "") > 0)
message("\n Warning: There were null entries \n")
out.d.sig <- prettyGraphs(out.d.sig)
#if the second cutoff eliminated one of the facets then do
} else if (length(comp.names) != length(unique(out.d.sig$Comparisons.ID))){
out.d.sig <- secondCutoffErr(out.d.sig ,comp.names, TypeQ = 1)
}
}
p.d <- ggplot(out.d.sig, aes(x = .data$Description, y= .data$log10.padj,fill=.data$fil.cor )) +
geom_bar(stat = "identity" ) +
coord_flip() +
geom_hline(yintercept=-log10(0.05), linetype="dashed",color = "grey") +
geom_hline(yintercept=log10(0.05), linetype="dashed",color = "grey") +
geom_hline(yintercept= 0,color = "black") +
annotate("rect",
xmin = -Inf,
xmax = Inf,
ymin =-log10(0.05),
ymax = log10(0.05),
fill = "grey",
alpha = 0.4) +
scale_fill_identity() +
xlab("Pathway") +
ylab("-log10(FDR)")
#make sure the returned dataset only has observations with signals
out.d.sig <- sav.out.d.sig[sav.out.d.sig$ID != "",]
if(inherits(data, "list")){
p.d <- p.d + facet_grid(.~ Comparisons.ID)
}
}
}
## non-directional test p value
if(is.null(out.nd.sig)){
message(paste0("\n No non-directional enriched pathway can be calculated.\n"))
p.nd <- NULL
p.all <- ggplot() +
annotate("text", x = 4, y = 25, size=8, label = paste0("No non-directional \n enriched pathway\n passed threshold of FDR ",Fisher.cutoff," \n thus a combined plot is not provided")) +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank())
}else if(nrow(out.nd.sig) == 0){
message(paste0("\n No non-directional enriched pathway passed threshold of FDR ",Fisher.cutoff,". \n"))
p.nd <- NULL
p.all <- ggplot() +
annotate("text", x = 4, y = 25, size=8, label = paste0("No non-directional \n enriched pathway\n passed threshold of FDR ",Fisher.cutoff," \n thus a combined plot is not provided")) +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank())
}else{
out.nd.path <- out.nd.sig %>%
filter(.data$p.adjust < Fisher.cutoff) %>%
pull(.data$Description) %>%
unique
out.nd.sig <- out.nd.sig %>%
filter(.data$Description %in% out.nd.path)
sav.out.nd.sig <- out.nd.sig
if (nrow(out.nd.sig) == 0 & !inherits(data,"list")) {
message("\n The cutoff value for Fisher.cutoff resulted in 0 values being selected \n, consider choosing looser cutoff values \n")
e_message <- "The cutoff value for Fisher up/down/non-directional cutoff \n resulted in 0 values being selected, \n consider choosing looser cutoff values"
p.nd <- NULL
p.all <- ggplot() +
annotate("text", x = 4, y = 25, size=6, label = paste0(e_message," \n\n Provided Cutoff's ","\n Fisher Cutoff = ",
Fisher.cutoff,",\n Fisher Cutoff down = ",
Fisher.down.cutoff,", \n Fisher Cutoff up = ", Fisher.up.cutoff )) +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank())
} else {
if (inherits(data,"list")){
#if this is the case we don't have to look at others
if (nrow(out.nd.sig) == 0) {
#if the second cutoff removes all the rows
message(" \n The cutoff value for Fisher.cutoff resulted in 0 values being selected, consider choosing looser cutoff values \n")
out.nd.sig <- secondCutoffErr(out.nd.sig ,comp.names, TypeQ = 2)
out.nd.sig[out.nd.sig$pvalue == -99999,]['pvalue'] = 1
out.nd.sig[out.nd.sig$p.adjust == -99999,]['p.adjust'] = 1
}
else if (sum(out.nd.sig$ID == "") > 0){
check_0 = unique(out.nd.sig$ID)
if (!(length(check_0) == 1 & check_0[1] == "")){
#print ("Changed")
out.nd.sig <- prettyGraphs(out.nd.sig)
out.nd.sig[out.nd.sig$pvalue == -99999,]['pvalue'] = 1
out.nd.sig[out.nd.sig$p.adjust == -99999,]['p.adjust'] = 1
} else if (length(comp.names) != length(unique(out.nd.sig$Comparisons.ID))){
out.nd.sig <- secondCutoffErr(out.nd.sig ,comp.names, TypeQ = 2)
out.nd.sig[out.nd.sig$pvalue == -99999,]['pvalue'] = 1
out.nd.sig[out.nd.sig$p.adjust == -99999,]['p.adjust'] = 1
}
}
}
#cases where data frame is passed
#most likely for cases where all treatment groups are null
check_1 = unique(out.nd.sig$ID)
check_2 = unique(out.nd.sig$Description)
if (length(check_1) == 1 & check_1[1] == "" & check_2[1] == "" & mean(out.nd.sig$pvalue) == -99999){
out.nd.sig[out.nd.sig$pvalue == -99999,]['pvalue'] = 1
out.nd.sig[out.nd.sig$p.adjust == -99999,]['p.adjust'] = 1
}
p.nd <- ggplot(out.nd.sig,aes(x=.data$Description,y=-log10(.data$p.adjust))) +
geom_bar(stat = "identity" ) +
coord_flip() +
annotate("rect",
xmin = -Inf,
xmax = Inf,
ymin = 0,
ymax = -log10(0.05),
fill = "grey",
alpha = 0.4) +
geom_hline(yintercept= -log10(0.05),color = "grey") +
ylab("-log10(FDR) - Fisher's Test") +
xlab("Pathway") +
theme_minimal()
#make sure returned dataset has signals
out.nd.sig <- sav.out.nd.sig[sav.out.nd.sig$ID != "",]
if(inherits(data, "list")){
p.nd <- p.nd + facet_grid(.~ Comparisons.ID)
}
# below chunk only run when single summary statistics table is provided
if(!inherits(data, "list")){
if(is.null(out.d.sig)){test_1 <- 0}else{test_1 <- nrow(out.d.sig)}
if(is.null(out.nd.sig)){test_2 <- 0}else{test_2 <- nrow(out.nd.sig)}
if(!(test_1 > 0 & test_2 > 0)){
e_message <- "The cutoff value for Fisher up/down/non-directional cutoff \n resulted in 0 values being selected, \n consider choosing looser cutoff values"
p.all <- ggplot() +
annotate("text", x = 4, y = 25, size=6, label = paste0(e_message," \n\n Provided Cutoff's ","\n Fisher Cutoff = ",
Fisher.cutoff,",\n Fisher Cutoff down = ",
Fisher.down.cutoff,", \n Fisher Cutoff up = ", Fisher.up.cutoff )) +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank())
}else{
if(length(unique(out.d.sig$ID)) > length(unique(out.nd.sig$ID))){
message(paste0("More pathways in directional data than non directional data \n",
"hence using pathways exsting in directinal dataset"))
}else{
message(paste0("More pathways in non directional data than directional data \n",
"hence using pathways exsting in non directinal dataset"))
}
## non-directional test p value - with matched plot
allID <- unique(c(unique(out.d.sig$ID), unique(out.nd.sig$ID)))
mplots <- multiPlot(allID, backup.d.sig, nd.res)
if (!length(mplots) == 2){
p.all <- mplots
}else{
dplot <- mplots[[1]]
ndplot <- mplots[[2]]
p.all <- gridExtra::arrangeGrob(ndplot,dplot,
ncol=2,
widths=c(0.75, 0.25)) %>% as_ggplot
}
}
}
}
}
if(is.null(plot.save.to)){
message("\n Plot file name not specified, a plot in ggplot object will be returned! \n")
}else{
ggsave(filename = plot.save.to,
plot = p.all,
dpi = 300,
units = "in",
device='png')
}
if(inherits(data, "list")){
return(list(out.d.sig, out.nd.sig, p.d, p.nd))
}else{
return(list(out.d.sig, out.nd.sig, p.d, p.nd, p.all))
}
}
#' @title Null Return
#' @description The function takes in a boolean value and a numeric value, which it uses to decide what to output.
#'
#' @param IS.list Indicator of whether the data frame being input is list or not.
#' @param type If type = 1(default) return directional null plot. If type = 2 return non directional null plot.
#'
#' @return The function returns either returns a data frame or the value NULL.
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'
#' @details nullreturn is a function that returns NULL for single df inputs that don't hold true for threshold values. It returns an empty dataframe for
#' list inputs which don't satisfy the cutoff's
#'
nullreturn <- function(IS.list,type=1){
if (IS.list) {
if (type == 1){
# return empty dataset for directional
return (data.frame("ID"="", "Description"="", "GeneRatio"="", "BgRatio"="",
"pvalue"=0, "p.adjust"=0, "qvalue"=0, "geneID"="", "Count"=0,
"directional.p.adjust"=0, "direction"="", "log10.padj"=0,"fil.cor"="#E31A1C"))}
else if (type == 2){
# return empty for Non Dir. dataset,
# pval = -99999 so that later we can make adjustments to achieve -log10 pval is 0
return (data.frame("ID"="", "Description"="", "pvalue"=-99999, "p.adjust"=-99999))
}
} else {
return (NULL)
}
}
#' @title Second Cutoff Error
#' @description The function takes in a list of dataframe, comp names and a specified type, to output a dataframe styled for ggplot.
#'
#' @param df A list of dataframes.
#' @param comp.names a character vector contain the comparison names corresponding to the same order to the \code{dat.list}. default = NULL.
#' @param TypeQ If type = 1(default) return directional null plot. If type = 2 return non directional null plot.
#'
#' @return Returns a dataframe.
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'
#' @details secondCutoffErr is a function specifically meant to be used for list inputs. It is used for cases where after
#' applying filter to the data, one of the comparison ID gets left out, this adversely effects the ggplot
#'
secondCutoffErr <- function(df,comp.names, TypeQ = 1){
set.compliment <- setdiff(comp.names,unique(df$Comparisons.ID))
for (i in set.compliment){
ndf <- nullreturn(TRUE,TypeQ)
ndf$Comparisons.ID <- i
df <- rbind(df, ndf)
}
return (df)
}
#special cases where list input and at least one treatment has signal but others don't
# adjust dataframe for visualization
#' @title Pretty Graphs
#' @description Special cases where list input and at least one treatment has signal but others don't.
#'
#' @param vizdf A dataframes of enriched pathways.
#' @param ... pass on variables
#'
#' @return Returns a dataframe.
#'
#' @importFrom dplyr %>% filter
#' @importFrom rlang .data
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'
#' @details Pretty Graphs is a function specifically meant to be in cases where one of the input treatments meet cutoff, but
#' one or more of the other treatments don't meet the cutoff values. This is important so that ggplot doesn't throw any errors.
#'
#'
prettyGraphs <- function(vizdf, ...){
g1 = vizdf[vizdf$ID == "",]
g2 = vizdf[vizdf$ID != "",]
unique_desc = g2$Description
unique_ID = g2$ID
newdf = data.frame()
for (i in g1$Comparisons.ID){
temp = g1 %>% filter(.data$Comparisons.ID == i)
temp = temp[rep(seq_len(nrow(temp)), each = length(unique_ID)),]
temp[["Description"]] = unique_desc
newdf = rbind(newdf,temp)
}
newdf = rbind(newdf,g2)
return (newdf)
}
#' @title Multi Plot
#' @description Multi plot is for directional and non-directional plots
#'
#' @param allID A vector of all pathway ID's from directional and non directional enriched datasets.
#' @param backup.d.sig A dataframe type of object with directional pathways data prior to any cutoff's being applied
#' @param nd.res A dataframe type of object with non directional pathways data prior to any cutoff's being applied
#' @param ... pass on variables
#'
#' @return Returns ggplot.
#'
#' @importFrom dplyr %>% filter mutate
#' @importFrom rlang .data
#' @importFrom ggplot2 ggsave ggplot coord_flip geom_bar geom_hline annotate scale_fill_identity xlab ylab theme theme_minimal scale_fill_identity element_blank
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'
#' @details Multi plot is for directional and non-directional plots, when one of the plots doesn't contain observations.
#'
#'
multiPlot <- function(allID, backup.d.sig, nd.res, ...){
#make sure id exists / inner join in both res & sig tables tables
backup.d.sig_ = backup.d.sig[backup.d.sig$ID %in% nd.res$ID,]
nd.res_ = nd.res[ nd.res$ID %in% backup.d.sig$ID ,]
#now eilimate any ID that doesn't exist in the backup columns so that there is no error
# when filtering the datasets
allID_ = allID[allID %in% nd.res_$ID]
matched.d.sig <- backup.d.sig_ %>%
filter(.data$ID %in% allID_)
matched.nd.sig <- nd.res_ %>%
filter(.data$ID %in% allID_) %>%
mutate(Description = factor(matched.d.sig$Description,levels = matched.d.sig$Description))
#if allid_ is empty raise this error,
if (length(allID_) == 0){
message(paste0("\n\n No common ID's between n.d & dir. data \n\n"))
nullplot <- ggplot() +
annotate("text", x = 4, y = 25, size=8, label = paste0("No common ID's between n.d & dir. data")) +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank())
return (nullplot)
}else{
ndplot <- ggplot(matched.nd.sig,aes(x=as.factor(.data$Description),y=-log10(.data$p.adjust))) +
geom_bar(stat = "identity" ) +
coord_flip() +
annotate("rect",xmin = -Inf,xmax = Inf,ymin = 0,ymax = -log10(0.05),fill = "grey", alpha = 0.4) +
geom_hline(yintercept= -log10(0.05),color = "grey") +
ylab("-log10(FDR)") +
xlab("Pathway") +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank())
dplot <- ggplot(matched.d.sig, aes(x = .data$Description, y= .data$log10.padj,fill=.data$fil.cor )) +
geom_bar(stat = "identity" ) +
coord_flip() +
geom_hline(yintercept=-log10(0.05), linetype="dashed",color = "grey") +
geom_hline(yintercept=log10(0.05), linetype="dashed",color = "grey") +
geom_hline(yintercept= 0,color = "black") +
annotate("rect",xmin = -Inf,xmax = Inf,ymin =-log10(0.05),ymax = log10(0.05),fill = "grey", alpha = 0.4) +
scale_fill_identity() +
xlab("Pathway") +
ylab("-log10(FDR)")
return (list(ndplot,dplot))
}
}
#' @title DL Pathways DB
#' @description Download gene database for enrichment.
#'
#' @param pathway.db The databse to be used for encrichment analysis. Can be one of the following, "rWikiPathways", "KEGG", "REACTOME", "Hallmark","rWikiPathways_aug_2020"
#' @param customized.pathways the user provided pathway added for analysis.
#'
#' @return Returns a dataframe.
#'
#' @importFrom rWikiPathways downloadPathwayArchive
#' @importFrom clusterProfiler read.gmt
#' @import GSVAdata
#' @importFrom GSEABase setName geneIds
#' @importFrom tidyr separate
#' @importFrom dplyr %>% select pull as_tibble filter mutate left_join bind_rows
#' @import org.Hs.eg.db
#' @importFrom purrr set_names map2 map
#' @importFrom rlang .data
#' @importFrom msigdbr msigdbr
#' @param ... pass over parameters
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'
dlPathwaysDB <- function(pathway.db, customized.pathways = NULL, ...){
bsetfunc <- function(data){
return (data.frame(cbind(setName(data),geneIds(data))))
}
if(!is.null(pathway.db)){
if(pathway.db == "rWikiPathways"){
gene.dl <- rWikiPathways::downloadPathwayArchive(organism="Homo sapiens", format = "gmt") %>%
clusterProfiler::read.gmt()
names(gene.dl)[1] <- "ont"
gene.dl <- gene.dl %>% tidyr::separate(.data$ont, c("name","version","wpid","org"), "%")
#from the overall data, extract pathways id and gene number
wpid2gene <- gene.dl %>% dplyr::select(.data$wpid,.data$gene) #TERM2GENE
#for the same pathways id's get the full descriptive name
wpid2name <- gene.dl %>% dplyr::select(.data$wpid,.data$name) #TERM2NAME
}else if(pathway.db == "KEGG"){
#data(GSVAdata::c2BroadSets)
c2Bsets <- RVA::c2BroadSets
KEGG <- c2Bsets[c(grep("^KEGG", names(c2Bsets)))]
gene.dl <- map(KEGG, bsetfunc) %>% bind_rows() %>% set_names(c("gs_name","entrez_gene"))
#from the overall data, extract pathways id and gene number
wpid2gene <- gene.dl
#for the same pathways id's get the full descriptive name
wpid2name <- NULL
}else if (pathway.db == "REACTOME"){
#data(c2BroadSets)
c2Bsets <- RVA::c2BroadSets
REAC <- c2Bsets[c(grep("^REACTOME", names(c2Bsets)))]
gene.dl <- map(REAC, bsetfunc) %>% bind_rows() %>% set_names(c("gs_name","entrez_gene"))
#from the overall data, extract pathways id and gene number
wpid2gene <- gene.dl
#for the same pathways id's get the full descriptive name
wpid2name <- NULL
}else if (pathway.db == "Hallmark"){
gene.dl <- msigdbr::msigdbr(species = "Homo sapiens",category = c("H")) %>%
dplyr::select(.data$gs_name, .data$entrez_gene)
#from the overall data, extract pathways id and gene number
wpid2gene <- gene.dl
#for the same pathways id's get the full descriptive name
wpid2name <- NULL
}else if (pathway.db == "rWikiPathways_aug_2020"){
#throw warning about timestamp
#gene.dl <- clusterProfiler::read.gmt('wikipathways-20200710-gmt-Homo_sapiens.gmt')
#gene.dl <- gene.dl %>% tidyr::separate(ont, c("name","version","wpid","org"), "%")
gene.dl <- RVA::wpA2020
#from the overall data, extract pathways id and gene number
wpid2gene <- gene.dl %>% dplyr::select(.data$wpid,.data$gene) #TERM2GENE
#for the same pathways id's get the full descriptive name
wpid2name <- gene.dl %>% dplyr::select(.data$wpid,.data$name) #TERM2NAME
}
}
if(!is.null(customized.pathways)){
if(is.null(pathway.db)){
if(nrow(customized.pathways) < 100){
message(paste0("\n\n!! Customized pathway information alone will be used for analysis without any additional database,",
" if accumulated number of genes in pathway is too small compared to all genes provided in summary statistics table,",
" the result may not be applicable !!\n\n"))
}
wpid2gene <- customized.pathways
wpid2name <- NULL
}else{
if(pathway.db %in% c("rWikiPathways","rWikiPathways_aug_2020")){
colnames(customized.pathways) <- c("wpid","gene")
customized.pathways$gene <- as.character(customized.pathways$gene)
wpid2gene <- wpid2gene %>% bind_rows(customized.pathways)
wpid2name <- wpid2name %>% bind_rows(customized.pathways %>% select(wpid = .data$wpid, name = .data$wpid))
}else{
wpid2gene <- wpid2gene %>% bind_rows(customized.pathways)
}
}
}
return (list(wpid2gene, wpid2name))
}
#' @title calculate pathway scores
#' @description Calculate pathway scores
#'
#' @param data A summary statistics table (data.frame) or \code{data.list} generated by DE analysis software like limma or DEseq2
#' @param pathway.db pathway database used
#' @param gene.id.type gene.id.type
#' @param FCflag The column name (character) of fold change information, assuming the FC is log2 transformed. Default = "logFC".
#' @param FDRflag The column name (character) of adjusted p value or FDR. Default = "adj.P.Val".
#' @param FC.cutoff The fold change cutoff (numeric) selected to subset summary statistics table. Default = 1.5.
#' @param FDR.cutoff The FDR cutoff selected (numeric) to subset summary statistics table. Default = 0.05.
#' @param OUT.Directional logical, whether output directional or non-directional pathway analysis result, default: NULL.
#' @param IS.list logical, whether the input is a list, default: NULL
#' @param customized.pathways the customized pathways in the format of two column dataframe to be used in analysis
#' @param ... pass over parameters
#' @return Returns a dataframe.
#'
#' @importFrom rWikiPathways downloadPathwayArchive
#' @importFrom clusterProfiler bitr enricher
#' @importFrom dplyr %>% as_tibble filter mutate left_join bind_rows select pull
#' @import org.Hs.eg.db
#' @importFrom stringr word
#' @importFrom rlang .data
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'
cal.pathway.scores <- function(data, pathway.db, gene.id.type, FCflag, FDRflag, FC.cutoff, FDR.cutoff, OUT.Directional = NULL , IS.list = FALSE,customized.pathways, ...){
#supress warnings
suppressWarnings({
suppressMessages({
#prepare pathway information
collection_0 <- dlPathwaysDB(pathway.db, customized.pathways)
#from the overall data, extract pathways id and gene number
wpid2gene <- collection_0[[1]]
#for the same pathways id's get the full descriptive name
wpid2name <- collection_0[[2]]
#Get background gene list (from all the genes available for the statistical test)
#Conversion from ENSEMBL to ENTREZID
#IMPORTANT!! The background should be the total genelist
if(gene.id.type == "ENTREZID"){
bkgd.genes.entrez <- rownames(data)
}else{
bkgd.genes.entrez <- rownames(data) %>%
word(sep = '\\.') %>%
clusterProfiler::bitr(fromType = gene.id.type, toType = "ENTREZID",OrgDb = org.Hs.eg.db) %>%
pull(.data$ENTREZID)
}
data.subset <- data[,c(FCflag, FDRflag)]
colnames(data.subset) <- c("FC","FDR")
data <- data.subset %>%
as_tibble(rownames = "gene") %>%
filter(abs(.data$FC) >= log2(FC.cutoff), .data$FDR <= FDR.cutoff) #apply filter
#processing pathway analysis
if(nrow(data) < 10){
message("\n Less then 10 genes submitted for pathway analysis, consider choosing a looser cutoff... \n")
}
# 1. Get Enrichment test of increase
if(nrow(filter(data,.data$FC >0)) != 0){
#if this throws an error replacing it with custom error message
tryCatch({
if(gene.id.type == "ENTREZID"){
genes.entrez.up <- data %>% filter(.data$FC > 0)
genes.entrez.up <- data.frame(gene = genes.entrez.up$gene, ENTREZID = genes.entrez.up$gene)
}else{
genes.entrez.up <- data %>%
filter(.data$FC > 0) %>%
pull(.data$gene) %>%
word(sep = '\\.') %>%
clusterProfiler::bitr(fromType = gene.id.type, toType = "ENTREZID",OrgDb = org.Hs.eg.db)
}
ewp.up <- clusterProfiler::enricher(
genes.entrez.up[[2]],
universe = bkgd.genes.entrez,
pAdjustMethod = "fdr",
pvalueCutoff = 1, #p.adjust cutoff; relaxed
qvalueCutoff = 1,
TERM2GENE = wpid2gene,
TERM2NAME = wpid2name)
}, error = function(e) {
stop(paste0("\n\n If a single dataset was input none of the ",
" gene id that are within threshold have valid keys while running ",
" cluster profiler. If a list is being passed, this might have occured with one of the input tables",
" consider changing supplied datasets or loosen threshold values.Stopping all processses. \n\n"
))}
)
if(is.null(ewp.up)){
res.up <- data.frame()
}else{
res.up <- ewp.up@result}
}else{res.up <- data.frame()} #return empty df if no genes are increase
# 2. Get Enrichment test of decrease
if(nrow(filter(data,.data$FC < 0)) != 0){
#If an error is found throw a custom error message
tryCatch({
if(gene.id.type == "ENTREZID"){
genes.entrez.dn <- data %>% filter(.data$FC < 0)
genes.entrez.dn <- data.frame(gene = genes.entrez.dn$gene, ENTREZID = genes.entrez.dn$gene)
}else{
genes.entrez.dn <- data %>%
filter(.data$FC < 0) %>%
pull(.data$gene) %>%
word(sep = '\\.') %>%
clusterProfiler::bitr(fromType = gene.id.type, toType = "ENTREZID",OrgDb = org.Hs.eg.db)
}
ewp.dn <- clusterProfiler::enricher(
genes.entrez.dn[[2]],
universe = bkgd.genes.entrez,
pAdjustMethod = "fdr",
pvalueCutoff = 1, #p.adjust cutoff; relaxed
qvalueCutoff = 1,
TERM2GENE = wpid2gene,
TERM2NAME = wpid2name)
},error = function(e) {
stop(paste0("\n\n If a single dataset was input none of the ",
" gene id that are within threshold have valid keys while running ",
" cluster profiler. If a list is being passed, this might have occured with one of the input tables",
" consider changing supplied datasets or loosen threshold values.Stopping all processses. \n\n"
))}
)
if(is.null(ewp.dn)){
res.dn <- data.frame()
}else{
res.dn <- ewp.dn@result}
}else{res.dn <- data.frame()} #return empty df if no genes are decrease
#obtain result of each direction
if(nrow(res.up) != 0 & nrow(res.dn) != 0){
res.up <- ewp.up@result %>%
as_tibble %>%
mutate(directional.p.adjust = .data$p.adjust)
res.dn <- ewp.dn@result %>%
as_tibble %>%
mutate(directional.p.adjust = -.data$p.adjust)
#identify the duplicated pathways have both up and down information and pick the one with smaller adj.p as final direction
dup.up <- res.up %>%
filter(.data$ID %in% intersect(res.dn$ID,res.up$ID)) %>%
dplyr::select(.data$ID,.data$directional.p.adjust)
reassign.dup <- res.dn %>%
filter(.data$ID %in% intersect(res.dn$ID,res.up$ID)) %>%
left_join(dup.up,by="ID") %>%
mutate(directional.p.adjust = ifelse(abs(.data$directional.p.adjust.x) < abs(.data$directional.p.adjust.y),
.data$directional.p.adjust.x,
.data$directional.p.adjust.y)) %>% # use the smaller p value to assign direction
dplyr::select(-.data$directional.p.adjust.x,-.data$directional.p.adjust.y)
d.res <- res.dn %>%
filter(!(.data$ID %in% reassign.dup$ID)) %>%
bind_rows(reassign.dup, filter(res.up,!(.data$ID %in% reassign.dup$ID))) %>%
dplyr::select(.data$ID, .data$Description,.data$directional.p.adjust) %>%
mutate(direction = ifelse(.data$directional.p.adjust > 0,"up","down"))
}else if(nrow(res.up) == 0 & nrow(res.dn) != 0){
d.res <- res.dn %>%
as_tibble %>%
mutate(directional.p.adjust = -.data$p.adjust) %>%
mutate(direction = ifelse(.data$directional.p.adjust > 0,"up","down"))
}else if(nrow(res.up) != 0 & nrow(res.dn) == 0){
d.res <- res.up %>%
as_tibble %>%
mutate(directional.p.adjust = .data$p.adjust) %>%
mutate(direction = ifelse(.data$directional.p.adjust > 0,"up","down"))
}else{
d.res <- NULL
message("\n No directional enriched pathway has been detected. \n")
}
#3. Get Enrichment test without direction
if(nrow(data)!=0){
if(gene.id.type == "ENTREZID"){
genes.entrez <- data %>% filter(.data$FC > 0)
genes.entrez <- data.frame(gene = genes.entrez$gene, ENTREZID = genes.entrez$gene)
}else{
genes.entrez <- data %>%
filter(.data$FC > 0) %>%
pull(.data$gene) %>%
word(sep = '\\.') %>%
clusterProfiler::bitr(fromType = gene.id.type, toType = "ENTREZID",OrgDb = org.Hs.eg.db)
}
ewp <- clusterProfiler::enricher(
genes.entrez[[2]],
universe = bkgd.genes.entrez,
pAdjustMethod = "fdr",
pvalueCutoff = 1, #p.adjust cutoff; relaxed
qvalueCutoff = 1,
TERM2GENE = wpid2gene,
TERM2NAME = wpid2name)
if(is.null(ewp)){
message("\n No none-directional enriched pathway has been detected. \n")
#helpflag
nd.res <- nullreturn(IS.list = IS.list , type=2)
}else{
nd.res <- ewp@result %>%
dplyr::select(.data$ID, .data$Description, .data$pvalue, .data$p.adjust)
}
message(paste0("\n Enrichment test completed with ",nrow(data), " Differentially expressed genes with min FC ",round(2^min(abs(data$FC)),3)," max adj.p ",round(max(abs(data$FDR)),3),".\n\n"))
if(is.null(d.res)){
out.d.sig <- nullreturn(IS.list = IS.list,type=1)
}else{
out.d.sig <- d.res %>%
mutate(log10.padj = ifelse(.data$direction == "up", -log10(abs(.data$directional.p.adjust)), log10(abs(.data$directional.p.adjust)))) %>%
mutate(fil.cor = ifelse(.data$direction == "up", "#E31A1C", "#1F78B4")) %>%
mutate(Description = factor(.data$Description,levels = .data$Description))
}
if(is.null(nd.res)){
out.nd.sig <- nullreturn(IS.list = IS.list,type=2)
}else{
out.nd.sig <- nd.res %>%
mutate(Description = factor(.data$Description,levels = .data$Description))
}
}else{
out.d.sig <- nullreturn(IS.list = IS.list,type=1)
out.nd.sig <- nullreturn(IS.list = IS.list,type=2)
nd.res <- nullreturn(IS.list = IS.list,type=2)
message("\n No genes passed specified cutoff... return NULL \n")
}
if(isTRUE(OUT.Directional)){
return(out.d.sig)
}else if(isFALSE(OUT.Directional)){
return(out.nd.sig)
}else if(is.null(OUT.Directional)){
return(list(out.d.sig, out.nd.sig, nd.res))
}
})
}) #end of supress warning and errors
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.