R/plot_pathway.R

Defines functions cal.pathway.scores dlPathwaysDB multiPlot prettyGraphs secondCutoffErr nullreturn plot_pathway

Documented in cal.pathway.scores dlPathwaysDB multiPlot nullreturn plot_pathway prettyGraphs secondCutoffErr

#' @title Pathway analysis and visualization
#' @description This is the function to do pathway enrichment analysis (and visualization) with rWikipathways (also KEGG, REACTOME & Hallmark) from a summary statistics table generated by
#' differential expression analysis like \code{limma} or \code{DESeq2}.
#'
#' @param data A summary statistics table (data.frame) or \code{data.list} generated by DE analysis software like limma or DEseq2, where rownames are gene id.
#' @param comp.names A character vector containing the comparison names corresponding to the same order of the \code{data.list}. Default = NULL.
#' @param gene.id.type The gene id format in \code{data} should be one of: ACCNUM, ALIAS, ENSEMBL, ENSEMBLPROT,
#'  ENSEMBLTRANS, ENTREZID, ENZYME, EVIDENCE, EVIDENCEALL, GENENAME, GO, GOALL, IPI, MAP, OMIM,
#'   ONTOLOGY, ONTOLOGYALL, PATH, PFAM, PMID, PROSITE, REFSEQ, SYMBOL, UCSCKG, UNIGENE, UNIPROT.
#' @param FC.cutoff The fold change cutoff (numeric) selected to subset summary statistics table. Default = 1.5.
#' @param FDR.cutoff The FDR cutoff selected (numeric) to subset summary statistics table. Default = 0.05.
#' @param FCflag The column name (character) of fold change information, assuming the FC is log2 transformed. Default = "logFC".
#' @param FDRflag The column name (character) of adjusted p value or FDR. Default = "adj.P.Val".
#'
#' @param Fisher.cutoff The FDR cutoff selected (numeric) for the pathway enrichment analysis' Fisher's exact test with all determined
#' Differentially Expressed (DE) genes by \code{FC.cutoff} and \code{FDR.cutoff}.
#' @param Fisher.up.cutoff The FDR cutoff selected (numeric) for the pathway enrichment analysis' Fisher's exact test with the upregulated gene set.
#' @param Fisher.down.cutoff The FDR cutoff selected (numeric) for the pathway enrichment analysis' Fisher's exact test with the downregulated gene set.
#' @param plot.save.to The address to save the plot from simplified cutoff combination with FDR of 0.01, 0.05, 0.1, and 0.2.
#' @param pathway.db The databse to be used for encrichment analysis. Can be one of the following, "rWikiPathways", "KEGG", "REACTOME", "Hallmark","rWikiPathways_aug_2020".
#' @param customized.pathways the customized pathways in the format of two column dataframe (column name as "gs_name" and "entrez_gene") to be used in analysis.
#' @param ... pass on variables
#' @return The function returns a list of 5 objects:
#' \item{1}{result table from directional pathway enrichment analysis}
#' \item{2}{result table from non-directional pathway enrichment analysis}
#' \item{3}{plot from directional pathway enrichment analysis}
#' \item{4}{plot from non-directional pathway enrichment analysis}
#' \item{5}{plot combining both directional and non-directional plot}
#'
#' @importFrom ggplot2 ggsave ggplot coord_flip geom_bar geom_hline annotate scale_fill_identity xlab ylab theme theme_minimal scale_fill_identity element_blank
#' @importFrom tidyr separate
#' @importFrom stringr word
#' @importFrom clusterProfiler bitr enricher read.gmt
#' @importFrom dplyr %>% select pull as_tibble filter mutate left_join bind_rows
#' @importFrom rWikiPathways downloadPathwayArchive
#' @import org.Hs.eg.db
#' @importFrom ggpubr as_ggplot
#' @importFrom gridExtra arrangeGrob
#' @importFrom purrr set_names map2
#' @importFrom rlang .data
#'
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'
#' @details The function takes the summary statistics table and use user selected parameter based on check.cutoff to do pathway enrichment analysis
#'
#'
#' @examples result <- plot_pathway(data = Sample_summary_statistics_table,
#' gene.id.type = "ENSEMBL",
#' FC.cutoff = 1.5,
#' p.cutoff = 0.05,
#' pathway.db = "rWikiPathways_aug_2020"
#' )
#' @export plot_pathway
#'
#'


plot_pathway <- function(data = ~df,
                         comp.names = NULL,
                         gene.id.type = "ENSEMBL",
                         FC.cutoff = 1.2,
                         FDR.cutoff = 0.05,
                         FCflag = "logFC",
                         FDRflag = "adj.P.Val",
                         Fisher.cutoff = 0.1,
                         Fisher.up.cutoff = 0.1,
                         Fisher.down.cutoff = 0.1,
                         plot.save.to = NULL,
                         pathway.db = "rWikiPathways",
                         customized.pathways = NULL,
                         ...){

  # Gene set enrichment test (Fisher) in FC positive and FC negative group separately.
  # the p.adjust value will be compared from + and - group and choose the smaller one and assign the +/- value.

  message(paste0("\n\n Running plot pathway. Acceptable data types are list() & data.frame() \n\n"))
  message(paste0("\n\n Current program setting only accepts rownames as ", gene.id.type,", use parameter `gene.id.type` to adjust if needed. \n\n"))

  validate.single.table.isnotlist(data)
  validate.comp.names(comp.names,data)
  validate.pathways.db(pathway.db, customized.pathways)


  if(inherits(data, "list")){
    message("\n Executing list of data \n")
    out.d.sig <- map(data, cal.pathway.scores, pathway.db, gene.id.type, FCflag, FDRflag, FC.cutoff, FDR.cutoff, OUT.Directional = TRUE , IS.list = TRUE, customized.pathways) %>%
      set_names(comp.names) %>%
      bind_rows(, .id = "Comparisons.ID") %>%
      mutate(Description = factor(.data$Description,levels = unique(.data$Description)))

    out.nd.sig <- map(data, cal.pathway.scores, pathway.db, gene.id.type, FCflag, FDRflag, FC.cutoff, FDR.cutoff, OUT.Directional = FALSE , IS.list = TRUE, customized.pathways) %>%
      set_names(comp.names) %>%
      bind_rows(, .id = "Comparisons.ID") %>%
      mutate(Description = factor(.data$Description,levels = unique(.data$Description)))

  }else{
    if(!all(c(FCflag, FDRflag) %in% names(data))){stop(paste0("Please make sure the column names: ",FCflag," and ",FDRflag, " is specified in input dataframe."))}

    path.res <- cal.pathway.scores(data, pathway.db, gene.id.type, FCflag, FDRflag, FC.cutoff, FDR.cutoff, OUT.Directional = NULL , IS.list = FALSE, customized.pathways)
    out.d.sig <- path.res[[1]]
    out.nd.sig <- path.res[[2]]
    nd.res <- path.res[[3]]
    backup.d.sig <- path.res[[1]]
  }


  ##visualize directionally changed pathways
  if(is.null(out.d.sig)){
    message(paste0("\n No directional enriched pathway can be calculated.\n"))
    p.d <- NULL

  }else if(nrow(out.d.sig) == 0){
    message(paste0("\n No directional enriched pathway passed threshold of FDR < ",Fisher.down.cutoff," in down-regulated pathways or ",Fisher.up.cutoff," for up-regulated pathways.\n"))
    p.d <- NULL
  }else{
    out.d.path <- out.d.sig %>%
      filter(.data$directional.p.adjust >= -Fisher.down.cutoff & .data$directional.p.adjust <= Fisher.up.cutoff) %>%
      pull(.data$Description) %>%
      unique
    out.d.sig <- out.d.sig %>%
      filter(.data$Description %in% out.d.path)

    #retain original dataset before transofrmation
    sav.out.d.sig <- out.d.sig

    #case where there is a false signal in one treatment but none in the other
    if (nrow(out.d.sig) == 0 & !inherits(data,"list")) {

      message("\n The cutoff values for Fisher.down.cutoff & Fisher.up.cutoff resulted in 0 values being selected,  consider choosing looser cutoff values \n")
      p.d <- NULL

    } else {

      if (inherits(data,"list")){
        check_0 = unique(out.d.sig$ID)

        if (nrow(out.d.sig) == 0) {

          #if the second cutoff removes all the rows
          message("\n The cutoff values for Fisher.down.cutoff & Fisher.up.cutoff resulted in 0 values being selected,  consider choosing looser cutoff values \n")
          out.d.sig <- secondCutoffErr(out.d.sig ,comp.names, TypeQ = 1)
        }
        else if (!(length(check_0) == 1 & check_0[1] == "")){
          if (sum(out.d.sig$ID == "") > 0)
            message("\n Warning: There were null entries \n")
          out.d.sig <- prettyGraphs(out.d.sig)

          #if the second cutoff eliminated one of the facets then do
        } else if (length(comp.names) != length(unique(out.d.sig$Comparisons.ID))){
          out.d.sig <- secondCutoffErr(out.d.sig ,comp.names, TypeQ = 1)
        }
      }

      p.d <-  ggplot(out.d.sig, aes(x = .data$Description, y= .data$log10.padj,fill=.data$fil.cor )) +
        geom_bar(stat = "identity" ) +
        coord_flip() +
        geom_hline(yintercept=-log10(0.05), linetype="dashed",color = "grey") +
        geom_hline(yintercept=log10(0.05), linetype="dashed",color = "grey") +
        geom_hline(yintercept= 0,color = "black") +
        annotate("rect",
                 xmin = -Inf,
                 xmax = Inf,
                 ymin =-log10(0.05),
                 ymax = log10(0.05),
                 fill = "grey",
                 alpha = 0.4) +
        scale_fill_identity() +
        xlab("Pathway") +
        ylab("-log10(FDR)")

      #make sure the returned dataset only has observations with signals
      out.d.sig <- sav.out.d.sig[sav.out.d.sig$ID != "",]

      if(inherits(data, "list")){
        p.d <- p.d + facet_grid(.~ Comparisons.ID)
      }

    }

  }


  ## non-directional test p value
  if(is.null(out.nd.sig)){
    message(paste0("\n No non-directional enriched pathway can be calculated.\n"))
    p.nd <- NULL
    p.all <- ggplot() +
      annotate("text", x = 4, y = 25, size=8, label = paste0("No non-directional \n enriched pathway\n passed threshold of FDR ",Fisher.cutoff," \n thus a combined plot is not provided")) +
      theme(axis.title.y = element_blank(),
            axis.text.y = element_blank())


  }else if(nrow(out.nd.sig) == 0){
    message(paste0("\n No non-directional enriched pathway passed threshold of FDR ",Fisher.cutoff,". \n"))
    p.nd <- NULL
    p.all <- ggplot() +
      annotate("text", x = 4, y = 25, size=8, label = paste0("No non-directional \n enriched pathway\n passed threshold of FDR ",Fisher.cutoff," \n thus a combined plot is not provided")) +
      theme(axis.title.y = element_blank(),
            axis.text.y = element_blank())

  }else{
    out.nd.path <- out.nd.sig %>%
      filter(.data$p.adjust < Fisher.cutoff) %>%
      pull(.data$Description) %>%
      unique
    out.nd.sig <- out.nd.sig %>%
      filter(.data$Description %in% out.nd.path)

    sav.out.nd.sig <- out.nd.sig

    if (nrow(out.nd.sig) == 0 & !inherits(data,"list")) {

      message("\n The cutoff value for Fisher.cutoff resulted in 0 values being selected \n, consider choosing looser cutoff values \n")
      e_message <- "The cutoff value for Fisher up/down/non-directional cutoff \n resulted in 0 values being selected, \n consider choosing looser cutoff values"
      p.nd <- NULL
      p.all <- ggplot() +
        annotate("text", x = 4, y = 25, size=6, label = paste0(e_message," \n\n Provided Cutoff's ","\n Fisher Cutoff = ",
                                                               Fisher.cutoff,",\n Fisher Cutoff down = ",
                                                               Fisher.down.cutoff,", \n Fisher Cutoff up = ", Fisher.up.cutoff )) +
        theme(axis.title.y = element_blank(),
              axis.text.y = element_blank())


    } else {

      if (inherits(data,"list")){

        #if this is the case we don't have to look at others
        if (nrow(out.nd.sig) == 0) {
          #if the second cutoff removes all the rows
          message(" \n The cutoff value for Fisher.cutoff resulted in 0 values being selected, consider choosing looser cutoff values \n")

          out.nd.sig <- secondCutoffErr(out.nd.sig ,comp.names, TypeQ = 2)
          out.nd.sig[out.nd.sig$pvalue == -99999,]['pvalue'] = 1
          out.nd.sig[out.nd.sig$p.adjust == -99999,]['p.adjust'] = 1
        }
        else if (sum(out.nd.sig$ID == "") > 0){
          check_0 = unique(out.nd.sig$ID)

          if (!(length(check_0) == 1 & check_0[1] == "")){
            #print ("Changed")
            out.nd.sig <- prettyGraphs(out.nd.sig)
            out.nd.sig[out.nd.sig$pvalue == -99999,]['pvalue'] = 1
            out.nd.sig[out.nd.sig$p.adjust == -99999,]['p.adjust'] = 1
          } else if (length(comp.names) != length(unique(out.nd.sig$Comparisons.ID))){
            out.nd.sig <- secondCutoffErr(out.nd.sig ,comp.names, TypeQ = 2)
            out.nd.sig[out.nd.sig$pvalue == -99999,]['pvalue'] = 1
            out.nd.sig[out.nd.sig$p.adjust == -99999,]['p.adjust'] = 1
          }
        }
      }


      #cases where data frame is passed
      #most likely for cases where all treatment groups are null
      check_1 = unique(out.nd.sig$ID)
      check_2 = unique(out.nd.sig$Description)
      if (length(check_1) == 1 & check_1[1] == "" & check_2[1] == "" & mean(out.nd.sig$pvalue) == -99999){
        out.nd.sig[out.nd.sig$pvalue == -99999,]['pvalue'] = 1
        out.nd.sig[out.nd.sig$p.adjust == -99999,]['p.adjust'] = 1
      }

      p.nd <-  ggplot(out.nd.sig,aes(x=.data$Description,y=-log10(.data$p.adjust))) +
        geom_bar(stat = "identity" ) +
        coord_flip() +
        annotate("rect",
                 xmin = -Inf,
                 xmax = Inf,
                 ymin = 0,
                 ymax = -log10(0.05),
                 fill = "grey",
                 alpha = 0.4) +
        geom_hline(yintercept= -log10(0.05),color = "grey") +
        ylab("-log10(FDR) - Fisher's Test") +
        xlab("Pathway") +
        theme_minimal()

      #make sure returned dataset has signals
      out.nd.sig <- sav.out.nd.sig[sav.out.nd.sig$ID != "",]

      if(inherits(data, "list")){
        p.nd <- p.nd + facet_grid(.~ Comparisons.ID)
      }

      # below chunk only run when single summary statistics table is provided
      if(!inherits(data, "list")){


        if(is.null(out.d.sig)){test_1 <- 0}else{test_1 <- nrow(out.d.sig)}
        if(is.null(out.nd.sig)){test_2 <- 0}else{test_2 <- nrow(out.nd.sig)}


        if(!(test_1 > 0 & test_2 > 0)){
          e_message <- "The cutoff value for Fisher up/down/non-directional cutoff \n resulted in 0 values being selected, \n  consider choosing looser cutoff values"
          p.all <- ggplot() +
            annotate("text", x = 4, y = 25, size=6, label = paste0(e_message," \n\n Provided Cutoff's ","\n Fisher Cutoff = ",
                                                                   Fisher.cutoff,",\n Fisher Cutoff down = ",
                                                                   Fisher.down.cutoff,", \n Fisher Cutoff up = ", Fisher.up.cutoff )) +
            theme(axis.title.y = element_blank(),
                  axis.text.y = element_blank())

        }else{


          if(length(unique(out.d.sig$ID)) > length(unique(out.nd.sig$ID))){
            message(paste0("More pathways in directional data than non directional data \n",
                       "hence using pathways exsting in directinal dataset"))
          }else{
            message(paste0("More pathways in non directional data than directional data \n",
                       "hence using pathways exsting in non directinal dataset"))
          }

          ## non-directional test p value - with matched plot
          allID <- unique(c(unique(out.d.sig$ID), unique(out.nd.sig$ID)))

          mplots <- multiPlot(allID, backup.d.sig, nd.res)

          if (!length(mplots) == 2){
            p.all <- mplots
          }else{
            dplot <- mplots[[1]]
            ndplot <- mplots[[2]]

            p.all <- gridExtra::arrangeGrob(ndplot,dplot,
                                            ncol=2,
                                            widths=c(0.75, 0.25)) %>% as_ggplot

          }
        }
      }
    }
  }



  if(is.null(plot.save.to)){
    message("\n Plot file name not specified, a plot in ggplot object will be returned! \n")
  }else{
    ggsave(filename = plot.save.to,
           plot = p.all,
           dpi = 300,
           units = "in",
           device='png')
  }

  if(inherits(data, "list")){
    return(list(out.d.sig, out.nd.sig, p.d, p.nd))

  }else{
    return(list(out.d.sig, out.nd.sig, p.d, p.nd, p.all))
  }

}



#' @title Null Return
#' @description The function takes in a boolean value and a numeric value, which it uses to decide what to output.
#'
#' @param IS.list Indicator of whether the data frame being input is list or not.
#' @param type If type = 1(default) return directional null plot. If type = 2 return non directional null plot.
#'
#' @return The function returns either returns a data frame or the value NULL.
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'
#' @details nullreturn is a function that returns NULL for single df inputs that don't hold true for threshold values. It returns an empty dataframe for
#' list inputs which don't satisfy the cutoff's
#'

nullreturn <- function(IS.list,type=1){
  if (IS.list) {
    if (type == 1){

      # return empty dataset for directional
      return (data.frame("ID"="", "Description"="", "GeneRatio"="", "BgRatio"="",
                         "pvalue"=0, "p.adjust"=0, "qvalue"=0, "geneID"="", "Count"=0,
                         "directional.p.adjust"=0, "direction"="", "log10.padj"=0,"fil.cor"="#E31A1C"))}

    else if (type == 2){

      # return empty for Non Dir. dataset,
      # pval = -99999 so that later we can make adjustments to achieve -log10 pval is 0
      return (data.frame("ID"="", "Description"="", "pvalue"=-99999, "p.adjust"=-99999))
    }

  } else {
    return (NULL)
  }
}


#' @title Second Cutoff Error
#' @description The function takes in a list of dataframe, comp names and a specified type, to output a dataframe styled for ggplot.
#'
#' @param df A list of dataframes.
#' @param comp.names a character vector contain the comparison names corresponding to the same order to the \code{dat.list}. default = NULL.
#' @param TypeQ If type = 1(default) return directional null plot. If type = 2 return non directional null plot.
#'
#' @return Returns a dataframe.
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'
#' @details secondCutoffErr is a function specifically meant to be used for list inputs. It is used for cases where after
#' applying filter to the data, one of the comparison ID gets left out, this adversely effects the ggplot
#'
secondCutoffErr <- function(df,comp.names, TypeQ = 1){
  set.compliment <- setdiff(comp.names,unique(df$Comparisons.ID))
  for (i in set.compliment){
    ndf <- nullreturn(TRUE,TypeQ)
    ndf$Comparisons.ID <- i
    df <- rbind(df, ndf)
  }
  return (df)
}



#special cases where list input and at least one treatment has signal but others don't
# adjust dataframe for visualization


#' @title Pretty Graphs
#' @description Special cases where list input and at least one treatment has signal but others don't.
#'
#' @param vizdf A dataframes of enriched pathways.
#' @param ... pass on variables
#'
#' @return Returns a dataframe.
#'
#' @importFrom dplyr %>% filter
#' @importFrom rlang .data
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'
#' @details Pretty Graphs is a function specifically meant to be in cases where one of the input treatments meet cutoff, but
#' one or more of the other treatments don't meet the cutoff values. This is important so that ggplot doesn't throw any errors.
#'
#'
prettyGraphs <- function(vizdf, ...){
  g1 = vizdf[vizdf$ID == "",]
  g2 = vizdf[vizdf$ID != "",]
  unique_desc = g2$Description
  unique_ID = g2$ID
  newdf = data.frame()
  for (i in g1$Comparisons.ID){
    temp = g1 %>% filter(.data$Comparisons.ID == i)
    temp = temp[rep(seq_len(nrow(temp)), each = length(unique_ID)),]
    temp[["Description"]] = unique_desc
    newdf = rbind(newdf,temp)
  }

  newdf = rbind(newdf,g2)
  return (newdf)

}


#' @title Multi Plot
#' @description Multi plot is for directional and non-directional plots
#'
#' @param allID A vector of all pathway ID's from directional and non directional enriched datasets.
#' @param backup.d.sig A dataframe type of object with directional pathways data prior to any cutoff's being applied
#' @param nd.res A dataframe type of object with non directional pathways data prior to any cutoff's being applied
#' @param ... pass on variables
#'
#' @return Returns ggplot.
#'
#' @importFrom dplyr %>% filter mutate
#' @importFrom rlang .data
#' @importFrom ggplot2 ggsave ggplot coord_flip geom_bar geom_hline annotate scale_fill_identity xlab ylab theme theme_minimal scale_fill_identity element_blank
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'
#' @details Multi plot is for directional and non-directional plots, when one of the plots doesn't contain observations.
#'
#'


multiPlot <- function(allID, backup.d.sig, nd.res, ...){
  #make sure id exists / inner join in both res & sig tables tables
  backup.d.sig_ = backup.d.sig[backup.d.sig$ID %in% nd.res$ID,]
  nd.res_ = nd.res[ nd.res$ID %in% backup.d.sig$ID ,]

  #now eilimate any ID that doesn't exist in the backup columns so that there is no error
  # when filtering the datasets
  allID_ = allID[allID %in% nd.res_$ID]

  matched.d.sig <- backup.d.sig_ %>%
    filter(.data$ID %in% allID_)

  matched.nd.sig <- nd.res_ %>%
    filter(.data$ID %in% allID_)  %>%
    mutate(Description = factor(matched.d.sig$Description,levels = matched.d.sig$Description))

  #if allid_ is empty raise this error,
  if (length(allID_) == 0){

    message(paste0("\n\n No common ID's between n.d & dir. data \n\n"))

    nullplot <- ggplot() +
      annotate("text", x = 4, y = 25, size=8, label = paste0("No common ID's between n.d & dir. data")) +
      theme(axis.title.y = element_blank(),
            axis.text.y = element_blank())
    return (nullplot)

  }else{
    ndplot <- ggplot(matched.nd.sig,aes(x=as.factor(.data$Description),y=-log10(.data$p.adjust))) +
      geom_bar(stat = "identity" ) +
      coord_flip() +
      annotate("rect",xmin = -Inf,xmax = Inf,ymin = 0,ymax = -log10(0.05),fill = "grey", alpha = 0.4) +
      geom_hline(yintercept= -log10(0.05),color = "grey") +
      ylab("-log10(FDR)") +
      xlab("Pathway") +
      theme(axis.title.y = element_blank(),
            axis.text.y = element_blank())


    dplot <- ggplot(matched.d.sig, aes(x = .data$Description, y= .data$log10.padj,fill=.data$fil.cor )) +
      geom_bar(stat = "identity" ) +
      coord_flip() +
      geom_hline(yintercept=-log10(0.05), linetype="dashed",color = "grey") +
      geom_hline(yintercept=log10(0.05), linetype="dashed",color = "grey") +
      geom_hline(yintercept= 0,color = "black") +
      annotate("rect",xmin = -Inf,xmax = Inf,ymin =-log10(0.05),ymax = log10(0.05),fill = "grey", alpha = 0.4) +
      scale_fill_identity() +
      xlab("Pathway") +
      ylab("-log10(FDR)")

    return (list(ndplot,dplot))

  }
}






#' @title DL Pathways DB
#' @description Download gene database for enrichment.
#'
#' @param pathway.db The databse to be used for encrichment analysis. Can be one of the following, "rWikiPathways", "KEGG", "REACTOME", "Hallmark","rWikiPathways_aug_2020"
#' @param customized.pathways the user provided pathway added for analysis.
#'
#' @return Returns a dataframe.
#'
#' @importFrom rWikiPathways downloadPathwayArchive
#' @importFrom clusterProfiler read.gmt
#' @import GSVAdata
#' @importFrom GSEABase setName geneIds
#' @importFrom tidyr separate
#' @importFrom dplyr %>% select pull as_tibble filter mutate left_join bind_rows
#' @import org.Hs.eg.db
#' @importFrom purrr set_names map2 map
#' @importFrom rlang .data
#' @importFrom msigdbr msigdbr
#' @param ... pass over parameters
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'

dlPathwaysDB <- function(pathway.db, customized.pathways = NULL, ...){

  bsetfunc <- function(data){
    return (data.frame(cbind(setName(data),geneIds(data))))
  }

  if(!is.null(pathway.db)){
    if(pathway.db == "rWikiPathways"){

      gene.dl <- rWikiPathways::downloadPathwayArchive(organism="Homo sapiens", format = "gmt") %>%
        clusterProfiler::read.gmt()

      names(gene.dl)[1] <- "ont"
      gene.dl <- gene.dl %>% tidyr::separate(.data$ont, c("name","version","wpid","org"), "%")

      #from the overall data, extract pathways id and gene number
      wpid2gene <- gene.dl %>% dplyr::select(.data$wpid,.data$gene) #TERM2GENE

      #for the same pathways id's get the full descriptive name
      wpid2name <- gene.dl %>% dplyr::select(.data$wpid,.data$name) #TERM2NAME

    }else if(pathway.db == "KEGG"){
      #data(GSVAdata::c2BroadSets)

      c2Bsets <- RVA::c2BroadSets

      KEGG <- c2Bsets[c(grep("^KEGG", names(c2Bsets)))]
      gene.dl <- map(KEGG, bsetfunc) %>% bind_rows() %>% set_names(c("gs_name","entrez_gene"))

      #from the overall data, extract pathways id and gene number
      wpid2gene <- gene.dl

      #for the same pathways id's get the full descriptive name
      wpid2name <- NULL
    }else if (pathway.db == "REACTOME"){

      #data(c2BroadSets)

      c2Bsets <- RVA::c2BroadSets

      REAC <- c2Bsets[c(grep("^REACTOME", names(c2Bsets)))]
      gene.dl <- map(REAC, bsetfunc) %>% bind_rows() %>% set_names(c("gs_name","entrez_gene"))

      #from the overall data, extract pathways id and gene number
      wpid2gene <- gene.dl

      #for the same pathways id's get the full descriptive name
      wpid2name <- NULL
    }else if (pathway.db == "Hallmark"){

      gene.dl <- msigdbr::msigdbr(species = "Homo sapiens",category = c("H")) %>%
        dplyr::select(.data$gs_name, .data$entrez_gene)

      #from the overall data, extract pathways id and gene number
      wpid2gene <- gene.dl

      #for the same pathways id's get the full descriptive name
      wpid2name <- NULL

    }else if (pathway.db == "rWikiPathways_aug_2020"){
      #throw warning about timestamp
      #gene.dl <- clusterProfiler::read.gmt('wikipathways-20200710-gmt-Homo_sapiens.gmt')
      #gene.dl <- gene.dl %>% tidyr::separate(ont, c("name","version","wpid","org"), "%")
      gene.dl <- RVA::wpA2020

      #from the overall data, extract pathways id and gene number
      wpid2gene <- gene.dl %>% dplyr::select(.data$wpid,.data$gene) #TERM2GENE

      #for the same pathways id's get the full descriptive name
      wpid2name <- gene.dl %>% dplyr::select(.data$wpid,.data$name) #TERM2NAME
    }
  }


  if(!is.null(customized.pathways)){
    if(is.null(pathway.db)){

      if(nrow(customized.pathways) < 100){
        message(paste0("\n\n!! Customized pathway information alone will be used for analysis without any additional database,",
                   " if accumulated number of genes in pathway is too small compared to all genes provided in summary statistics table,",
                   " the result may not be applicable !!\n\n"))
      }

      wpid2gene <- customized.pathways
      wpid2name <- NULL

    }else{

      if(pathway.db %in% c("rWikiPathways","rWikiPathways_aug_2020")){

        colnames(customized.pathways) <- c("wpid","gene")
        customized.pathways$gene <- as.character(customized.pathways$gene)

        wpid2gene <- wpid2gene %>% bind_rows(customized.pathways)
        wpid2name <- wpid2name %>% bind_rows(customized.pathways %>% select(wpid = .data$wpid, name = .data$wpid))
      }else{
        wpid2gene <- wpid2gene %>% bind_rows(customized.pathways)
      }

    }
  }

  return (list(wpid2gene, wpid2name))
}



#' @title calculate pathway scores
#' @description Calculate pathway scores
#'
#' @param data A summary statistics table (data.frame) or \code{data.list} generated by DE analysis software like limma or DEseq2
#' @param pathway.db pathway database used
#' @param gene.id.type gene.id.type
#' @param FCflag The column name (character) of fold change information, assuming the FC is log2 transformed. Default = "logFC".
#' @param FDRflag The column name (character) of adjusted p value or FDR. Default = "adj.P.Val".
#' @param FC.cutoff The fold change cutoff (numeric) selected to subset summary statistics table. Default = 1.5.
#' @param FDR.cutoff The FDR cutoff selected (numeric) to subset summary statistics table. Default = 0.05.
#' @param OUT.Directional logical, whether output directional or non-directional pathway analysis result, default: NULL.
#' @param IS.list logical, whether the input is a list, default: NULL
#' @param customized.pathways the customized pathways in the format of two column dataframe to be used in analysis
#' @param ... pass over parameters
#' @return Returns a dataframe.
#'
#' @importFrom rWikiPathways downloadPathwayArchive
#' @importFrom clusterProfiler bitr enricher
#' @importFrom dplyr %>% as_tibble filter mutate left_join bind_rows select pull
#' @import org.Hs.eg.db
#' @importFrom stringr word
#' @importFrom rlang .data
#'
#' @references Xingpeng Li & Siddhartha Pachhai RVA - RNAseq Visualization Automation tool.
#'

cal.pathway.scores <- function(data, pathway.db, gene.id.type, FCflag, FDRflag, FC.cutoff, FDR.cutoff, OUT.Directional = NULL , IS.list = FALSE,customized.pathways, ...){
  #supress warnings

  suppressWarnings({
    suppressMessages({

      #prepare pathway information
      collection_0 <- dlPathwaysDB(pathway.db, customized.pathways)

      #from the overall data, extract pathways id and gene number
      wpid2gene <- collection_0[[1]]

      #for the same pathways id's get the full descriptive name
      wpid2name <- collection_0[[2]]

      #Get background gene list (from all the genes available for the statistical test)
      #Conversion from ENSEMBL to ENTREZID
      #IMPORTANT!! The background should be the total genelist
      if(gene.id.type == "ENTREZID"){
        bkgd.genes.entrez <- rownames(data)
      }else{
        bkgd.genes.entrez <- rownames(data) %>%
          word(sep = '\\.') %>%
          clusterProfiler::bitr(fromType = gene.id.type, toType = "ENTREZID",OrgDb = org.Hs.eg.db) %>%
          pull(.data$ENTREZID)
      }


      data.subset <- data[,c(FCflag, FDRflag)]
      colnames(data.subset) <- c("FC","FDR")

      data <- data.subset %>%
        as_tibble(rownames = "gene") %>%
        filter(abs(.data$FC) >= log2(FC.cutoff), .data$FDR <= FDR.cutoff) #apply filter

      #processing pathway analysis
      if(nrow(data) < 10){
        message("\n Less then 10 genes submitted for pathway analysis, consider choosing a looser cutoff... \n")
      }

      # 1. Get Enrichment test of increase
      if(nrow(filter(data,.data$FC >0)) != 0){

        #if this throws an error replacing it with custom error message
        tryCatch({

          if(gene.id.type == "ENTREZID"){
            genes.entrez.up <- data %>% filter(.data$FC > 0)
            genes.entrez.up <- data.frame(gene = genes.entrez.up$gene, ENTREZID = genes.entrez.up$gene)
          }else{
            genes.entrez.up <- data %>%
              filter(.data$FC > 0) %>%
              pull(.data$gene) %>%
              word(sep = '\\.') %>%
              clusterProfiler::bitr(fromType = gene.id.type, toType = "ENTREZID",OrgDb = org.Hs.eg.db)
          }


          ewp.up <- clusterProfiler::enricher(
            genes.entrez.up[[2]],
            universe = bkgd.genes.entrez,
            pAdjustMethod = "fdr",
            pvalueCutoff = 1, #p.adjust cutoff; relaxed
            qvalueCutoff = 1,
            TERM2GENE = wpid2gene,
            TERM2NAME = wpid2name)

        }, error = function(e) {
          stop(paste0("\n\n If a single dataset was input none of the ",
                      " gene id that are within threshold have valid keys while running ",
                      "  cluster profiler.  If a list is being passed, this might have occured with one of the input tables",
                      "  consider changing supplied datasets or loosen threshold values.Stopping all processses. \n\n"
          ))}
        )

        if(is.null(ewp.up)){
          res.up <- data.frame()
        }else{
          res.up <- ewp.up@result}

      }else{res.up <- data.frame()} #return empty df if no genes are increase


      # 2. Get Enrichment test of decrease
      if(nrow(filter(data,.data$FC < 0)) != 0){

        #If an error is found throw a custom error message
        tryCatch({

          if(gene.id.type == "ENTREZID"){
            genes.entrez.dn <- data %>% filter(.data$FC < 0)
            genes.entrez.dn <- data.frame(gene = genes.entrez.dn$gene, ENTREZID = genes.entrez.dn$gene)
          }else{
            genes.entrez.dn <- data %>%
              filter(.data$FC < 0) %>%
              pull(.data$gene) %>%
              word(sep = '\\.') %>%
              clusterProfiler::bitr(fromType = gene.id.type, toType = "ENTREZID",OrgDb = org.Hs.eg.db)
          }


          ewp.dn <- clusterProfiler::enricher(
            genes.entrez.dn[[2]],
            universe = bkgd.genes.entrez,
            pAdjustMethod = "fdr",
            pvalueCutoff = 1, #p.adjust cutoff; relaxed
            qvalueCutoff = 1,
            TERM2GENE = wpid2gene,
            TERM2NAME = wpid2name)

        },error = function(e) {
          stop(paste0("\n\n If a single dataset was input none of the ",
                      " gene id that are within threshold have valid keys while running ",
                      "  cluster profiler.  If a list is being passed, this might have occured with one of the input tables",
                      " consider changing supplied datasets or loosen threshold values.Stopping all processses. \n\n"
          ))}
        )

        if(is.null(ewp.dn)){
          res.dn <- data.frame()
        }else{
          res.dn <- ewp.dn@result}
      }else{res.dn <- data.frame()} #return empty df if no genes are decrease

      #obtain result of each direction
      if(nrow(res.up) != 0 & nrow(res.dn) != 0){
        res.up <- ewp.up@result %>%
          as_tibble %>%
          mutate(directional.p.adjust = .data$p.adjust)
        res.dn <- ewp.dn@result %>%
          as_tibble %>%
          mutate(directional.p.adjust = -.data$p.adjust)
        #identify the duplicated pathways have both up and down information and pick the one with smaller adj.p as final direction
        dup.up <- res.up %>%
          filter(.data$ID %in% intersect(res.dn$ID,res.up$ID)) %>%
          dplyr::select(.data$ID,.data$directional.p.adjust)

        reassign.dup <- res.dn %>%
          filter(.data$ID %in% intersect(res.dn$ID,res.up$ID)) %>%
          left_join(dup.up,by="ID") %>%
          mutate(directional.p.adjust = ifelse(abs(.data$directional.p.adjust.x) < abs(.data$directional.p.adjust.y),
                                               .data$directional.p.adjust.x,
                                               .data$directional.p.adjust.y)) %>% # use the smaller p value to assign direction
          dplyr::select(-.data$directional.p.adjust.x,-.data$directional.p.adjust.y)

        d.res <- res.dn %>%
          filter(!(.data$ID %in% reassign.dup$ID)) %>%
          bind_rows(reassign.dup, filter(res.up,!(.data$ID %in% reassign.dup$ID))) %>%
          dplyr::select(.data$ID, .data$Description,.data$directional.p.adjust) %>%
          mutate(direction = ifelse(.data$directional.p.adjust > 0,"up","down"))
      }else if(nrow(res.up) == 0 & nrow(res.dn) != 0){
        d.res <- res.dn %>%
          as_tibble %>%
          mutate(directional.p.adjust = -.data$p.adjust) %>%
          mutate(direction = ifelse(.data$directional.p.adjust > 0,"up","down"))
      }else if(nrow(res.up) != 0 & nrow(res.dn) == 0){
        d.res <- res.up %>%
          as_tibble %>%
          mutate(directional.p.adjust = .data$p.adjust) %>%
          mutate(direction = ifelse(.data$directional.p.adjust > 0,"up","down"))
      }else{
        d.res <- NULL
        message("\n No directional enriched pathway has been detected. \n")

      }

      #3.  Get Enrichment test without direction
      if(nrow(data)!=0){
        if(gene.id.type == "ENTREZID"){
          genes.entrez <- data %>% filter(.data$FC > 0)
          genes.entrez <- data.frame(gene = genes.entrez$gene, ENTREZID = genes.entrez$gene)
        }else{
          genes.entrez <- data %>%
            filter(.data$FC > 0) %>%
            pull(.data$gene) %>%
            word(sep = '\\.') %>%
            clusterProfiler::bitr(fromType = gene.id.type, toType = "ENTREZID",OrgDb = org.Hs.eg.db)
        }

        ewp <- clusterProfiler::enricher(
          genes.entrez[[2]],
          universe = bkgd.genes.entrez,
          pAdjustMethod = "fdr",
          pvalueCutoff = 1, #p.adjust cutoff; relaxed
          qvalueCutoff = 1,
          TERM2GENE = wpid2gene,
          TERM2NAME = wpid2name)

        if(is.null(ewp)){
          message("\n No none-directional enriched pathway has been detected. \n")
          #helpflag
          nd.res <- nullreturn(IS.list = IS.list , type=2)
        }else{
          nd.res <- ewp@result %>%
            dplyr::select(.data$ID, .data$Description, .data$pvalue, .data$p.adjust)

        }

        message(paste0("\n Enrichment test completed with ",nrow(data), " Differentially expressed genes with min FC ",round(2^min(abs(data$FC)),3)," max adj.p ",round(max(abs(data$FDR)),3),".\n\n"))

        if(is.null(d.res)){
          out.d.sig <- nullreturn(IS.list = IS.list,type=1)
        }else{
          out.d.sig <- d.res %>%
            mutate(log10.padj = ifelse(.data$direction == "up", -log10(abs(.data$directional.p.adjust)), log10(abs(.data$directional.p.adjust)))) %>%
            mutate(fil.cor = ifelse(.data$direction == "up", "#E31A1C", "#1F78B4")) %>%
            mutate(Description = factor(.data$Description,levels = .data$Description))
        }


        if(is.null(nd.res)){
          out.nd.sig <- nullreturn(IS.list = IS.list,type=2)
        }else{
          out.nd.sig <- nd.res %>%
            mutate(Description = factor(.data$Description,levels = .data$Description))
        }

      }else{
        out.d.sig <-  nullreturn(IS.list = IS.list,type=1)
        out.nd.sig <- nullreturn(IS.list = IS.list,type=2)
        nd.res <- nullreturn(IS.list = IS.list,type=2)
        message("\n No genes passed specified cutoff... return NULL \n")
      }

      if(isTRUE(OUT.Directional)){
        return(out.d.sig)
      }else if(isFALSE(OUT.Directional)){
        return(out.nd.sig)
      }else if(is.null(OUT.Directional)){
        return(list(out.d.sig, out.nd.sig, nd.res))
      }
    })
  }) #end of supress warning and errors
}

Try the RVA package in your browser

Any scripts or data that you put into this service are public.

RVA documentation built on Nov. 2, 2021, 1:06 a.m.