R/GSEA.R

Defines functions .citeggpubr .citetibble .citeggdendro .citeggplot .citedplyr .citegodb .citebase .citeanno .citehyper .citefgsea .citeorgmm .citeorghs seekCreateGeneSet seekGSEplot .mycolorgradient seekGSEoverview seekGSEcluster .gseaClustering seekGSEbubble seekGSEhyper seekGSEA seekGSErl seekGO

Documented in seekCreateGeneSet seekGSEA seekGSEbubble seekGSEoverview seekGSEplot seekGSErl

# Install Package: Ctrl + Shift + B

# devtools::document()


#' Function to output a list of GO sets
#'
#' Imports:
#' org.Hs.eg.db,
#' org.Mm.eg.db,
#' AnnotationDbi,
#' GO.db
#'
#' @param species string, either "human" or "mouse". Is used to retreive the right Entrez IDs for the GO pathways
#' @param ont string, GO subsection: "BP", "MF" or "CC"
#' @return list where each item is a vector of entrez IDs and the item name is the GO description/name
seekGO <- function(species, ont="BP"){
  #==GO=database==#
  ##goal: create a list of GO terms for a chosen species & ontology, that has the Entrez IDs for each term
  # creates a list: each item is a GO term (its name is the GOID, e.g. GO:0001234) for that species. Each item contains the Entrez IDs for it.
  # org.Hs.egGO2EG: GO-IDs with entrezids directly associated to them (no parent-child inheritance)
  # org.Hs.egGO2ALLEGS: GO-IDs with entrezids directly or indirectly associated to them (inheritance)
  goList <- switch(species,
                   "human"=as.list(org.Hs.eg.db::org.Hs.egGO2EG),
                   "mouse"=as.list(org.Mm.eg.db::org.Mm.egGO2EG))

  #creates a data.frame with GOIDs (from the chosen ontology) as rownames, and GO descriptive names as the first (and only) column
  goTerm <- as.data.frame(AnnotationDbi::Term(GO.db::GOTERM[AnnotationDbi::Ontology(GO.db::GOTERM)==ont])) # GOTERM can be used with Ontology(), GOID(), Term() & Definition() using the AnnotationDbi package
  colnames(goTerm) <- "GO_name"

  #creates a string vector of GOIDs that are present in goList (i.e. in the chosen species) AND in goTerm (i.e in the chosen ontology)
  goID <- rownames(goTerm)[which(rownames(goTerm) %in% names(goList))]

  #create a list like goList, but only with terms from the chosen ontology
  goEntrez <- lapply(goID, function(x) as.vector(unique(goList[[x]])))
  names(goEntrez) <- goTerm[goID,"GO_name"]

  #==citation==#
  message("please cite the following packages:\n",  .citeorghs(), .citeorgmm(), .citeanno(), .citegodb())

  #==output==#
  goEntrez
}

#' creates a ranked list from diff.Expression data
#'
#' @param DE a data.frame with a column "log2FC" and entrezids as rownames. Optionally a "padj" column if pFilter is to be used.
#' @param pFilter float, cutoff below which genes are excluded from the list (takes colp to determine which column to filter)
#' @param colentrez string, name of column containing entrezids
#' @param colrank string, name of column values should be ranked after (high to low)
#' @param colp string, name of column by which the pFilter operates
#' @return A sorted vector with rank values (e.g. log2FC) and entrezids as names
#' @examples
#' myDE <- sqDE
#' rl <- seekGSErl(myDE, pFilter=0.05, colentrez="EntrezId")
seekGSErl <- function(DE, pFilter=NA, colentrez="entrezid", colrank="log2FC", colp="padj"){
  DE <- as.data.frame(DE)
  DE[,colentrez] <- as.character(DE[,colentrez])
  if(length(unique(DE[,colentrez]))<length(DE[,colentrez])){
    message("Entrez IDs are not unique. Coercion to unique values.")
    DE[,colentrez] <- gsub("^X","",make.names(DE[,colentrez], unique=T))
  }
  if(!is.na(pFilter)) DE <- subset(DE, DE[,colp]<=pFilter)

  DE <- DE[order(DE[,colrank], decreasing=T),]
  rankedList <- DE[,colrank]
  names(rankedList) <- as.character(DE[,colentrez])
  rankedList <- rankedList[!rankedList %in% "NaN"]
  rankedList <- na.omit(rankedList)
  rankedList <- rankedList[!duplicated(names(rankedList))]

  #==output==#
  rankedList
}

#' Gene set entrichment analysis using the fgsea package
#'
#' Imports:
#' fgsea,
#' org.Hs.eg.db,
#' org.Mm.eg.db,
#' AnnotationDbi,
#' GO.db
#'
#' @param rankedList vector sorted from high to low values of the ranking metric (e.g. log2 fold change), with entrezids as names.
#' @param genesetList either the string ("BP", "MF" or "CC" for the GO subcollections), or a NAMED list of genesets of Entrez IDs, all the same species as the rankedList!
#' @param species string, either "human" or "mouse". Is used to retreive the right Entrez IDs for the GO pathways
#' @param output_IDs string, ID type to be returned for the table (e.g.: "SYMBOL","ENTREZID","ENSEMBL",...)
#' @param minSize Minimal size of a gene set to test. All pathways below the threshold are excluded.
#' @param maxSize Maximal size of a gene set to test. All pathways above the threshold are excluded.
#' @param nperm Number of permutations to do. Minimial possible nominal p-value is about 1/nperm
#' @return data.frame where each row is a GO term, and each column is a parameter of information about the term (e.g. ID, p value, etc.).
#' @examples
#' myDE <- sqDE
#' rl <- seekGSErl(myDE, pFilter=0.05, colentrez="EntrezId")
#' mygsea <- seekGSEA(rl, Species="mouse", Ont="BP")
seekGSEA <- function(rankedList, genesetList="BP", species=NA, output_IDs="SYMBOL", minSize=1, maxSize=Inf){
  rangedList <- rankedList[!duplicated(names(rankedList))]
  if(is.character(genesetList)){
    if(is.na(species)){stop("Run aborted. No species defined.")}
    message(paste0("GSEA on on GO-", genesetList))
    goEntrez <- seekGO(species, genesetList)
  }else{
    message(paste0("GSEA on custom genesets"))
    goEntrez <- genesetList
    goTerm <- data.frame(GO_ID=names(goEntrez), row.names=names(goEntrez))
  }

  #==GSEA==#
  print("fgsea multilevel calculation")
  tab <- fgsea::fgseaMultilevel(pathways=goEntrez, stats=rankedList, minSize=minSize, maxSize=maxSize)
  print("expanding the table...")
  tab <- merge(goTerm, tab, by.x=0, by.y="pathway")
  colnames(tab)[1] <- "GO_ID"
  tab <- tab[order(tab$pval),]

  leadingEdge <- tab$leadingEdge
  tab <- tab[,c(1:7)] #exclude set size and leading edge (will be reintroduced now)

  #==expand the table==#
  # for each set, calculate number of genes in the set and number of genes found in the rankedList
  print("expanding the table...")
  setGenes <- lapply(tab[,1], function(x) unique(goEntrez[[x]])) # all entrez ids for each GO term in the table
  tab$hit_size <- sapply(setGenes, function(x) sum(names(rankedList) %in% x))
  tab$set_size <- sapply(setGenes, length)
  tab$hit_perc <- tab$hit_size/tab$set_size
  # genes from the rankedList that are found in the set
  print("expanding the table...")
  hitGenes <- seekX2Y(names(rankedList),xtype="ENTREZID",ytype=output_IDs,species=species)
  tab$hit_genes <- sapply(setGenes, function(x) paste0(sort(hitGenes[names(rankedList) %in% x]), collapse=","))
  # all genes in the set
  print("expanding the table...")
  all_genes <- unique(unlist(setGenes))
  names(all_genes) <- seekX2Y(all_genes, xtype="ENTREZID",ytype=output_IDs,species=species)
  tab$set_genes <- sapply(setGenes, function(x) paste0(sort(names(all_genes)[all_genes %in% x]), collapse=","))
  # genes on the leading edge (genes on the extreme side of the plot before the curve gets to its maximum or minimum)
  print("expanding the table...")
  edge_genes <- unique(unlist(leadingEdge))
  names(edge_genes) <- seekX2Y(edge_genes, xtype="ENTREZID",ytype=output_IDs,species=species)
  tab$leadingEdge <- sapply(leadingEdge, function(x) paste0(sort(names(edge_genes)[edge_genes %in% x]), collapse=","))

  #==citation==#
  message("please cite the following packages:\n",  .citeorghs(), .citeorgmm(), .citeanno(), .citegodb(), .citefgsea())

  #==output==#
  tab
}

#' Gene set overrepresentation using the hypeR package
#'
#' Imports:
#' hypeR,
#' dplyr
#' org.Hs.eg.db,
#' org.Mm.eg.db,
#' AnnotationDbi,
#' GO.db
#'
#' @param mygenes vector with all genes that will be tested
#' @param genesetList either the string ("BP", "MF" or "CC" for the GO subcollections), or a NAMED list of genesets of Entrez IDs, all the same species as the rankedList!
#' @param species string, either "human" or "mouse". Is needed if the input ID types and the output ID type are not all the same
#' @param input_geneIDs string, ID type used for the mygenes parameter (e.g.: "SYMBOL","ENTREZID","ENSEMBL",...)
#' @param input_setIDs string, ID type used for the genesetList parameter (e.g.: "SYMBOL","ENTREZID","ENSEMBL",...)
#' @param output_IDs string, ID type to be returned for the table (e.g.: "SYMBOL","ENTREZID","ENSEMBL",...)
#' @return data.frame where each row is a GO term, and each column is a parameter of information about the term (e.g. ID, p value, etc.).
#' @examples
#' myanalysis <- seekGSEhyper(c("HOXA9","HOXA10","MTOR"), species="human")
seekGSEhyper <- function(mygenes, genesetList="BP", species=NA, input_geneIDs="SYMBOL", input_setIDs="ENTREZID", output_IDs="SYMBOL"){
  if(unique(c(input_geneIDs, input_setIDs, output_IDs)>1)){
    if(is.na(species)) stop("Specify species as either 'mouse' or 'human'.")
  }
  mygenes <- as.character(mygenes)
  if(is.character(genesetList)) genesetList <- seekGO(species, genesetList)

  #1: the input gene IDs are changed so we have a vector with the sets ID type as name and the gene ID type as value
  #(its shorter to change the genelist than the sets)
  genesID_to_setsID <- seekX2Y(mygenes, input_geneIDs, input_setIDs, species)
  #2: vector with the output ID type as name and the sets ID type as value
  setsID_to_outputID <- seekX2Y(as.vector(genesID_to_setsID), input_setIDs, output_IDs, species)

  outtable <- hypeR::hypeR(as.vector(genesID_to_setsID), genesetList) #overrepresentation analysis
  outtable <- outtable$data
  outtable <- dplyr::rename(outtable, "padj"="fdr", "GO_ID"="label","hit_genes"="hits","set_size"="geneset","hit_size"="overlap")
  outtable$hit_perc <- outtable$hit_size/outtable$set_size
  outtable$NES <- 0
  #translate the found genes to the output ID type
  if(!input_setIDs %in% output_IDs){
    outtable$hit_genes <- sapply(genesetList[outtable$GO_ID], function(x) paste0(sort(setsID_to_outputID[names(setsID_to_outputID) %in% x]),
                                                                                 collapse=","))
  }

  #==citation==#
  message("please cite the following packages:\n",  .citeorghs(), .citeorgmm(), .citeanno(), .citegodb(), .citehyper(), .citedplyr())

  #==output==#
  outtable
}

#' bubbleplot for GSEAs
#'
#' Imports:
#' ggplot2
#'
#' @param gsealist list of gsea tables, obtained from seekGSEA() function from this package
#' @param minoverlap minimum number of sets that have to have this enriched term (significant or not)
#' @param pcutofft p value cutoff by which pathways are discarded in the beginning. Usefull if the "arguments imply differing number of rows" error comes up which means that some items in the list have no row either due to the pvalue filtering (set this to 1 then) or because they didn't have it to begin with (in that case, exclude the itemfrom the list)
#' @return a ggplot object; seethrough dots have p>0.05 (not significant)
#' @examples
#' myDE1 <- sqDE
#' rl1 <- seekGSErl(myDE1, pFilter=0.05, colentrez="EntrezId")
#' mygsea1 <- seekGSEA(rl1, species="mouse", genesetList="BP")
#' myDE2 <- sqDE
#' rl2 <- seekGSErl(myDE2, pFilter=0.05, colentrez="EntrezId")
#' mygsea2 <- seekGSEA(rl2, species="mouse", genesetList="BP")
#' seekGSEbubble(list(g1=mygsea1, g2=mygsea2), 2)
seekGSEbubble <- function(gsealist, minoverlap=1, maxoverlap=NA, pcutoff=0.05, discardEmpty=T, midcol="white",
                          dendrogram=T, bubble_to_dendro_ratio=4){
  significantgenesets <- lapply(gsealist, function(x) subset(x, pval<=pcutoff)$GO_ID) %>% unlist() %>% table() %>% {names(.[. >= minoverlap])}
  if(is.na(maxoverlap)) maxoverlap <- length(gsealist)
  dfgsea1 <- gsealist
  if(discardEmpty) dfgsea1 <- lapply(dfgsea1, function(x) subset(x, hit_size>0))
  dfgsea1 <- lapply(dfgsea1, function(x) subset(x, GO_ID %in% significantgenesets, select=c("GO_ID","pval","NES","hit_genes")))

  allHitGenes <- do.call(cbind, lapply(dfgsea1, function(x) x$hit_genes))
  allHitGenes <- apply(allHitGenes, 1,function(x) paste0(x, collapse=","))
  dfgsea2 <- dfgsea1[[1]][,c("GO_ID","hit_genes")]
  dfgsea2$hit_genes <- allHitGenes

  clust <- .gseaClustering(dfgsea2)
  dfgsea3 <- dfgsea2[clust$order,]
  dfgsea3 <- cbind(dfgsea3["GO_ID"], clusterorder=1:nrow(dfgsea3))

  dfgsea <- lapply(dfgsea1, function(x) merge(x,dfgsea3, by="GO_ID",sort=F))
  #dfgsea <- mutate(dfgsea, experiment=factor(experiment, levels=unique(experiment)))
  xaxisposition <- -seq(length(gsealist))
  xaxislabels <- `names<-`(rev(names(gsealist)), as.character(xaxisposition))
  yaxislabels <- `names<-`(dfgsea3$GO_ID, as.character(seq(nrow(dfgsea3))))
  dfgsea <- mapply(function(x,y,z) mutate(x, experiment=y, xposition=z), x=dfgsea, y=names(dfgsea), z=xaxisposition, SIMPLIFY=F)
  dfgsea <- do.call(rbind, dfgsea)
  dfgsea$GO_ID <- factor(dfgsea$GO_ID)

  # generalplot <- ggplot2::scale_y_continuous(expand=c(0,0), breaks=as.numeric(names(xaxislabels)), labels=xaxislabels) +
  #   ggplot2::coord_flip() +

  colorgradient <- .mycolorgradient(200, highcol="red", midcol=midcol, lowcol="blue")
  gradientmax <- ceiling(max(abs(dfgsea$NES))*10)/10

  mydendro <- ggdendro::ggdendrogram(clust) +
    ggplot2::coord_flip() +
    ggplot2::theme(axis.text.y=ggplot2::element_blank(), axis.text.x=ggplot2::element_blank())

  mybubble <- ggplot2::ggplot(mapping=ggplot2::aes(y=experiment, x=clusterorder)) +
    ggplot2::coord_flip() +
    ggplot2::scale_x_continuous(breaks=as.numeric(names(yaxislabels)), labels=yaxislabels) +
    ggplot2::geom_point(ggplot2::aes(fill=NES, size= -log10(pval)), alpha= 1, shape=21, data=subset(dfgsea, pval<=.05)) +
    ggplot2::geom_point(ggplot2::aes(color=ifelse(NES>0,"1","2"), size= -log10(pval)), alpha= 1, shape=21, data=subset(dfgsea, pval> .05), fill=NA) +
    ggplot2::scale_fill_gradientn(colours=colorgradient, limits=c(-gradientmax, gradientmax)) +
    ggplot2::scale_color_manual(values=c("red","blue")) +
    ggplot2::labs(fill=expression(italic(NES)), size=expression(-log[10]~italic(p))) +
    ggplot2::guides(color=F) +
    ggplot2::theme_classic() +
    ggplot2::theme(axis.title=element_blank(), axis.text.x=element_text(hjust=1, angle=45), axis.line=element_blank(),
                   axis.ticks=element_blank())

  mytitle <- paste("showing sets enriched in",minoverlap,"to",maxoverlap,"with p <=",pcutoff)
  if(dendrogram){
    plotme <- ggpubr::ggarrange(mybubble,NULL,mydendro, ncol=3, nrow=1, common.legend=T, widths=c(bubble_to_dendro_ratio,-.1,1), legend="right", align="h")
    plotme <- ggpubr::annotate_figure(plotme, top=mytitle)
  }else{
    plotme <- mybubble + ggplot2::ggtitle(mytitle)
  }

  # if(dendrogram){
  #   plotme <- ggdendro::ggdendrogram(clust)
  #   maxX <- ceiling(max(clust$height))
  # }else{
  #   plotme <- ggplot2::ggplot()
  #   maxX <- .5
  # }
  #
  # plotme <- plotme +
  #   ggplot2::scale_y_continuous(expand=c(0,0), breaks=as.numeric(names(xaxislabels)), labels=xaxislabels, limits=c(as.numeric(names(xaxislabels))[length(xaxislabels)]-.5, maxX)) +
  #   ggplot2::scale_x_continuous(breaks=as.numeric(names(yaxislabels)), labels=yaxislabels) +
  #   ggplot2::coord_flip() +
  #   ggplot2::ggtitle(paste("enriched in",minoverlap,"or more")) +
  #   ggplot2::geom_point(ggplot2::aes(y=xposition, x=clusterorder, fill=NES, size= -log10(pval)), alpha= 1, shape=21, data=subset(dfgsea, pval<=.05)) +
  #   ggplot2::geom_point(ggplot2::aes(y=xposition, x=clusterorder, color=NES, size= -log10(pval)), alpha= 1, shape=21, data=subset(dfgsea, pval> .05), fill=NA) +
  #   ggplot2::scale_fill_gradient2(low="royalblue4", mid=midcolor, high="red", midpoint=0) +
  #   ggplot2::scale_color_gradient2(low="royalblue4", mid=midcolor, high="red", midpoint=0) +
  #   ggplot2::theme_classic() +
  #   ggplot2::theme(axis.title=element_blank(), axis.text.x=element_text(hjust=1, angle=45), axis.text.y=element_text(angle=0), axis.line=element_blank(),
  #                  axis.ticks=element_blank())

  #==citation==#
  message("please cite the following packages:\n", .citeggplot(), .citeggpubr(), .citedplyr())

  #==output==#
  plotme
}

#this function takes a gsea dataframe and returns the output of the hclust function
.gseaClustering <- function(gseaToCluster){
  #list all hit genes and make a matrix that says for each gsea set (rows) if it contains the genes (columns)
  allgenes <- unique(unlist(strsplit(gseaToCluster$hit_genes, ",")))
  geneframe <- as.data.frame(matrix(nrow=nrow(gseaToCluster), ncol=length(allgenes)))
  colnames(geneframe) <- allgenes
  rownames(geneframe) <- gseaToCluster$GO_ID
  geneframe <- tibble::rownames_to_column(geneframe, "id")
  #the allgenes vector is the same as the colnames, so we can use it to check for presence of genes in the terms
  finalframe <- geneframe %>% apply(1,function(x) c(x[1], as.numeric(allgenes %in% unlist(strsplit(subset(gseaToCluster, GO_ID %in% x[1])$hit_genes, ","))))) %>%
    t() %>% as.data.frame() %>% column_to_rownames("id") %>% `colnames<-`(allgenes)
  #from this matrix, calculate the dist-matrix which is used by the clustering algorithm
  distmat <- dist(finalframe, method="euclidean")
  #important outputs: the hclust output, and a dataframe with GO_IDs and their cluster order
  clust <- hclust(distmat, method="complete")
  myclusterorder <- data.frame(GO_ID=clust$labels, clusterorder=clust$order)

  clust
}


#' dendogram from GSEA results
#'
#' Imports:
#' ggdendro,
#' ggplot2
#'
#' @param mygsea data.frame, output from seekGSEA or seekGSEhyper
#' @return a list with 2 items: a ggplot object (dendrogram) and a vector with the gene set names in dendrogram order
#' @examples
#' myDE1 <- sqDE
#' rl1 <- seekGSErl(myDE1, pFilter=0.05, colentrez="EntrezId")
#' mygsea1 <- seekGSEA(rl1, species="mouse", genesetList="BP")
#' mycluster <- seekGSEcluster(mygsea1)
seekGSEcluster <- function(mygsea, pcutoff=0.05, fillcolorColumn="NES", highcol="red", midcol="white", lowcol="blue"){
  colorgradient <- .mycolorgradient(200, highcol=highcol, midcol=midcol, lowcol=lowcol)
  mygsea <- subset(mygsea, pval<=pcutoff)
  clust <- .gseaClustering(mygsea)
  mydfgsea <- mygsea[clust$order,]
  mydfgsea <- cbind(mygsea, clusterorder=1:nrow(mydfgsea), perc_found=mydfgsea$hit_size/mydfgsea$set_size)
  mydfgsea$fillColor <- mydfgsea[,fillcolorColumn]
  gradientmax <- ceiling(max(abs(mydfgsea$fillColor))*10)/10

  myplot <- ggdendro::ggdendrogram(clust) +
    ggplot2::coord_flip() +
    ggplot2::theme(axis.text.y=element_text(angle=0), axis.text.x=element_blank()) +
    ggplot2::geom_point(data=mydfgsea, mapping=ggplot2::aes(y=0, x=clusterorder, size=-log2(pval), fill=fillColor), shape=21) +
    ggplot2::labs(fill=gsub("_"," ",fillcolorColumn), size=expression(-log[2]~italic(p))) +
    ggplot2::scale_fill_gradientn(colors = colorgradient, limits=c(-gradientmax, gradientmax))

  #==citation==#
  message("please cite the following packages:\n", .citeggplot(), .citeggdendro(), .citetibble())

  #==output==#
  print(myplot)
  list(dendrogram=myplot, labels=clust$labels[clust$order])
}

#' Overview bar graph for GO terms
#'
#' Imports:
#' ggplot2
#'
#' @param tab data.frame, direct output from seekGSEA
#' @param Title plot totle
#' @param n integer, number of GO terms to display
#' @param fillcolors vector with 2 strings indicating colors, colors will be used for the p value color gradient
#' @return ggplot2 object
#' @examples
#' myDE <- sqDE
#' rl <- seekGSErl(myDE[[1]])
#' mygsea <- seekGSEA(rl, Species="mouse", setList="BP")
#' seekGSEoverview(mygsea)
seekGSEoverview <- function(tab, Title="", n=20, fillColors=c("green","yellow"), pvalThreshold=0.05, orderby="pval"){
  tab <- subset(tab, pval<=pvalThreshold)
  if(nrow(tab)<n) n <- nrow(tab)
  tab <- tab[order(tab$pval, decreasing=T),]
  tab <- tab[1:n,]
  tab <- tab[order(tab[,orderby], decreasing=F),]
  tab$color <- ifelse(tab$NES>0, "red", "blue")
  tab$GO_ID <- factor(tab$GO_ID, levels=tab$GO_ID)
  tab$percent_found <- tab$hit_size/tab$set_size
  outplot <- ggplot2::ggplot(tab, ggplot2::aes(x=GO_ID, y=NES, fill=pval)) + #color=color
    ggplot2::geom_point(stat="identity", shape=21, ggplot2::aes(size=percent_found)) +
    ggplot2::ggtitle(Title) +
    ggplot2::coord_flip() +
    ggplot2::scale_color_manual(values = levels(factor(tab$color))) + #color according to color column
    ggplot2::scale_fill_gradient(low=fillColors[1], high=fillColors[2]) +
    ggplot2::theme(axis.title.y=ggplot2::element_blank()) +
    ggplot2::geom_vline(xintercept=0, linetype="dashed")

  #==citation==#
  message("please cite the following packages:\n", .citeggplot())

  #==output==#
  outplot
}

#get a vector with colors for a color gradient
.mycolorgradient <- function(n, highcol, midcol, lowcol){
  colfunc1 <- colorRampPalette(c(lowcol, midcol)) #functions for getting a color ramp
  colfunc2 <- colorRampPalette(c(midcol, highcol))
  colorgradient<- c(colfunc1(n/2), colfunc2(n/2))

  #==output==#
  colorgradient
}


#' GSEA plot
#'
#' Imports:
#' ggplot2,
#' fgsea,
#' AnnotationDbi,
#' GO.db,
#' org.Hs.eg.db,
#' org.Mm.eg.db
#'
#' @param rankedList A vector sorted from high to low values of the ranking metric (e.g. log2 fold change), with names according to entrezids.
#' @param setID string, either GO-ID (e.g. "GO:0006355") or name of a custom set included in the setList
#' @param setList either string ("BP", "CC" or "MF") or a list of custom datasets, where each item in the list is a vector of strings (entrezids)
#' @param Species a string, either "human" or "mouse". Is used to retreive the right Entrez IDs for the GO pathways. Unimportant if a custom setID is used.
#' @param gseaTable data.frame, direct output from seekGSEA. Adds statistics to the plot.
#' @param annotations vector of 2 strings, which will be written underneath the color ribbon
#' @param plotTitle string, replaces the gene set name as the plot title
#' @param cutC log2FC threshold, changes the vlines and the color ribbon
#' @param gseaParam GSEA parameter  determines ES-gain per hit (default: 1)
#' @param vlines boolean, vertical lines indicate the log2FC threshold
#' @param hlines boolean, horizontal lines indicate the ES scores
#' @param lineSize width of main line through the plot
#' @param tickSize width of ticks in the barcode (default: 0.2)
#' @param textSize text size for plot title, axis titles and axis labels
#' @param annoSize text size for annoations (below color ribbon) and statistics
#' @param colors vector with 3 strings, naming the colors used for: upregulated ribbon, downregulated ribbon, main line
#' @param yTitle boolean, should there be a title for the y axis?
#' @param xTitle boolean, should there be a title for the x axis?
#' @param xText boolean, should there be a x axis labels?
#' @return ggplot object
#' @examples
#' myDE <- sqDE
#' rl <- seekGSErl(myDE[[1]])
#' mygsea <- seekGSEA(rl, Species="mouse", setList="BP")
#' seekGSEplot(rl, setID="GO:0006355", Species="mouse", gseaTable=mygsea)
seekGSEplot <- function(rankedList, setID, setList="BP", Species=NA, gseaTable=NA,
                        annotations=c("up","down"), plotTitle=NA,
                        cutC=log2(1.5), gseaParam=1,
                        vlines=T, hlines=T, lineSize=1.5, ticksSize=0.2,
                        textSize=30, annoSize=10, colors=c("red","blue","green"),
                        yTitle=T, xTitle=T, xText=T, ggarrangeRows=1) {

  #==check=input==#
  if(is.na(Species)){stop("!!!Error, no Species selected!!!")}
  stats <- rankedList

  ##=pathway=genes=##===================================================================
  #==for=GO=(if)=or=customSets=(else)=#
  if(is.character(setList)){
    setName <- AnnotationDbi::Term(GO.db::GOTERM[AnnotationDbi::Ontology(GO.db::GOTERM)==setList]) #vector of GO-set-names, with GOIDs as vector names
    setName <- as.character(setName[names(setName) %in% setID]) #string of GO-set-name corresponding to GO-ID given by the "setID" parameter
    pathway <- switch(Species,
                      "human"=as.list(org.Hs.eg.db::org.Hs.egGO2EG)[[setID]], #all gene IDs for the specific GO-ID
                      "mouse"=as.list(org.Mm.eg.db::org.Mm.egGO2EG)[[setID]])
  }else{
    setName <- setID
    pathway <- setList[[setID]]
  }
  print(setName)
  setName <- gsub("^REACTOME_","",gsub("^KEGG_","",setName))
  wordcut <- floor(nchar(setName)/2)-3
  getspace <- FALSE

  while(!getspace){
    getspace <- {substr(setName, wordcut,wordcut) %in% c("_",",",";"," ","-",".")}
    if(!getspace){wordcut <- wordcut+1}
    if(wordcut==nchar(setName)){getspace <- TRUE}
  }
  setName <- paste0(substr(setName, 1, wordcut), "\n", substr(setName, wordcut+1, nchar(setName)))

  #=====================================================================================
  #=====================================================================================

  ##=fgsea=calculation=##===============================================================
  #==rankedList=preparation==#
  rnk <- rank(-stats) # gives a vector: 1,2,3,4,... ranking the rankedList (actually unnecessary)
  ord <- order(rnk) # orders the previous vector (also actually unnecessary)
  statsAdj <- stats#[ord] # puts the rankedList into correct order (also actually unnecessary)
  statsAdj <- sign(statsAdj) * (abs(statsAdj) ^ gseaParam) # increases the rl values by gseaParam without losing the - or + before the value
  statsAdj <- statsAdj / max(abs(statsAdj)) # normalizing the rl values by the max

  #==find=genes=from=rankedList=in=setID==#
  pathway <- unname(as.vector(na.omit(match(pathway, names(statsAdj))))) # vector with positions genes from geneset in the ranked list
  pathway <- sort(pathway) # vector is sorted by positions

  #==fgsea=function==#
  gseaRes <- fgsea::calcGseaStat(statsAdj, selectedStats=pathway, returnAllExtremes=T)

  #=====================================================================================
  #=====================================================================================

  ##=calculations=##====================================================================
  #==calculating=x=and=y=coordinates==#
  bottom <- min(gseaRes$bottoms, na.rm=T)
  top <- max(gseaRes$tops, na.rm=T)
  xs <- as.vector(rbind(pathway-1, pathway)) #combines the vectors in an alternating fashion
  ys <- as.vector(rbind(gseaRes$bottoms, gseaRes$tops))
  n <- length(statsAdj)

  #==gsea=line=(most=important=part)==#
  toPlot <- data.frame(x=c(0, xs, n+1), y=c(0, ys, 0))
  #==axis=labels==#
  axisXtext <- c(ceiling(top*10)/10, ceiling(bottom*10)/10)
  axisXspace <- abs(round(2*(axisXtext[1]-axisXtext[2]))/10)*2
  axisXtext <- round(c(seq(axisXtext[2], axisXtext[1], by=ifelse(axisXspace==0, 0.1 ,axisXspace)), 0),1)

  #==Title==#
  if(!is.na(plotTitle)){ # if a plotTitle is specified, use it
    toPlot$title = plotTitle
  }else{
    toPlot$title = setName # else, use the default name
  }

  #==Positions==#
  x=y=NULL
  sizeFactor <- ((top - bottom))
  yStats <- c(bottom+(sizeFactor*0.01), bottom+sqrt(ggarrangeRows)*(sizeFactor*0.2*annoSize/10))
  yTicks <- c(bottom-(sizeFactor*0.025), bottom-(sizeFactor*0.125))
  yRibbon <- c(bottom-(sizeFactor*0.13), bottom-(sizeFactor*0.23))
  yLabel <- bottom-(sizeFactor*0.28)
  vline <- c(length(rankedList[rankedList>=cutC]),
             length(rankedList)-length(rankedList[rankedList<=-cutC]),
             length(rankedList[rankedList>=0]))

  #==colorRibbon==#
  # bar lengths for each threshold
  barlength <- unique((cutC*c(1/4,1/2,1,2,4,4,8,8,8,8,rep(16,8))[1:(3+ceiling(max(abs(rankedList))))])) # vector with lengths for log2FC thresholds
  barlength <- na.omit(barlength)
  barlength <- c(rev(barlength)[2:length(barlength)],0, -barlength) # vector as above but mirrored (highest, ..., 0, ..., -highest)
  barlength <- sapply(barlength, function(x) sum(rankedList>=x)) # vector with number of genes that have log2FC above the thresholds
  # bar colors
  # colfunc1 <- colorRampPalette(c(colors[1], "white")) #functions for getting a color ramp
  # colfunc2 <- colorRampPalette(c("white", colors[2]))
  # barcolor <- c(colfunc1(length(barlength)/2), colfunc2(length(barlength)/2))
  barcolor <- .mycolorgradient(length(barlength), highcol=colors[2], midcol="white", lowcol=colors[1])
  # data.frame: each row has a bar color intensity, bar length, bar start (x1) and end (x2) position and an Order-Number
  colorRibbon <- data.frame(barcolor=barcolor,
                            barlength=barlength,
                            x1=c(0,barlength[1:(length(barlength)-1)]),
                            x2=barlength)
  # divide data frame into red and blue ribbon
  colorRibbon1 <- colorRibbon[1:(length(barcolor)/2),]
  colorRibbon2 <- colorRibbon[((length(barcolor)/2)+1):length(barcolor),]
  #=====================================================================================
  #=====================================================================================

  ##=GSEA=plot=##=======================================================================
  plot <- ggplot2::ggplot(toPlot, ggplot2::aes(x=x, y=y)) +
    #==main=line==#
    ggplot2::geom_point(color=colors[3], size=0.1) +
    ggplot2::geom_line(color=colors[3], size=lineSize) +

    #==hit=segments==#
    ggplot2::geom_segment(data=data.frame(x=pathway), mapping=ggplot2::aes(x=x, y=yTicks[1], xend=x, yend=yTicks[2]), size=ticksSize) +

    #==colorRibbon=left=(red)=and=right=(blue)==#
    ggplot2::geom_rect(data=colorRibbon1, ggplot2::aes(xmin=x1, xmax=x2, ymin=yRibbon[1], ymax=yRibbon[2]), fill=colorRibbon1$barcolor) +
    ggplot2::geom_rect(data=colorRibbon2, ggplot2::aes(xmin=x1, xmax=x2, ymin=yRibbon[1], ymax=yRibbon[2]), fill=colorRibbon2$barcolor) +
    ggplot2::annotate(geom="text", size=annoSize, y=yLabel, x=(length(stats)/400), label=annotations[1], color=colors[1], hjust=0) +
    ggplot2::annotate(geom="text", size=annoSize, y=yLabel, x=length(rankedList)-(length(stats)/400), label=annotations[2], color=colors[2], hjust=1) +

    #==theme=settings==#
    ggplot2::geom_hline(yintercept=0, colour="gray") +
    ggplot2::theme_bw() +
    ggplot2::theme(panel.border=ggplot2::element_rect(fill=NA, size=0.5),
                   panel.grid=ggplot2::element_blank(), legend.position="none",
                   axis.text.x=ggplot2::element_text(angle=45, hjust=1),
                   axis.text= ggplot2::element_text(size=textSize),
                   axis.title=ggplot2::element_text(size=textSize),
                   strip.text=ggplot2::element_text(size=textSize)) +

    #==axis=titles=and=labels==#
    ggplot2::scale_y_continuous(name="enrichment score", labels=axisXtext, breaks=axisXtext) +
    ggplot2::labs(x="rank") +
    #==plot=title==#
    ggplot2::facet_grid(. ~ title)

  #==additional=lines==#
  if(vlines){plot <- plot + ggplot2::geom_vline(xintercept = vline, linetype="dotted", color="gray20")}
  if(hlines){plot <- plot + ggplot2::geom_hline(yintercept=c(top, bottom), colour="red", linetype="dotted")}

  #==statistics===#
  if(is.data.frame(gseaTable)){
    pval <- round(gseaTable[which(gseaTable[,"GO_ID"] %in% setID),"pval"], 4)
    NES <- round(gseaTable[which(gseaTable[,"GO_ID"] %in% setID),"NES"], 2)
    plot <- plot +
      ggplot2::annotate(geom="text",size=annoSize, x=(length(stats)/10), y=yStats[1], label=substitute(paste(italic("NES"), " = ",NES)),hjust=0,vjust=0) +
      ggplot2::annotate(geom="text",size=annoSize, x=(length(stats)/10), y=yStats[2], label=substitute(paste(italic("p"), " = ",pval)),hjust=0,vjust=0)
  }

  #==make=axis=text=and=title=blank=(user=choice)==#
  if(!yTitle){plot <- plot + ggplot2::theme(axis.title.y=element_blank())}
  if(!xTitle){plot <- plot + ggplot2::theme(axis.title.x=element_blank())}
  if(!xText){plot <- plot + ggplot2::theme(axis.text.x=element_blank())}

  #==citation==#
  message("please cite the following packages:\n", .citeggplot(), .citefgsea(), .citeanno(), .citeorghs(), .citeorgmm(), .citegodb())

  #==output==#
  plot
}

#' Create a gene set
#'
#' Imports:
#' GSEABase
#'
#' @param name string, give the gene a short descriptive name
#' @param IDs vector of identifiers (strings or integers)
#' @param species string, either "Homo sapiens" or "Mus musculus"
#' @param translate string, translate IDs to entrezids. Must be "SYMBOL", "ENSEMBL" or other valid types
#' @param pmids string, Pubmed ID
#' @param desc1 string, short description
#' @param desc2 string, long description
#' @param contributor string
#' @return Vector of human Entrez IDs (or, if matrix=T, a matrix of mouse and human Entrez IDs)
seekCreateGeneSet <- function(name, IDs, species, translate=NA, pmids="", desc1="", desc2="", contributor=""){
  IDs <- as.character(na.omit(as.vector(unique(IDs))))
  if(!is.na(translate)){IDs <- as.vector(switch(species,
                                                "Homo sapiens"=AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,keys=c(IDs, limma::alias2Symbol(IDs, species="Hs")),column="ENTREZID",keytype=translate, multiVals="first"),
                                                "Mus musculus"=AnnotationDbi::mapIds(org.Mm.eg.db::org.Mm.eg.db,keys=c(IDs, limma::alias2Symbol(IDs, species="Mm")),column="ENTREZID",keytype=translate, multiVals="first")))}
  IDs <- unlist(unique(na.omit(IDs)))
  GSEABase::GeneSet(setName=name, geneIds=unique(IDs), organism=species,
                    contributor=contributor,
                    creationDate=date(),
                    shortDescription=desc1,
                    longDescription=desc2,
                    pubMedIds=pmids,
                    geneIdType=GSEABase::EntrezIdentifier())
}

.citeorghs <- function() print("### org.Hs.eg.db package: Marc Carlson (2019). org.Hs.eg.db: Genome wide annotation for Human. R package version 3.8.2.\n")
.citeorgmm <- function() print("### org.Mm.eg.db package: Marc Carlson (2019). org.Mm.eg.db: Genome wide annotation for Mouse. R package version 3.8.2.\n")
.citefgsea <- function() print("### fgsea package: Alexey Sergushichev. An algorithm for fast preranked gene set enrichment analysis using cumulative statistic calculation. bioRxiv (2016), doi:10.1101/060012\n")
.citehyper <- function() print("### hypeR package: Federico, A. & Monti, S. hypeR: an R package for geneset enrichment workflows. Bioinformatics 36, 1307-1308 (2020).\n")
.citeanno <- function() print("### AnnotationDbi package: Hervé Pagès, Marc Carlson, Seth Falcon and Nianhua Li (2019). AnnotationDbi: Manipulation of SQLite-based annotations in Bioconductor. R package version 1.46.1.\n")
.citebase <- function() print("### GSEABase package: Martin Morgan, Seth Falcon and Robert Gentleman (2020). GSEABase: Gene set enrichment data structures and methods. R package version 1.52.1.\n")
.citegodb <- function() print("### GO.db package: Marc Carlson (2019). GO.db: A set of annotation maps describing the entire Gene Ontology. R package version 3.8.2.\n")
.citedplyr <- function() print("### dplyr package: Hadley Wickham, Romain François, Lionel Henry and Kirill Müller (2021). dplyr: A Grammar of Data Manipulation. R package version 1.0.4. https://CRAN.R-project.org/package=dplyr\n")
.citeggplot <- function() print("### ggplot2 package: Wickham H (2016). ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York. ISBN 978-3-319-24277-4, https://ggplot2.tidyverse.org.\n")
.citeggdendro <- function() print("### ggdendro package: https://www.r-pkg.org/pkg/ggdendro\n")
.citetibble <- function() print("### tibble package: https://github.com/tidyverse/tibble\n")
.citeggpubr <- function() print("### ggpubr package: https://rpkgs.datanovia.com/ggpubr/\n")
Solatar/seeqR documentation built on Feb. 19, 2021, 8:07 p.m.