R/GO_visualization.R

Defines functions getGoTerm GO_viz_choose GO_visualization

Documented in getGoTerm GO_visualization GO_viz_choose

#' GO_visualization
#'
#'   This function provides a dot plot chart of the GO enrichments after semantic similarity analysis and a recalculation
#'     of enrichments based on new gene categories derived from the semantic similarity analysis.
#' @param cluster_enriched_df second object provided by the enrich_test _function
#' @param markers_df data frame resulting from FindAllMarkers function in the Seurat package, required if a clust_list is not provided
#' @param clust_list a list of gene vectors associated with each cluster.
#' @param numcats number of catergorical terms to plot on x-axis. Function is setup to select the the most frequent terms. Default is 15.
#' @param GOcats product of the msigdb_gsets funciton using GOBP. Can also be user supplied annotation in the same format.
#' @param org_db annotation package from Bioconductor of matching species to the species_x parameter.
#' @param chosen_cats User supplied catergorical terms to plot on x-axis. Default is null which lets the function choose the categories.
#' @param goterms a vector with GOIDS as object and the GO terms as names
#' @param species_x species names, default is "Homo sapiens", refer to ?msigdbr::msigdbr for available species
#' @import 'foreach'
#' @import 'ggplot2'
#' @import 'rrvgo'
#' @import 'LSAfun'
#' @import 'hypeR'
#' @import 'stringr'
#' @importFrom foreach %do%
#' @importFrom foreach %dopar%
#' @importFrom stats as.dist
#' @importFrom stats cutree
#' @importFrom stats hclust
#' @importFrom stringr str_squish
#' @return prints a plot and returns a list with three objects. The first object is a list of data frames generated by the semantic similarity analysis based on GO:BP enrichments, the second is a dataframe used for ggplot,
#'             and the third object is a ggplot2 object that was plotted suitable for fine tuning the appearance of the plot.
#' @export
#' @examples
#' \dontrun{
#' GO_visualization(enrich_results$Enriched_df,markers_df=markers,goterms=goterms,numcats=10)
#' }
GO_visualization<-function(cluster_enriched_df,markers_df=NULL,clust_list=NULL,numcats=15,GOcats=NULL,chosen_cats=NULL,org_db="org.Hs.eg.db",goterms=NULL,species_x="Homo sapiens") {
  i<-b<-NULL
  
  if(is.null(goterms)) stop("A named GO annotation vector must be included with GO ids making up the vector and GO term names as the vector names")
  if(is.null(markers_df) && is.null(clust_list)) stop("Need either a cluster gene list or a a Seurat markers data frame")
  if(is.null(markers_df)) {clust_list} else {
    x<-markers_df[markers_df$p_val_adj<0.1,]
    clust_list<-foreach(i=1:length(levels(markers_df$cluster)),.packages="foreach")%do% {x[x$cluster==levels(x$cluster)[i],]$gene}
    names(clust_list)<-levels(markers_df$cluster)
    clust_list<-clust_list[lengths(clust_list)>0]
  }
  
  if(is.null(GOcats)) {GOcats <- msigdb_gsets(species=species_x, category="C5", subcategory="BP")}
  cats<-GOcats$genesets
  y<-substring(names(cats),6)
  y<-breakdown(y)
  y<-stringr::str_squish(y)
  y<-goterms[y]
  names(cats)<-y

  Go.list.viz<-foreach(i=1:length(cluster_enriched_df),.packages="foreach") %do% {
    print(i)
    y<-rownames(cluster_enriched_df[[i]][[2]])
    
    if(length(y)==0) {"No Enrichments"} else{
      y<-unlist(substring(y,6))
      y<-breakdown(y)
      y<-stringr::str_squish(y)
      y<-goterms[y]
      scores<-cluster_enriched_df[[i]][[2]]$overlap
      scores<-scores[!is.na(y)]
      sizes<-cluster_enriched_df[[i]][[2]]$geneset
      sizes<-sizes[!is.na(y)]
      y<-y[!is.na(y)]
      names(sizes)<-y
      names(scores)<-y
      if(length(y)<=1) {"No Enrichments"} else {
        sm<-calculateSimMatrix(y,orgdb=org_db,ont="BP",method="Rel")
        
        sizes<-sizes[intersect(rownames(sm),names(sizes))]
        scores<-scores[intersect(rownames(sm),names(scores))]
        o <- rev(order(scores, sizes, na.last = FALSE))
        sm <- sm[o, o]
        clustree <- cutree(hclust(as.dist(1 - sm)), h = .7)
        clusterRep <- tapply(rownames(sm), clustree, function(x) x[which.max(scores[x])])
        red<-data.frame(go = rownames(sm),
                        clustree = clustree,
                        parent = clusterRep[clustree],
                        parentSimScore = unlist(Map(seq_len(nrow(sm)),clusterRep[clustree],f = function(i, j) sm[i,j])),
                        score = scores[match(rownames(sm),names(scores))],
                        size = sizes[match(rownames(sm), names(sizes))],
                        term =  getGoTerm(rownames(sm)),
                        parentTerm = getGoTerm(clusterRep[clustree]))
      }
    }
  }
  names(Go.list.viz)<-names(cluster_enriched_df)
  
  df1<-foreach(i=1:length(Go.list.viz),.combine='rbind',.packages="foreach") %do% {
    if(is.data.frame(Go.list.viz[[i]])){
    x<-Go.list.viz[[i]]
    x$cluster<-names(Go.list.viz)[i]
    x} else {}
  }
  if(is.null(chosen_cats)){
    x<-foreach(i=1:length(Go.list.viz),.combine='c',.packages="foreach") %do% {
      suppressWarnings(if(!is.data.frame(Go.list.viz[[i]])) {}else{unique(Go.list.viz[[i]]$parentTerm)})
    }
    x<-sort(table(x),decreasing=T)
    chosen_cats<-x[1:numcats]
  }
  chosen_cats<-chosen_cats[!(is.na(chosen_cats))]

  x<-foreach(i=1:length(unique(df1$parentTerm)),.packages="foreach") %dopar% {
    fin<-foreach(b=1:length(Go.list.viz),.combine='c',.packages="foreach") %do% {
      g1<-Go.list.viz[[b]]
      if(is.character(g1)) {} else{
        g1<-g1[which(g1$parentTerm==unique(df1$parentTerm)[i]),]$go}
    }
    unique(fin)
  }

  cats2<-foreach(i=1:length(x),.packages="foreach") %do% {
    x1<-x[[i]]
    x2<-unlist(cats[x1])
    unique(x2)
  }
  names(cats2)<-unique(df1$parentTerm)

  G1<-foreach(i=1:length(clust_list),.packages=c("hypeR","foreach")) %do% {
    g3<-hypeR(clust_list[[i]], cats2, background=23459, fdr=1)
    Percentage<-g3$data$overlap/g3$data$geneset*100
    FDR<-g3$data$fdr
    data.frame(g3$data$label,Percentage,FDR)
  }
names(G1)<-names(clust_list)
  
  G2<-foreach(i=1:length(clust_list),.combine='rbind',.packages="foreach") %do% {
    GO_Names<-G1[[i]]$g3.data.label
    Percentage<-G1[[i]]$Percentage
    Clusters<-as.character(rep(names(G1)[i],length(Percentage)))
    FDR<-G1[[i]]$FDR
    data.frame(GO_Names,Percentage,Clusters,FDR)
  }
  G2$Clusters<-factor(G2$Clusters,
                      levels=names(G1))
  y<-names(chosen_cats)
  G3<-foreach(i=1:length(y),.combine='rbind',.packages="foreach") %do% {
    G2[G2$GO_Names==y[i],]
  }
  G3$GO_Names<-factor(G3$GO_Names,levels=y)

  p<-ggplot(G3, aes(GO_Names,-log10(FDR)))
  p<-p+theme_light()
  p<-p+geom_jitter(height=0,width=0.3,aes_string(color="Clusters",size="Percentage"))
  p<-p +theme(axis.text.x = element_text(angle=90,size=10),axis.title.x = element_blank())
  p<-p+ylab("-log10(FDR)")
  p<-p+geom_hline(yintercept=-log10(0.1),color="red",linetype="dashed")

  p

  fin<-list(df1,G2,p)
  names(fin)<-c("GO_sem","GO_df","plot")
  return(fin)
}

#' This function provides a dot plot chart of the GO enrichments of user defined Parent term categories from the results of GO_visualizatiohn function
#' @param GO_viz_results object provided by the Go_visulaization function
#' @param markers_df data frame resulting from FindAllMarkers function in the Seurat package, required if a clust_list is not provided
#' @param clust_list a list of gene vectors associated with each cluster.
#' @param chosen_cats User supplied catergorical terms to plot on x-axis. Default is null which lets the function choose the categories.
#' @param GOcats product of the msigdb_gsets funciton using GOBP. Can also be user supplied annotation in the same format.
#' @param goterms a vector with GOIDS as object and the GO terms as names
#' @param species_x species names, default is "Homo sapiens", refer to ?msigdbr::msigdbr for available species
#' @import 'foreach'
#' @import 'ggplot2'
#' @import 'hypeR'
#' @importFrom foreach %do%
#' @importFrom foreach %dopar%
#' @return prints a plot and returns a ggplot2 object suitable for fine tuning the appearance of the plot.
#' @export
#' @examples
#' \dontrun{
#' GO_viz_choose(go_vis_res1,chosen_cats=newcats,markers_df=markers)
#' }

GO_viz_choose<-function(GO_viz_results,chosen_cats,clust_list=NULL,markers_df=NULL,GOcats=NULL,species_x="Homo sapiens", goterms=NULL){
  i<-NULL
  if(is.null(goterms)) stop("A named GO annotation vector must be included with GO ids making up the vector and GO term names as the vector names")
  if(is.null(markers_df) && is.null(clust_list)) stop("Need either a cluster gene list or a a Seurat markers data frame")
  if(is.null(markers_df)) {clust_list} else {
    x<-markers_df[markers_df$p_val_adj<0.1,]
    clust_list<-foreach(i=1:length(levels(markers_df$cluster)),.packages="foreach")%do% {x[x$cluster==levels(x$cluster)[i],]$gene}
    names(clust_list)<-levels(markers_df$cluster)
    clust_list<-clust_list[lengths(clust_list)>0]
  }
  if(is.null(GOcats)) {GOcats <- msigdb_gsets(species=species_x, category="C5", subcategory="BP")}
  cats<-GOcats$genesets
  y<-unlist(substring(names(cats),6))
  y<-breakdown(y)
  y<-stringr::str_squish(y)
  y<-goterms[y]
  names(cats)<-y

  res1<-GO_viz_results$GO_sem

  g1<-foreach(i=1:length(unique(res1$parentTerm))) %dopar% {
    res1[which(res1$parentTerm==unique(res1$parentTerm)[i]),]$go}
  cats2<-foreach(i=1:length(g1)) %do% {
    x1<-g1[[i]]
    x2<-unlist(cats[x1])
    unique(x2)
  }
  names(cats2)<-unique(res1$parentTerm)

  G1<-foreach(i=1:length(clust_list),.packages="hypeR") %do% {
    g3<-hypeR(clust_list[[i]], cats2, background=23459, fdr=1)
    Percentage<-g3$data$overlap/g3$data$geneset*100
    FDR<-g3$data$fdr
    data.frame(g3$data$label,Percentage,FDR)
  }
  names(G1)<-names(clust_list)
  
  G2<-foreach(i=1:length(clust_list),.combine='rbind') %do% {
    GO_Names<-G1[[i]]$g3.data.label
    Percentage<-G1[[i]]$Percentage
    Clusters<-as.character(rep(names(G1)[i],length(Percentage)))
    FDR<-G1[[i]]$FDR
    data.frame(GO_Names,Percentage,Clusters,FDR)
  }
  G2$Clusters<-factor(G2$Clusters,
                      levels=names(G1))

  G3<-foreach(i=1:length(chosen_cats),.combine='rbind') %do% {
    G2[G2$GO_Names==chosen_cats[i],]
  }
  G3$GO_Names<-factor(G3$GO_Names,levels=chosen_cats)

  p<-ggplot(G3, aes(GO_Names,-log10(FDR)))
  p<-p+theme_light()
  p<-p+geom_jitter(height=0,width=0.3,aes_string(color="Clusters",size="Percentage"))
  p<-p +theme(axis.text.x = element_text(angle=90,size=10),axis.title.x = element_blank())
  p<-p+ylab("-log10(FDR)")
  p<-p+labs(size="Percentage")
  p<-p+geom_hline(yintercept=-log10(0.1),color="red",linetype="dashed")

  p
  fin<-list(G2,p)
  names(fin)<-c("GO_df","plot")
  return(fin)
}

#' getGoTerm
#' Get the description of a GO term
#' 
#' @param x GO terms
#' @import GO.db
#' @return the Term slot in GO.db::GOTERM[[x]]
getGoTerm <- function(x) {
  sapply(x, function(x) GO.db::GOTERM[[x]]@Term)
}
neural-stem-cell-institute/screp documentation built on Feb. 11, 2023, 10:45 a.m.