BinfTools: Useful functions for standard analyses

Documented in customGMT gmtHeat readGMT writeGMT

#' Make custom gene sets
#'
#' This function will use the gost output of GO_GEM along with a complete geneset
#' object (downloaded from gprofiler) to extract gene sets of GO terms containing
#' a given key.
#'
#' @param gost The gost output of GO_GEM (get by setting returnGost=TRUE when using GO_GEM)
#' @param key The keyword used to identify GO terms of interest
#' @param gmt The geneset object (read in using qusage::read.gmt())
#' @return A list of gene sets in the format for use with other BinfTools functions
#' @export

customGMT<-function(gost, key, gmt){
  ##Using the gost object, grep for a term, grab the GO ID, and then return the
  ##genesets related to the term
  if(class(gost) != "data.frame"){
    y<-gost$result[,-14]
  } else {
    y<-gost
  }
  IDs<-c()
  terms<-c()
  pb<-txtProgressBar(min=0, max=length(key), style=3)
  for(i in 1:length(key)){
    IDs<-c(IDs,y[grep(key[i], y$term_name, ignore.case = TRUE),]$term_id)
    terms<-c(terms,grep(key[i], y$term_name, ignore.case = TRUE, value = TRUE))
    setTxtProgressBar(pb, i)
  }
  x<-gmt[IDs]
  names(x)<-terms
  return(x)
}

#' Write a gene set to file in gmt format
#'
#' This function will accept a list of genes belonging to a gene set and write
#' it to a given filename in gmt format.
#'
#' @param geneSet The geneset list object (generated by customGMT())
#' @param filename The output filename for the gmt file. Use extension .gmt
#' @export

writeGMT<-function(geneSet, filename){
  f=file(filename,'w')
  for(i in 1:length(geneSet)){
    cat(names(geneSet)[i], names(geneSet)[i], geneSet[[i]], file=f, sep="\t")
    cat('\n', file=f)
  }
  close(f)
  message(paste0("Gene sets written to ", filename,"."))
  return(invisible())
}

#' Read a gmt file into R
#'
#' This function will read a gmt file into a list object in R
#'
#' @param gmt Character with path to file in .gmt format
#' @export

readGMT<-function(gmt){
  x<-strsplit(readLines(gmt),"\t")
  y<-lapply(x, tail, -2)
  names(y)<-sapply(x, head, 1)
  return(y)
}

#' Make a heatmap of gene expression grouped by gene sets
#'
#' @param counts data frame of normalized gene expression values
#' @param cond character vector indicating which condition each column of counts belongs to
#' @param gmt named list of gene sets to be used in the heatmap. Genes must be found in rownames(counts).
#' @param con character indicating the control condition from cond. Or the order in which conditions in cond should appear on the heatmap.
#' @param labgenes character indicating which genes (if any) should be labeled on the heatmap. Default NULL will label all genes. Set to "" to label no genes.
#' @param title character indicating the title of the resulting heatmap
#' @param avgExp Boolean indicating if gene expression should be averaged within each condition (TRUE) or if each individual replicate should be plotted (FALSE; default).
#' @param zscore Boolean indicating if gene expression should be z-score scaled (TRUE; default) or not (FALSE).
#' @param colAnno Dataframe with rownames=colnames counts and at least one column to annotate the columns of the heatmap. Leave NULL for no annotation.
#' @param hmcol colorRampPalette object of length 100 indicating colour scheme of heatmap. Leave NULL for default colours.
#' @param intClus Boolean indicating if hierarchical clustering should be performed within each gene set. Note trees will not be shown. Default = TRUE. If FALSE, heatmap will be presented in the order in which genes appear in the gene sets.
#' @param printEach Boolean indicating if heatmaps for each individual gene set should be printed. Only works when intClus=TRUE. Trees will be shown. Default = FALSE.
#' @param retGroups Boolean indicating if named list of data frames of gene expression subset to each gene set should be returned (z-score normalized if zscore=TRUE). Default is FALSE. if TRUE, heatmap won't be plotted.
#' @return Annotated heatmap of gene expression of all gene sets provided or named list of data frames of gene expression subset to each gene set.
#' @export

gmtHeat<-function(counts, cond, gmt, con=NULL, labgenes=NULL, title="", avgExp=TRUE, zscore=TRUE, colAnno=NULL, hmcol=NULL, intClus=TRUE, printEach=FALSE, retGroups=FALSE){
  if (is.null(hmcol)) {
    hmcol <- colorRampPalette(rev(RColorBrewer::brewer.pal(n = 7,
                                                           name = "RdYlBu")))(100)
  }
  gmt <- lapply(gmt, function(x) {
    return(x[which(x %in% rownames(counts))])
  })
  annodf <- suppressWarnings(unique(tidyr::gather(as.data.frame(do.call("cbind",
                                                                        gmt)), key = "Term", value = "Genes")))
  rownames(annodf) <- make.names(annodf$Genes, unique = TRUE)
  if (isTRUE(zscore)) {
    message("Z-scoring counts")
    counts <- t(scale(t(counts)))
  }
  if (!is.null(con)) {
    if (length(con) == length(levels(factor(cond)))) {
      cond <- forcats::fct_relevel(cond, con)
    }
    else {
      ind <- which(levels(as.factor(cond)) == con)
      if (ind != 1) {
        x <- 1:length(levels(as.factor(cond)))
        x <- x[-ind]
        x <- c(ind, x)
        y <- levels(as.factor(cond))[x]
        cond <- forcats::fct_relevel(as.factor(cond),
                                     y)
      }
    }
    tmp.counts <- counts[, which(cond == levels(cond)[1])]
    tmp.cond <- as.character(cond[which(cond == levels(cond)[1])])
    for (i in 2:length(levels(cond))) {
      tmp.counts <- cbind(tmp.counts, counts[, which(cond  ==
                                                       levels(cond)[i])])
      tmp.cond <- c(tmp.cond, as.character(cond[which(cond  ==
                                                        levels(cond)[i])]))
    }
    counts <- tmp.counts
    cond <- tmp.cond
  }
  if (isTRUE(avgExp)) {
    message("Averaging expression values accross replicates")
    counts <- avgExp(counts, cond)
    cond <- colnames(avgExp)
  }
  forHeat <- list()
  for (i in 1:length(gmt)) {
    forHeat[[i]] <- counts[which(rownames(counts) %in% gmt[[i]]),]
    if(isTRUE(intClus)){
      genelab<-rownames(forHeat[[i]])
      if (!is.null(labgenes)) {
        tmp <- rep(" ", nrow(forHeat[[i]]))
        for (k in 1:length(labgenes)) {
          tmp[which(rownames(forHeat[[i]]) %in% labgenes[k], arr.ind = TRUE)] <- labgenes[k]
        }
        genelab <- tmp
      }
      tmp<-pheatmap::pheatmap(forHeat[[i]], cluster_row=TRUE, silent=!printEach, main=names(gmt)[i], labels_row = genelab)
      ord<-rownames(forHeat[[i]])[tmp$tree_row$order]
      gmt[[i]]<-ord
      forHeat[[i]]<-forHeat[[i]][match(ord, rownames(forHeat[[i]])),]
    } else {
      forHeat[[i]] <- forHeat[[i]][match(gmt[[i]], rownames(forHeat[[i]])),]
    }
    names(forHeat)[i] <- names(gmt)[i]
  }
  if (isTRUE(retGroups)) {
    message("Returning list of groups")
    return(forHeat)
  }

  forHeat <- suppressWarnings(as.data.frame(do.call("rbind",
                                                    forHeat)))

  annodf <- suppressWarnings(unique(tidyr::gather(as.data.frame(do.call("cbind",
                                                                        gmt)), key = "Term", value = "Genes")))
  rownames(annodf) <- make.names(annodf$Genes, unique = TRUE)

  rownames(forHeat) <- rownames(annodf)
  gaps <- c()
  if (length(gmt) < 2) {
    gaps <- NULL
  }
  else {
    for (i in 1:(length(gmt) - 1)) {
      gaps <- c(gaps, sum(gaps[length(gaps)], length(gmt[[i]])))
    }
  }
  if (!is.null(labgenes)) {
    tmp <- rep(" ", nrow(forHeat))
    for (i in 1:length(labgenes)) {
      tmp[which(annodf$Genes %in% labgenes[i], arr.ind = TRUE)] <- labgenes[i]
    }
    labgenes <- tmp
  }
  else {
    labgenes <- annodf$Genes
  }
  lim <- c(ifelse(min(as.matrix(forHeat)[is.finite(as.matrix(forHeat))])<0, min(as.matrix(forHeat)[is.finite(as.matrix(forHeat))]), 0), max(as.matrix(forHeat)[is.finite(as.matrix(forHeat))]))
  if(isTRUE(zscore)){
    lim <- c(max(abs(as.matrix(forHeat)[is.finite(as.matrix(forHeat))])) * -1, max(abs(as.matrix(forHeat)[is.finite(as.matrix(forHeat))])))
  }
  if(is.null(colAnno)){
    pheatmap::pheatmap(forHeat, annotation_row = annodf[,-2,drop = FALSE], gaps_row = gaps, main=title,
                       cluster_rows = FALSE, labels_row = labgenes, cluster_cols = FALSE,
                       breaks = seq(from = lim[1], to = lim[2], length.out = 100), color=hmcol)
  } else {
    pheatmap::pheatmap(forHeat, annotation_row = annodf[,-2,drop = FALSE], annotation_col = colAnno, gaps_row = gaps, main=title,
                       cluster_rows = FALSE, labels_row = labgenes, cluster_cols = FALSE,
                       breaks = seq(from = lim[1], to = lim[2], length.out = 100), color=hmcol)
  }
}