#' Make custom gene sets
#'
#' This function will use the gost output of GO_GEM along with a complete geneset
#' object (downloaded from gprofiler) to extract gene sets of GO terms containing
#' a given key.
#'
#' @param gost The gost output of GO_GEM (get by setting returnGost=TRUE when using GO_GEM)
#' @param key The keyword used to identify GO terms of interest
#' @param gmt The geneset object (read in using qusage::read.gmt())
#' @return A list of gene sets in the format for use with other BinfTools functions
#' @export
customGMT<-function(gost, key, gmt){
##Using the gost object, grep for a term, grab the GO ID, and then return the
##genesets related to the term
if(class(gost) != "data.frame"){
y<-gost$result[,-14]
} else {
y<-gost
}
IDs<-c()
terms<-c()
pb<-txtProgressBar(min=0, max=length(key), style=3)
for(i in 1:length(key)){
IDs<-c(IDs,y[grep(key[i], y$term_name, ignore.case = TRUE),]$term_id)
terms<-c(terms,grep(key[i], y$term_name, ignore.case = TRUE, value = TRUE))
setTxtProgressBar(pb, i)
}
x<-gmt[IDs]
names(x)<-terms
return(x)
}
#' Write a gene set to file in gmt format
#'
#' This function will accept a list of genes belonging to a gene set and write
#' it to a given filename in gmt format.
#'
#' @param geneSet The geneset list object (generated by customGMT())
#' @param filename The output filename for the gmt file. Use extension .gmt
#' @export
writeGMT<-function(geneSet, filename){
f=file(filename,'w')
for(i in 1:length(geneSet)){
cat(names(geneSet)[i], names(geneSet)[i], geneSet[[i]], file=f, sep="\t")
cat('\n', file=f)
}
close(f)
message(paste0("Gene sets written to ", filename,"."))
return(invisible())
}
#' Read a gmt file into R
#'
#' This function will read a gmt file into a list object in R
#'
#' @param gmt Character with path to file in .gmt format
#' @export
readGMT<-function(gmt){
x<-strsplit(readLines(gmt),"\t")
y<-lapply(x, tail, -2)
names(y)<-sapply(x, head, 1)
return(y)
}
#' Make a heatmap of gene expression grouped by gene sets
#'
#' @param counts data frame of normalized gene expression values
#' @param cond character vector indicating which condition each column of counts belongs to
#' @param gmt named list of gene sets to be used in the heatmap. Genes must be found in rownames(counts).
#' @param con character indicating the control condition from cond. Or the order in which conditions in cond should appear on the heatmap.
#' @param labgenes character indicating which genes (if any) should be labeled on the heatmap. Default NULL will label all genes. Set to "" to label no genes.
#' @param title character indicating the title of the resulting heatmap
#' @param avgExp Boolean indicating if gene expression should be averaged within each condition (TRUE) or if each individual replicate should be plotted (FALSE; default).
#' @param zscore Boolean indicating if gene expression should be z-score scaled (TRUE; default) or not (FALSE).
#' @param colAnno Dataframe with rownames=colnames counts and at least one column to annotate the columns of the heatmap. Leave NULL for no annotation.
#' @param hmcol colorRampPalette object of length 100 indicating colour scheme of heatmap. Leave NULL for default colours.
#' @param intClus Boolean indicating if hierarchical clustering should be performed within each gene set. Note trees will not be shown. Default = TRUE. If FALSE, heatmap will be presented in the order in which genes appear in the gene sets.
#' @param printEach Boolean indicating if heatmaps for each individual gene set should be printed. Only works when intClus=TRUE. Trees will be shown. Default = FALSE.
#' @param retGroups Boolean indicating if named list of data frames of gene expression subset to each gene set should be returned (z-score normalized if zscore=TRUE). Default is FALSE. if TRUE, heatmap won't be plotted.
#' @return Annotated heatmap of gene expression of all gene sets provided or named list of data frames of gene expression subset to each gene set.
#' @export
gmtHeat<-function(counts, cond, gmt, con=NULL, labgenes=NULL, title="", avgExp=TRUE, zscore=TRUE, colAnno=NULL, hmcol=NULL, intClus=TRUE, printEach=FALSE, retGroups=FALSE){
if (is.null(hmcol)) {
hmcol <- colorRampPalette(rev(RColorBrewer::brewer.pal(n = 7,
name = "RdYlBu")))(100)
}
gmt <- lapply(gmt, function(x) {
return(x[which(x %in% rownames(counts))])
})
annodf <- suppressWarnings(unique(tidyr::gather(as.data.frame(do.call("cbind",
gmt)), key = "Term", value = "Genes")))
rownames(annodf) <- make.names(annodf$Genes, unique = TRUE)
if (isTRUE(zscore)) {
message("Z-scoring counts")
counts <- t(scale(t(counts)))
}
if (!is.null(con)) {
if (length(con) == length(levels(factor(cond)))) {
cond <- forcats::fct_relevel(cond, con)
}
else {
ind <- which(levels(as.factor(cond)) == con)
if (ind != 1) {
x <- 1:length(levels(as.factor(cond)))
x <- x[-ind]
x <- c(ind, x)
y <- levels(as.factor(cond))[x]
cond <- forcats::fct_relevel(as.factor(cond),
y)
}
}
tmp.counts <- counts[, which(cond == levels(cond)[1])]
tmp.cond <- as.character(cond[which(cond == levels(cond)[1])])
for (i in 2:length(levels(cond))) {
tmp.counts <- cbind(tmp.counts, counts[, which(cond ==
levels(cond)[i])])
tmp.cond <- c(tmp.cond, as.character(cond[which(cond ==
levels(cond)[i])]))
}
counts <- tmp.counts
cond <- tmp.cond
}
if (isTRUE(avgExp)) {
message("Averaging expression values accross replicates")
counts <- avgExp(counts, cond)
cond <- colnames(avgExp)
}
forHeat <- list()
for (i in 1:length(gmt)) {
forHeat[[i]] <- counts[which(rownames(counts) %in% gmt[[i]]),]
if(isTRUE(intClus)){
genelab<-rownames(forHeat[[i]])
if (!is.null(labgenes)) {
tmp <- rep(" ", nrow(forHeat[[i]]))
for (k in 1:length(labgenes)) {
tmp[which(rownames(forHeat[[i]]) %in% labgenes[k], arr.ind = TRUE)] <- labgenes[k]
}
genelab <- tmp
}
tmp<-pheatmap::pheatmap(forHeat[[i]], cluster_row=TRUE, silent=!printEach, main=names(gmt)[i], labels_row = genelab)
ord<-rownames(forHeat[[i]])[tmp$tree_row$order]
gmt[[i]]<-ord
forHeat[[i]]<-forHeat[[i]][match(ord, rownames(forHeat[[i]])),]
} else {
forHeat[[i]] <- forHeat[[i]][match(gmt[[i]], rownames(forHeat[[i]])),]
}
names(forHeat)[i] <- names(gmt)[i]
}
if (isTRUE(retGroups)) {
message("Returning list of groups")
return(forHeat)
}
forHeat <- suppressWarnings(as.data.frame(do.call("rbind",
forHeat)))
annodf <- suppressWarnings(unique(tidyr::gather(as.data.frame(do.call("cbind",
gmt)), key = "Term", value = "Genes")))
rownames(annodf) <- make.names(annodf$Genes, unique = TRUE)
rownames(forHeat) <- rownames(annodf)
gaps <- c()
if (length(gmt) < 2) {
gaps <- NULL
}
else {
for (i in 1:(length(gmt) - 1)) {
gaps <- c(gaps, sum(gaps[length(gaps)], length(gmt[[i]])))
}
}
if (!is.null(labgenes)) {
tmp <- rep(" ", nrow(forHeat))
for (i in 1:length(labgenes)) {
tmp[which(annodf$Genes %in% labgenes[i], arr.ind = TRUE)] <- labgenes[i]
}
labgenes <- tmp
}
else {
labgenes <- annodf$Genes
}
lim <- c(ifelse(min(as.matrix(forHeat)[is.finite(as.matrix(forHeat))])<0, min(as.matrix(forHeat)[is.finite(as.matrix(forHeat))]), 0), max(as.matrix(forHeat)[is.finite(as.matrix(forHeat))]))
if(isTRUE(zscore)){
lim <- c(max(abs(as.matrix(forHeat)[is.finite(as.matrix(forHeat))])) * -1, max(abs(as.matrix(forHeat)[is.finite(as.matrix(forHeat))])))
}
if(is.null(colAnno)){
pheatmap::pheatmap(forHeat, annotation_row = annodf[,-2,drop = FALSE], gaps_row = gaps, main=title,
cluster_rows = FALSE, labels_row = labgenes, cluster_cols = FALSE,
breaks = seq(from = lim[1], to = lim[2], length.out = 100), color=hmcol)
} else {
pheatmap::pheatmap(forHeat, annotation_row = annodf[,-2,drop = FALSE], annotation_col = colAnno, gaps_row = gaps, main=title,
cluster_rows = FALSE, labels_row = labgenes, cluster_cols = FALSE,
breaks = seq(from = lim[1], to = lim[2], length.out = 100), color=hmcol)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.