In computational-chemical-biology/motifHeat:

knitr::opts_chunk$set(echo = TRUE)

motifHeat

This is a short tutorial to introduce motifHeat, associating hierarchical clustering to motif detection.

First we need to install the dependencies

packageList <- c("devtools", "dendextend")
newPackages <- packageList[!(packageList %in% installed.packages()[,"Package"])]
if(length(newPackages)) install.packages(newPackages)

library("devtools")
library("dendextend")

if (!require("motifHeat")) {
    install_github("computational-chemical-biology/motifHeat")
} else {
    library("motifHeat")
}

Load data from GNPS

You need your GNPS task id and optionally ms2lda:

dlist <- access_gnps('60e8f7096e994d5c9717d471445f8bfe',            '56f089a1772340a0aa2978d41478e3e4')

Recover most frequent motifs

Display motifs annotated:

head(dlist$motifs_in_scans)

Count the motifs and display the most frequent

motif_count <- table(dlist$motifs_in_scans[,'motif'])
motif_count[order(motif_count, decreasing=TRUE)][1:5]

Select cluster indexes associated to most frequent

smotif <- dlist$motifs_in_scans[dlist$motifs_in_scans[,'motif'] %in% c('motif_480'), c('scan', 'motif')]
head(smotif)

Format color for sample classes

factorCorLis is a nested list with one color for each level of a factor:

factorColList <- list(list(colors=c("darkorchid","darkred"), factor='Extraction'), list(colors=c("green", "darkgreen"), factor='Trimethoprim'))

Plot a basic heatmap

You can also embed plots, for example:

tab <- dlist$features
meta <- dlist$metadata
h <- format_heatmap(tab, meta, selectField='StrainName', selectValue='Burkholderia dolosa AU0645  Genomovar type VI', factorColList=factorColList)

We can also add motif labels to the features in the heatmap:

h <- format_heatmap(tab, meta, selectField='StrainName', labCol=smotif, selectValue='Burkholderia dolosa AU0645  Genomovar type VI', factorColList=factorColList)

Subselect metadata and repeat heatmap:

meta2 <- meta[meta$TimePoint=='48h' & grepl('EA', meta$Extraction) & grepl('Burkholderia', meta$StrainName),]
factorColList <- list(list(colors=rainbow(length(unique(meta2$StrainName))), factor='StrainName'))
h <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList)

Changing color scale:

library("gplots")
h <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList, colorScale = redgreen(75))

Adding normalization:

h <- format_heatmap(tab, meta2, labCol=smotif, norm=TRUE, factorColList=factorColList, colorScale = redgreen(75))

Creating a pdf:

pdf('ea_48h_strains.pdf')
h <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList, main='Color 1 not Normalized')
h <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList, colorScale = redgreen(75), main='Color 2 not Normalized')
h <- format_heatmap(tab, meta2, labCol=smotif, norm=TRUE, factorColList=factorColList, colorScale = redgreen(75), main='Color 2 Normalized')
dev.off()

Repeating the process for other condition:

meta2 <- meta[meta$TimePoint=='48h' & grepl('SPE', meta$Extraction) & grepl('Burkholderia', meta$StrainName),]
factorColList <- list(list(colors=rainbow(length(unique(meta2$StrainName))), factor='StrainName'))
pdf('spe_48h_strains.pdf')
h2 <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList, main='Color 1 not Normalized')
h2 <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList, colorScale = redgreen(75), main='Color 2 not Normalized')
h2 <- format_heatmap(tab, meta2, labCol=smotif, norm=TRUE, factorColList=factorColList, colorScale = redgreen(75), main='Color 2 Normalized')
dev.off()

Recover data from heatmap's hierarchical clustering

The heatmap result contains hierarchical clustering from samples and features. We can associate the sample grouping to colors

# needs preprocessing
head(cbind(t(h$colors), colnames(tab)[grep('Peak area', colnames(tab))])[h$heatmap$rowInd,])

Create hclut object from dendrogram and plot the cluters with arbitrary number of groups

hc <- as.hclust(h2$heatmap$colDendrogram)
plot(hc)
plot(hc, labels=FALSE)
rect.hclust(hc, 6)

Change the criteria to hight and record grouping

plot(hc, labels=FALSE)
rect.hclust(hc, h=1500)
hcgrp <- cutree(hc, h=1500)

Use grouping to create colored bars

dend <- h2$heatmap$colDendrogram
dend %>% plot 
colored_bars(colors = hcgrp, dend = dend)

Associate hierachical clustering to network component and motif frequency

Create a table with all component indexes and associated cluster ids:

ms2lda <- cbind(c(dlist$ms2lda_edges[, 'CLUSTERID1'], dlist$ms2lda_edges[, 'CLUSTERID2']), rep(dlist$ms2lda_edges[, 'ComponentIndex'], 2))
ms2lda <- unique(ms2lda)
colnames(ms2lda) <- c('CLUSTERID','ComponentIndex')
ms2lda <- ms2lda[ms2lda[,2]!=-1,]
head(ms2lda)

Merge component index and moftif tables

ms2lda <- merge(ms2lda, dlist$ms2lda_nodes[,c('scans', 'motif')], by.x='CLUSTERID', by.y='scans')
tgrp <- data.frame(CLUSTERID=as.numeric(names(hcgrp)), HCA_GRP=hcgrp) 
ms2lda <- merge(ms2lda, tgrp, by.x='CLUSTERID', by.y='CLUSTERID')
head(ms2lda)

Obtain the proportion of nodes from each connected component that are associated to the same group detectep by hierarchical clustering

comp2grp <- tapply(ms2lda[,4], ms2lda[,2], function(x) c(names(table(x))[which.max(table(x))], max(table(x))/length(x), length(x)))

tcomp2grp <- do.call(rbind, comp2grp)
tcomp2grp <- cbind(names(comp2grp), tcomp2grp)
colnames(tcomp2grp) <- c('ComponentIndex', 'HCA_GRP', 'Max_Proportion', 'Total_Nodes')
tcomp2grp <- tcomp2grp[order(as.numeric(tcomp2grp[,4]), decreasing=TRUE),]
mfmotif <- tapply(ms2lda[,3], ms2lda[,2], function(x) names(which.max(table(unlist(sapply(x, strsplit, ',')))))) 
mfmotif[unlist(lapply(mfmotif, is.null))] <- '' 
tcomp2grp <- cbind(tcomp2grp, '') 
colnames(tcomp2grp)[5] <- 'Most_Freq_Motif' 
tcomp2grp[,5] <- unlist(mfmotif[tcomp2grp[,1]]) 
ulinks <- unique(dlist$motifs_in_scans[,c('motif', 'motifdb_url')]) 
ulinks <- as.matrix(ulinks) 
tcomp2grp <- cbind(tcomp2grp, '')
ids <- match(tcomp2grp[,5], ulinks[,1])
tcomp2grp[!is.na(ids), 6] <- ulinks[ids[!is.na(ids)],2]
head(tcomp2grp)

Save table

colnames(tcomp2grp)[6] <- 'Link'
write.table(tcomp2grp, 'component_group_association.tsv', sep='\t', row.names=FALSE)

In order to localize interesting features in the dendrogram we can need first to select the interesting branches. Here we use the reources of the source function format_heatmap.

meta2 <- meta[meta$TimePoint=='48h' & grepl('EA', meta$Extraction) & grepl('Burkholderia', meta$StrainName),]
meta2 <- meta2[grep('cenocepacia', meta2$StrainName),]
factorColList <- list(list(colors=rainbow(length(unique(meta2$StrainName))), factor='StrainName'))
pdf('EA_48h_Cenocepacia_strains.pdf')
h <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList, norm=TRUE, colorScale = redgreen(75))
dev.off()

exc <- c('BcH14274_EA_48_2_P1-D-2_01_2859.mzXML',  'BcH14274_EA_48_TMP_2_P1-D-6_01_2865.mzXML', 
         'BcK562_EA_48_2_P1-E-2_01_2803.mzXML', 'BcK562_EA_48_TMP_3_P1-E-6_01_2811.mzXML')

h <- format_heatmap(tab, meta2[!meta2[,1]%in% exc,], labCol=smotif, factorColList=factorColList, norm=TRUE, colorScale = redgreen(75))


cdend <- h$heatmap$colDendrogram

colbranches <- function(n, col) {
  a <- attributes(n) # Find the attributes of current node
  # Color edges with requested color
  attr(n, "edgePar") <- c(a$edgePar, list(col=col, lwd=2))
  n # Don't forget to return the node!
}
cdend[[2]][[2]][[2]][[2]][[1]] = dendrapply(cdend[[2]][[2]][[2]][[2]][[1]], colbranches, "blue")
cdend[[1]] = dendrapply(cdend[[1]], colbranches, "red")
plot(cdend)

featureTable=tab; metadataTable=meta2[!meta2[,1]%in% exc,]; labCol=smotif; factorColList=factorColList; norm=TRUE; colorScale = redgreen(75)
tab2 <- featureTable[, grep('Peak area', colnames(featureTable))]
tab2 <- t(tab2)
rownames(tab2) <- sub(' filtered Peak area$| Peak area$', '', rownames(tab2))
colnames(tab2) <- tab[,'row ID']
# order samples according to metadata
tab2 <- tab2[metadataTable[,1],]
tab2 <- tab2[,apply(tab2, 2, sum)!=0]
if (norm) {
  tab2 <- t(apply(tab2, 1, function(x) x/sum(x)))
}
if (length(labCol)) {
  tmp <- rep('', ncol(tab2))
  mt <- match(labCol[,1], colnames(tab2))
  if(any(is.na(mt))) {
  labCol <- labCol[!is.na(mt),]
  mt <- mt[!is.na(mt)]
  }
  tmp[mt] <- labCol[,2]
  labCol <- tmp
} else {
  labCol <- rep('', ncol(tab2))
}
colList <- list()
if (length(factorColList)) {
  fnames <- c()
  for(i in 1:length(factorColList)) {
    colList[[i]] <- factorColList[[i]]$colors
    names(colList[[i]]) <- unique(metadataTable[, factorColList[[i]]$factor])
    colList[[i]] <- colList[[i]][metadataTable[, factorColList[[i]]$factor]]
    fnames <- append(fnames, factorColList[[i]]$factor)
  }
  rlab <- do.call(rbind, colList)
  rownames(rlab) <- fnames
}
mydist <- function(c) { dist(c, method="canberra") }
myclust <- function(c) { hclust(c, method="ward.D") }

# Load source code
source('../R/heatmap.3.R')

#pdf('EA_48h_Cenocepacia_selected_strains.pdf')
h <- heatmap.3(tab2, hclustfun=myclust, distfun=mydist, na.rm = TRUE, scale="column", dendrogram="both", margins=c(6,12),
  Rowv=TRUE, Colv=cdend,  RowSideColors=rlab, symbreaks=FALSE, key=TRUE, symkey=FALSE,
  density.info="none", trace="none", labCol=labCol, labRow=rownames(tab2), col=colorScale,
  ColSideColorsSize=7, RowSideColorsSize=2, KeyValueName="Ion intensity")
#dev.off()

After the selection, we can select the features included in the groups of interest and export them.

url_to_attributes = paste0("http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=", '60e8f7096e994d5c9717d471445f8bfe', "&block=main&file=clusterinfo_summary/")
gnps <- read.delim(url_to_attributes)

reddend <- gnps[gnps[,"cluster.index"] %in% labels(cdend[[1]]),]
reddend <- reddend[,c("cluster.index", "componentindex", "parent.mass", "RTMean", "LibraryID")]
reddend <- merge(reddend, dlist$ms2lda_nodes[,1:7], by.x='cluster.index', by.y='scans')
head(reddend)