knitr::opts_chunk$set(echo = TRUE)
This is a short tutorial to introduce motifHeat, associating hierarchical clustering to motif detection.
First we need to install the dependencies
packageList <- c("devtools", "dendextend") newPackages <- packageList[!(packageList %in% installed.packages()[,"Package"])] if(length(newPackages)) install.packages(newPackages) library("devtools") library("dendextend") if (!require("motifHeat")) { install_github("computational-chemical-biology/motifHeat") } else { library("motifHeat") }
You need your GNPS task id and optionally ms2lda:
dlist <- access_gnps('60e8f7096e994d5c9717d471445f8bfe', '56f089a1772340a0aa2978d41478e3e4')
Display motifs annotated:
head(dlist$motifs_in_scans)
Count the motifs and display the most frequent
motif_count <- table(dlist$motifs_in_scans[,'motif']) motif_count[order(motif_count, decreasing=TRUE)][1:5]
Select cluster indexes associated to most frequent
smotif <- dlist$motifs_in_scans[dlist$motifs_in_scans[,'motif'] %in% c('motif_480'), c('scan', 'motif')] head(smotif)
factorCorLis is a nested list with one color for each level of a factor:
factorColList <- list(list(colors=c("darkorchid","darkred"), factor='Extraction'), list(colors=c("green", "darkgreen"), factor='Trimethoprim'))
You can also embed plots, for example:
tab <- dlist$features meta <- dlist$metadata h <- format_heatmap(tab, meta, selectField='StrainName', selectValue='Burkholderia dolosa AU0645 Genomovar type VI', factorColList=factorColList)
We can also add motif labels to the features in the heatmap:
h <- format_heatmap(tab, meta, selectField='StrainName', labCol=smotif, selectValue='Burkholderia dolosa AU0645 Genomovar type VI', factorColList=factorColList)
Subselect metadata and repeat heatmap:
meta2 <- meta[meta$TimePoint=='48h' & grepl('EA', meta$Extraction) & grepl('Burkholderia', meta$StrainName),] factorColList <- list(list(colors=rainbow(length(unique(meta2$StrainName))), factor='StrainName')) h <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList)
Changing color scale:
library("gplots") h <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList, colorScale = redgreen(75))
Adding normalization:
h <- format_heatmap(tab, meta2, labCol=smotif, norm=TRUE, factorColList=factorColList, colorScale = redgreen(75))
Creating a pdf:
pdf('ea_48h_strains.pdf') h <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList, main='Color 1 not Normalized') h <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList, colorScale = redgreen(75), main='Color 2 not Normalized') h <- format_heatmap(tab, meta2, labCol=smotif, norm=TRUE, factorColList=factorColList, colorScale = redgreen(75), main='Color 2 Normalized') dev.off()
Repeating the process for other condition:
meta2 <- meta[meta$TimePoint=='48h' & grepl('SPE', meta$Extraction) & grepl('Burkholderia', meta$StrainName),] factorColList <- list(list(colors=rainbow(length(unique(meta2$StrainName))), factor='StrainName')) pdf('spe_48h_strains.pdf') h2 <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList, main='Color 1 not Normalized') h2 <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList, colorScale = redgreen(75), main='Color 2 not Normalized') h2 <- format_heatmap(tab, meta2, labCol=smotif, norm=TRUE, factorColList=factorColList, colorScale = redgreen(75), main='Color 2 Normalized') dev.off()
The heatmap result contains hierarchical clustering from samples and features. We can associate the sample grouping to colors
# needs preprocessing head(cbind(t(h$colors), colnames(tab)[grep('Peak area', colnames(tab))])[h$heatmap$rowInd,])
Create hclut object from dendrogram and plot the cluters with arbitrary number of groups
hc <- as.hclust(h2$heatmap$colDendrogram) plot(hc) plot(hc, labels=FALSE) rect.hclust(hc, 6)
Change the criteria to hight and record grouping
plot(hc, labels=FALSE) rect.hclust(hc, h=1500) hcgrp <- cutree(hc, h=1500)
Use grouping to create colored bars
dend <- h2$heatmap$colDendrogram dend %>% plot colored_bars(colors = hcgrp, dend = dend)
Create a table with all component indexes and associated cluster ids:
ms2lda <- cbind(c(dlist$ms2lda_edges[, 'CLUSTERID1'], dlist$ms2lda_edges[, 'CLUSTERID2']), rep(dlist$ms2lda_edges[, 'ComponentIndex'], 2)) ms2lda <- unique(ms2lda) colnames(ms2lda) <- c('CLUSTERID','ComponentIndex') ms2lda <- ms2lda[ms2lda[,2]!=-1,] head(ms2lda)
Merge component index and moftif tables
ms2lda <- merge(ms2lda, dlist$ms2lda_nodes[,c('scans', 'motif')], by.x='CLUSTERID', by.y='scans') tgrp <- data.frame(CLUSTERID=as.numeric(names(hcgrp)), HCA_GRP=hcgrp) ms2lda <- merge(ms2lda, tgrp, by.x='CLUSTERID', by.y='CLUSTERID') head(ms2lda)
Obtain the proportion of nodes from each connected component that are associated to the same group detectep by hierarchical clustering
comp2grp <- tapply(ms2lda[,4], ms2lda[,2], function(x) c(names(table(x))[which.max(table(x))], max(table(x))/length(x), length(x))) tcomp2grp <- do.call(rbind, comp2grp) tcomp2grp <- cbind(names(comp2grp), tcomp2grp) colnames(tcomp2grp) <- c('ComponentIndex', 'HCA_GRP', 'Max_Proportion', 'Total_Nodes') tcomp2grp <- tcomp2grp[order(as.numeric(tcomp2grp[,4]), decreasing=TRUE),] mfmotif <- tapply(ms2lda[,3], ms2lda[,2], function(x) names(which.max(table(unlist(sapply(x, strsplit, ',')))))) mfmotif[unlist(lapply(mfmotif, is.null))] <- '' tcomp2grp <- cbind(tcomp2grp, '') colnames(tcomp2grp)[5] <- 'Most_Freq_Motif' tcomp2grp[,5] <- unlist(mfmotif[tcomp2grp[,1]]) ulinks <- unique(dlist$motifs_in_scans[,c('motif', 'motifdb_url')]) ulinks <- as.matrix(ulinks) tcomp2grp <- cbind(tcomp2grp, '') ids <- match(tcomp2grp[,5], ulinks[,1]) tcomp2grp[!is.na(ids), 6] <- ulinks[ids[!is.na(ids)],2] head(tcomp2grp)
Save table
colnames(tcomp2grp)[6] <- 'Link' write.table(tcomp2grp, 'component_group_association.tsv', sep='\t', row.names=FALSE)
In order to localize interesting features in the dendrogram we can need first to select the interesting branches. Here we use the reources of the source function format_heatmap
.
meta2 <- meta[meta$TimePoint=='48h' & grepl('EA', meta$Extraction) & grepl('Burkholderia', meta$StrainName),] meta2 <- meta2[grep('cenocepacia', meta2$StrainName),] factorColList <- list(list(colors=rainbow(length(unique(meta2$StrainName))), factor='StrainName')) pdf('EA_48h_Cenocepacia_strains.pdf') h <- format_heatmap(tab, meta2, labCol=smotif, factorColList=factorColList, norm=TRUE, colorScale = redgreen(75)) dev.off() exc <- c('BcH14274_EA_48_2_P1-D-2_01_2859.mzXML', 'BcH14274_EA_48_TMP_2_P1-D-6_01_2865.mzXML', 'BcK562_EA_48_2_P1-E-2_01_2803.mzXML', 'BcK562_EA_48_TMP_3_P1-E-6_01_2811.mzXML') h <- format_heatmap(tab, meta2[!meta2[,1]%in% exc,], labCol=smotif, factorColList=factorColList, norm=TRUE, colorScale = redgreen(75)) cdend <- h$heatmap$colDendrogram colbranches <- function(n, col) { a <- attributes(n) # Find the attributes of current node # Color edges with requested color attr(n, "edgePar") <- c(a$edgePar, list(col=col, lwd=2)) n # Don't forget to return the node! } cdend[[2]][[2]][[2]][[2]][[1]] = dendrapply(cdend[[2]][[2]][[2]][[2]][[1]], colbranches, "blue") cdend[[1]] = dendrapply(cdend[[1]], colbranches, "red") plot(cdend) featureTable=tab; metadataTable=meta2[!meta2[,1]%in% exc,]; labCol=smotif; factorColList=factorColList; norm=TRUE; colorScale = redgreen(75) tab2 <- featureTable[, grep('Peak area', colnames(featureTable))] tab2 <- t(tab2) rownames(tab2) <- sub(' filtered Peak area$| Peak area$', '', rownames(tab2)) colnames(tab2) <- tab[,'row ID'] # order samples according to metadata tab2 <- tab2[metadataTable[,1],] tab2 <- tab2[,apply(tab2, 2, sum)!=0] if (norm) { tab2 <- t(apply(tab2, 1, function(x) x/sum(x))) } if (length(labCol)) { tmp <- rep('', ncol(tab2)) mt <- match(labCol[,1], colnames(tab2)) if(any(is.na(mt))) { labCol <- labCol[!is.na(mt),] mt <- mt[!is.na(mt)] } tmp[mt] <- labCol[,2] labCol <- tmp } else { labCol <- rep('', ncol(tab2)) } colList <- list() if (length(factorColList)) { fnames <- c() for(i in 1:length(factorColList)) { colList[[i]] <- factorColList[[i]]$colors names(colList[[i]]) <- unique(metadataTable[, factorColList[[i]]$factor]) colList[[i]] <- colList[[i]][metadataTable[, factorColList[[i]]$factor]] fnames <- append(fnames, factorColList[[i]]$factor) } rlab <- do.call(rbind, colList) rownames(rlab) <- fnames } mydist <- function(c) { dist(c, method="canberra") } myclust <- function(c) { hclust(c, method="ward.D") } # Load source code source('../R/heatmap.3.R') #pdf('EA_48h_Cenocepacia_selected_strains.pdf') h <- heatmap.3(tab2, hclustfun=myclust, distfun=mydist, na.rm = TRUE, scale="column", dendrogram="both", margins=c(6,12), Rowv=TRUE, Colv=cdend, RowSideColors=rlab, symbreaks=FALSE, key=TRUE, symkey=FALSE, density.info="none", trace="none", labCol=labCol, labRow=rownames(tab2), col=colorScale, ColSideColorsSize=7, RowSideColorsSize=2, KeyValueName="Ion intensity") #dev.off()
After the selection, we can select the features included in the groups of interest and export them.
url_to_attributes = paste0("http://gnps.ucsd.edu/ProteoSAFe/DownloadResultFile?task=", '60e8f7096e994d5c9717d471445f8bfe', "&block=main&file=clusterinfo_summary/") gnps <- read.delim(url_to_attributes) reddend <- gnps[gnps[,"cluster.index"] %in% labels(cdend[[1]]),] reddend <- reddend[,c("cluster.index", "componentindex", "parent.mass", "RTMean", "LibraryID")] reddend <- merge(reddend, dlist$ms2lda_nodes[,1:7], by.x='cluster.index', by.y='scans') head(reddend)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.