#' \code{\link{ramclustR}} - modified for MetMSLine
#'
#' @param peakTable optional if pcaResult argument not supplied. Either a data.frame,
#' full file path as a character string to a .csv file of a peak table in the
#' form observation (samples) in columns and variables (Mass spectral signals)
#' in rows. If argument is not supplied a GUI file selection window will open
#' and a .csv file can be selected.
#' @param obsNames character vector of observation (i.e. sample/ QC/ Blank) names
#' to identify appropriate observation (sample) columns.
#' @param st Sigma t value controlling the rate at which feature similarity
#' decays with respect to differences in retention times between feature pairs.
#' When an xcms diff report is used as input, this parameter is by default set to
#' half the peak width of all the peaks in the xcmsSet. Can be manually overridden
#' to tune performance when necessary. Larger values are less selective, smaller
#' values are more selective.
#' @param sr Sigma r value controlling the rate at which feature similarity
#' decays with respect to correlational simiarity between a pair of features.
#' numeric e.g. 0.5, Sigma r: the rate of similarity decay in response to
#' correlational r . Can be manually overridden to tune performance. Larger
#' values are less selective, smaller values are more selective.
#' @param maxt numeric e.g 20, the maximum time allowed for similarity between
#' features to be greater than zero. For example, if two features are separated
#' by more than 20 seconds, the similarity score is automatically set to zero,
#' ensuring that they end up in different clusters. If you are importing data
#' from another vendor, ensure that either you change the feature times to
#' seconds (default in ramclustR) or you change this value to a value
#' representing minutes, rather than seconds (e.g. 0.33). Used primarily to
#' speed up compuation time by not calculating values above this threshold.
#' @param deepSplit boolean e.g. =FALSE, access to the deepSplit function in
#' \code{\link{dynamicTreeCut}}. False generates fewer spectra,
#' TRUE splits spectra more readily and thereby generates more spectra.
#' For more detailed description, see documentation for the
#' \code{\link{dynamicTreeCut}} package.
#' @param blocksize integer number of features processed in one block =2000,
#' ramclustR makes attempts to minimize memory demands to maintain processing
#' speed by avoiding memory constraints. It does so by processing the full
#' similarity matrix in blocks of blocksize dimensions. This has two benefits:
#' one is that a given PC is less likely to run out of memory during processing
#' and the second is that we need not process blocks that contain not feature
#' pairs with a delta rt of greater than maxt, thereby avoiding needless
#' calculations and processing time.
#' @param minModuleSize the number of features required for a cluster to be
#' reported. That is, if a feature is clustered in a group with less than
#' minModuleSize features, it will not be exported. If you wish to analyze
#' singletons, set minModuleSize=1.
#' @param linkage The linkage method used to perform heirarchical
#' fastcluster-based heirarchical clustering. default="average".
#'
#' @return a list containing 2 elements:
#'
#' 1. the original peak table with the results of retention time
#' correlation clustering included in additional column "RtCorrClust" as a
#' data frame.
#'
#' 2. A data frame of the weighted means and details of the most intense
#' feature for each group. In addition for each group details of the best sample
#' to consider for subsequent MSn fragmentation analysis based on the most
#' top 5 most intense samples is provided. This "best" sample for MS/MS is also
#' selected whilst concurrently minimising the number of samples to potentially
#' reinject for targetted MSn fragmentation analysis.
#'
#' @source \url{http://pubs.acs.org/doi/abs/10.1021/ac501530d} RAMClust: A Novel
#' Feature Clustering Method Enables Spectral-Matching-Based Annotation for
#' Metabolomics Data.
#' @export
ramClustMod <- function (peakTable=NULL, obsNames=NULL,
st = 3, sr = 0.5, maxt = 20, mult = 5, deepSplit = FALSE, blocksize = 2000,
hmax = 0.3, minModuleSize = 2, linkage = "average"){
# package dependencies
# require(xcms, quietly = TRUE)
# require(ff, quietly = TRUE)
# require(fastcluster, quietly = TRUE)
# require(dynamicTreeCut, quietly = TRUE)
if(is.null(obsNames)){
stop('argument obsNames is missing with no default')
}
# error handling or read from csv function
peakTable <- MetMSLine:::tableCheckRead(peakTable, stringsAsFactors=F)
# match obsNames to peak table colnames
obsIndx <- match(obsNames, colnames(peakTable))
# if less than all matched then stop
if(length(obsIndx) < length(obsNames)){
stop(length(obsIndx), " of ", length(obsNames),
" observation names were matched in the peakTable column names, check the obsNames and peakTable column names")
}
# replace any NAs with zero
obsTable <- t(peakTable[, obsIndx])
# if contains rtmin and rtmax columns
if(all(c("rtmin", "rtmax") %in% colnames(peakTable))){
st <- median((peakTable[, "rtmax"] - peakTable[, "rtmin"])/2)
}
times <- peakTable[, "rtmed"]
mzs <- peakTable[, "mzmed"]
xcmsOrd <- order(times)
obsTable <- obsTable[, order(times)]
mzs <- mzs[order(times)]
times <- times[order(times)]
featnames <- paste(mzs, "_", times, sep = "")
dimnames(obsTable)[[2]] <- featnames
n <- ncol(obsTable)
vlength <- (n * (n - 1))/2
nblocks <- floor(n/blocksize)
ffmat <- ff::ff(vmode = "double", dim = c(n, n), init = 0)
gc()
eval1 <- expand.grid(0:nblocks, 0:nblocks)
names(eval1) <- c("j", "k")
eval1 <- eval1[which(eval1[, "j"] <= eval1[, "k"]), ]
bl <- nrow(eval1)
a <- Sys.time()
cat("\n", paste("calculating ramclustR similarity: nblocks = ",
bl))
cat("\n", "finished:")
RCsim <- function(bl) {
cat(bl, " ")
j <- eval1[bl, "j"]
k <- eval1[bl, "k"]
startc <- min((1 + (j * blocksize)), n)
if ((j + 1) * blocksize > n) {
stopc <- n
}
else {
stopc <- (j + 1) * blocksize
}
startr <- min((1 + (k * blocksize)), n)
if ((k + 1) * blocksize > n) {
stopr <- n
}
else {
stopr <- (k + 1) * blocksize
}
if (startc <= startr) {
mint <- min(abs(outer(range(times[startr:stopr]),
range(times[startc:stopc]), FUN = "-")))
if (mint <= maxt) {
temp1 <- round(exp(-(((abs(outer(times[startr:stopr],
times[startc:stopc], FUN = "-"))))^2)/(2 *
(st^2))), digits = 20)
temp2 <- round(exp(-((1 - (pmax(cor(obsTable[, startr:stopr],
obsTable[, startc:stopc]))))^2)/(2 * (sr^2))),
digits = 20)
temp <- 1 - (temp1 * temp2)
temp[which(is.nan(temp))] <- 1
temp[which(is.na(temp))] <- 1
temp[which(is.infinite(temp))] <- 1
ffmat[startr:stopr, startc:stopc] <- temp
rm(temp1)
rm(temp2)
rm(temp)
gc()
}
if (mint > maxt) {
ffmat[startr:stopr, startc:stopc] <- 1
}
}
gc()
}
system.time(sapply(1:bl, RCsim))
b <- Sys.time()
cat("\n", "\n")
cat(paste("RAMClust feature similarity matrix calculated and stored:",
round(difftime(b, a, units = "mins"), digits = 1), "minutes"))
gc()
blocksize <- mult * round(blocksize^2/n)
nblocks <- floor(n/blocksize)
remaind <- n - (nblocks * blocksize)
RC <- vector(mode = "integer", length = vlength)
if(nblocks != 0){
pb <- txtProgressBar(min=0, max=nblocks, style=3)
}
for (k in 0:(nblocks)) {
if(nblocks != 0){
Sys.sleep(0.01)
setTxtProgressBar(pb, k)
}
startc <- 1 + (k * blocksize)
if ((k + 1) * blocksize > n) {
stopc <- n
}
else {
stopc <- (k + 1) * blocksize
}
temp <- ffmat[startc:nrow(ffmat), startc:stopc]
temp <- temp[which(row(temp) - col(temp) > 0)]
if (exists("startv") == FALSE)
startv <- 1
stopv <- startv + length(temp) - 1
RC[startv:stopv] <- temp
gc()
startv <- stopv + 1
rm(temp)
gc()
}
rm(startv)
gc()
message("Converting RAMClustR similarity matrix to a distance object...this
is a potentially computationally intensive step please wait...")
flush.console()
RC <- structure(RC, Size = (n), Diag = FALSE, Upper = FALSE,
method = "RAMClustR", Labels = featnames, class = "dist")
gc()
c <- Sys.time()
cat("\n", "\n")
cat(paste("RAMClust distances converted to distance object:",
round(difftime(c, b, units = "mins"), digits = 1), "minutes"))
ff::delete.ff(ffmat)
rm(ffmat)
gc()
system.time(RC <- fastcluster::hclust(RC, method = linkage))
gc()
d <- Sys.time()
cat("\n", "\n")
cat(paste("fastcluster based clustering complete:", round(difftime(d,
c, units = "mins"), digits = 1), "minutes"))
message("cutting dendrogram dynamically...")
flush.console()
if (minModuleSize == 1) {
clus <- dynamicTreeCut::cutreeDynamicTree(RC, maxTreeHeight = hmax, deepSplit = deepSplit,
minModuleSize = 2)
sing <- which(clus == 0)
clus[sing] <- max(clus) + 1:length(sing)
}
if (minModuleSize > 1) {
clus <- dynamicTreeCut::cutreeDynamicTree(RC, maxTreeHeight = hmax,
deepSplit = deepSplit,
minModuleSize = minModuleSize)
}
gc()
rm(RC)
peakTable$RtCorrClusts <- clus[order(xcmsOrd)]
zeroIndx <- which(peakTable$RtCorrClusts != 0)
message("Calculating weighted mean for ",
length(unique(peakTable$RtCorrClusts[zeroIndx])),
" pseudospectra accounting for ", length(peakTable$RtCorrClusts[zeroIndx]),
" of ", nrow(peakTable), " total features ")
flush.console()
# calculate weighted.means pseudospectra and return max intensity feature
# and sample
wMeanPspec <- by(peakTable[zeroIndx, ],
as.factor(peakTable$RtCorrClust[zeroIndx]), function(x){
# calc weights each variable
wts.tmp <- rowSums(x[, obsIndx])
wt.mean.tmp <- apply(x[, obsIndx], 2, function(y){
weighted.mean(y, wts.tmp)})
# concatenate max feature intensity and weighted means
maxFeat.indx <- which.max(wts.tmp)
# required columns from xcms diff
reqCols <- seq(1, ncol(peakTable), 1)
reqCols <- colnames(peakTable)[setdiff(reqCols, obsIndx)]
# calculate average intensity for max weight
meanInt <- mean(as.numeric(x[maxFeat.indx, obsIndx]))
names(meanInt) <- "meanInt"
# identify top 5 max Int samples
top5int <- head(colnames(sort(x[maxFeat.indx, obsIndx],
decreasing=T)), n=5)
names(top5int) <- paste0("mostIntSamp_", 1:5)
# sum features in pseudospectra
nFeatPspec <- nrow(x)
names(nFeatPspec) <- "nFeatPspec"
wt.mean.tmp <- c(x[maxFeat.indx, reqCols], meanInt,
nFeatPspec, top5int, wt.mean.tmp)
return(wt.mean.tmp)})
# convert by array to data frame
wMeanPspec <- data.frame(do.call(rbind, wMeanPspec), stringsAsFactors=F)
# frequency of most intense sample
mostIntIndx <- grep("mostIntSamp_", colnames(wMeanPspec))
freqMostInt <- sort(table(unlist(wMeanPspec[, mostIntIndx])), decreasing=T)
# add a column for the best sample to select for MS/MS
bestSampMSMS <- apply(wMeanPspec[, mostIntIndx], 1, function(x){
mostIntSamp.tmp <- unlist(x)
indx.tmp <- which(names(freqMostInt) %in%
mostIntSamp.tmp)
bestSamp.tmp <- names(freqMostInt)[indx.tmp[1]]
return(bestSamp.tmp)})
# rearrange column order and add in best sample for MS/MS
sampIdLogi <- colnames(wMeanPspec) %in% obsNames
RtCorrClust <- sort(unique(peakTable$RtCorrClust))
RtCorrClust <- RtCorrClust[RtCorrClust != 0]
wMeanPspec <- data.frame(RtCorrClust=RtCorrClust,
wMeanPspec[, sampIdLogi == F], bestSampMSMS,
wMeanPspec[, sampIdLogi == T], stringsAsFactors=F)
# coerce list to character
wMeanPspec <- data.frame(lapply(wMeanPspec, as.character), stringsAsFactors=F)
# return a list containing the original xcms diff report with rt corr clust
# info and the weighted mean data frame
return(list(diffRepAdd=peakTable, wMeanPspec=wMeanPspec))
# RC$featclus <- clus
# RC$frt <- times
# RC$fmz <- mzs
# RC$xcmsOrd <- xcmsOrd
# msint <- rep(0, length(RC$fmz))
# message("Calculating weighted means for each pseudospectral cluster...")
# flush.console()
# for (i in 1:ncol(obsTable)) {
# msint[i] <- weighted.mean(obsTable[, i], obsTable[, i])
# }
# RC$msint <- msint
# # if (ExpDes[[2]]["MSlevs", 1] == 2) {
# # msmsint <- rep(0, length(RC$fmz))
# # for (i in 1:ncol(obsTable)) {
# # msmsint[i] <- weighted.mean(data2[, i], data2[, i])
# # }
# # RC$msmsint <- msmsint
# # }
# clrt <- aggregate(RC$frt, by = list(RC$featclus), FUN = "mean")
# RC$clrt <- clrt[which(clrt[, 1] != 0), 2]
# clrtsd <- aggregate(RC$frt, by = list(RC$featclus), FUN = "sd")
# RC$clrtsd <- clrtsd[which(clrtsd[, 1] != 0), 2]
# RC$nfeat <- as.vector(table(RC$featclus)[2:max(RC$featclus)])
# RC$nsing <- length(which(RC$featclus == 0))
# e <- Sys.time()
# cat("\n", "\n")
# cat(paste("dynamicTreeCut based pruning complete:", round(difftime(e,
# d, units = "mins"), digits = 1), "minutes"))
# f <- Sys.time()
# cat("\n", "\n")
# cat(paste("RAMClust has condensed", n, "features into", max(clus),
# "spectra in", round(difftime(f, a, units = "mins"), digits = 1),
# "minutes", "\n"))
# RC$cmpd <- paste("C", 1:length(RC$clrt), sep = "")
# RC$ann <- RC$cmpd
# RC$annconf <- rep("", length(RC$clrt))
# RC$annnotes <- rep("", length(RC$clrt))
# RC$MSdata <- obsTable
#
# cat("\n", "\n", "... collapsing features into spectra")
# wts <- colSums(obsTable[])
# RC$SpecAbund <- matrix(nrow = nrow(obsTable), ncol = max(clus))
# for (ro in 1:nrow(RC$SpecAbund)) {
# for (co in 1:ncol(RC$SpecAbund)) {
# RC$SpecAbund[ro, co] <- weighted.mean(obsTable[ro,
# which(RC$featclus == co)], wts[which(RC$featclus ==
# co)])
# }
# }
# dimnames(RC$SpecAbund)[[1]] <- dimnames(RC$MSdata)[[1]]
# dimnames(RC$SpecAbund)[[2]] <- paste("C", 1:ncol(RC$SpecAbund),
# sep = "")
#
# g <- Sys.time()
# cat("\n", "\n")
# cat(paste("RAMClustR has collapsed feature quantities\n into spectral quantities:",
# round(difftime(g, f, units = "mins"), digits = 1),
# "minutes", "\n"))
#
# rm(obsTable)
#
# # if (length(dimnames(RC$SpecAbund)[[1]]) > length(unique(dimnames(RC$SpecAbund)[[1]]))) {
# # RC$SpecAbundAve <- aggregate(RC$SpecAbund[, 1:ncol(RC$SpecAbund)],
# # by = list(dimnames(RC$SpecAbund)[[1]]), FUN = "mean",
# # simplify = TRUE)
# # dimnames(RC$SpecAbundAve)[[1]] <- RC$SpecAbundAve[, 1]
# # RC$SpecAbundAve <- as.matrix(RC$SpecAbundAve[, 2:ncol(RC$SpecAbundAve)])
# # dimnames(RC$SpecAbundAve)[[2]] <- dimnames(RC$SpecAbund)[[2]]
# # gc()
# # }
# #
# # gc()
#
# return(RC)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.