#' rc.ramclustr
#'
#' Main clustering function for grouping features based on their analytical behavior.
#'
#' @param ramclustObj ramclustR object: containing ungrouped features. constructed by rc.get.xcms.data, for example
#' @param st numeric: sigma t - time similarity decay value
#' @param sr numeric: sigma r - correlational similarity decay value
#' @param maxt numeric: maximum time difference to calculate retention similarity for - all values beyond this are assigned similarity of zero
#' @param deepSplit logical: controls how agressively the HCA tree is cut - see ?cutreeDynamicTree
#' @param blocksize integer: number of features (scans?) processed in one block =1000,
#' @param mult numeric: internal value, can be used to influence processing speed/ram usage
#' @param hmax numeric: precut the tree at this height, default 0.3 - see ?cutreeDynamicTree
#' @param minModuleSize integer: how many features must be part of a cluster to be returned? default = 2
#' @param linkage character: heirarchical clustering linkage method - see ?hclust
#' @param cor.method character: which correlational method used to calculate 'r' - see ?cor
#' @param rt.only.low.n logical: default = TRUE At low injection numbers, correlational relationships of peak intensities may be unreliable. by defualt ramclustR will simply ignore the correlational r value and cluster on retention time alone. if you wish to use correlation with at n < 5, set this value to FALSE.
#' @param collapse logical: if true (default), feature quantitative values are collapsed into spectra quantitative values.
#' @param cor.use character: see 'use' option from 'cor' method. Default = 'everything'. if alternate (i.e. 'pairwise.complete.obs'), and NA may be returned, NA are replaced with 0 before clustering.
#' @details Main clustering function output - see citation for algorithm description or vignette('RAMClustR') for a walk through. batch.qc. normalization requires input of three vectors (1) batch (2) order (3) qc. This is a feature centric normalization approach which adjusts signal intensities first by comparing batch median intensity of each feature (one feature at a time) QC signal intensity to full dataset median to correct for systematic batch effects and then secondly to apply a local QC median vs global median sample correction to correct for run order effects.
#' @return $featclus: integer vector of cluster membership for each feature
#' @return $clrt: cluster retention time
#' @return $clrtsd: retention time standard deviation of all the features that comprise that cluster
#' @return $nfeat: number of features in the cluster
#' @return $nsing: number of 'singletons' - that is the number of features which clustered with no other feature
#' @return $cmpd: compound name. C#### are assigned in order of output by dynamicTreeCut. Compound with the most features is classified as C0001...
#' @return $ann: annotation. By default, annotation names are identical to 'cmpd' names. This slot is a placeholder for when annotations are provided
#' @return $SpecAbund: the cluster intensities after collapsing features to clusters
#' @return $SpecAbundAve: the cluster intensities after averaging all samples with identical sample names
#' @importFrom graphics abline axis boxplot hist "legend" "par" "plot" "points" "title"
#' @importFrom stats aggregate cor fitted lm loess median predict quantile sd weighted.mean
#' @importFrom utils edit read.csv read.delim read.delim2 write.csv packageVersion citation
#' @importFrom fastcluster hclust
#' @importFrom dynamicTreeCut cutreeDynamicTree
#' @importFrom e1071 skewness
#' @importFrom preprocessCore normalize.quantiles
#'
#' @references Broeckling CD, Afsar FA, Neumann S, Ben-Hur A, Prenni JE. RAMClust: a novel feature clustering method enables spectral-matching-based annotation for metabolomics data. Anal Chem. 2014 Jul 15;86(14):6812-7. doi: 10.1021/ac501530d. Epub 2014 Jun 26. PubMed PMID: 24927477.
#' @references Broeckling CD, Ganna A, Layer M, Brown K, Sutton B, Ingelsson E, Peers G, Prenni JE. Enabling Efficient and Confident Annotation of LC-MS Metabolomics Data through MS1 Spectrum and Time Prediction. Anal Chem. 2016 Sep 20;88(18):9226-34. doi: 10.1021/acs.analchem.6b02479. Epub 2016 Sep 8. PubMed PMID: 7560453.
#' @concept ramclustR
#' @concept RAMClustR
#' @concept metabolomics
#' @concept mass spectrometry
#' @concept clustering
#' @concept feature
#' @concept MSFinder
#' @concept xcms
#' @author Corey Broeckling
#' @export
rc.ramclustr <- function(
ramclustObj=NULL,
st=NULL,
sr=NULL,
maxt=NULL,
deepSplit=FALSE,
blocksize=2000,
mult=5,
hmax=NULL,
collapse=TRUE,
minModuleSize=2,
linkage="average",
cor.method="pearson",
rt.only.low.n = TRUE,
cor.use = "everything"
) {
if(is.null(ramclustObj)) {
stop("please supply a ramclustR object as input. see rc.get.xcms.data, for example")
}
########
# define ms levels, used several times below
if(is.null(ramclustObj$MSMSdata)) {
mslev <- 1
} else {
mslev <- 2
}
a<-Sys.time()
########
# set default parameters of not defined in function call
if(is.null(hmax)) {hmax<-0.3}
if(is.null(st)) {
if(!is.null(ramclustObj$st)) {
st <- ramclustObj$st
} else {
# use max feature retention time / 500 as a reasonable approximation
# not really validated....
st <- max(ramclustObj$frt)/500
}
}
if(is.null(maxt)) {maxt <- 100*st}
if(is.null(sr)) {sr <- 0.7}
params <- c(
"st"=st,
"sr"=sr,
"maxt"=maxt,
"deepSplit"=deepSplit,
"blocksize"=blocksize,
"mult"=mult,
"hmax"=hmax,
"collapse"=collapse,
"minModuleSize"=minModuleSize,
"linkage"=linkage,
"cor.method"=cor.method,
"rt.only.low.n" = rt.only.low.n
)
########
# establish some constants for downstream processing
data1 <- ramclustObj$MSdata
if(mslev == 2) {
data2 <- ramclustObj$MSMSdata
} else {
data2 <- data1
}
n<-ncol(data1)
vlength<-(n*(n-1))/2
nblocks<-floor(n/blocksize)
times <- ramclustObj$frt
mzs <- ramclustObj$fmz
featnames <- paste(ramclustObj$frt, ramclustObj$fmz, sep = "_")
########
# make list of all row and column blocks to evaluate
eval1<-expand.grid(0:nblocks, 0:nblocks)
names(eval1)<-c("j", "k") #j for cols, k for rows
eval1<-eval1[which(eval1[,"j"]<=eval1[,"k"]),] #upper triangle only
bl<-nrow(eval1)
tmp.ramclustObj <- calculate.similarity(
numcols = n,
data1 = data1, data2 = data2,
times = times, blocksize = blocksize,
mult = mult, maxt = maxt,
st = st, sr = sr,
rt.only.low.n = rt.only.low.n,
cor.method = cor.method,
cor.use = cor.use)
tmp.ramclustObj<-structure(
tmp.ramclustObj,
Size=(n),
Diag=FALSE,
Upper=FALSE,
method="RAMClustR",
Labels=featnames,
class="dist")
tmp.ramclustObj[which(is.na(tmp.ramclustObj))] <- 1
gc()
c<-Sys.time()
# cat("RAMClust distances converted to distance object", '\n')
########
# cluster using fastcluster package,
system.time(tmp.ramclustObj<-fastcluster::hclust(
tmp.ramclustObj,
method=linkage))
gc()
d<-Sys.time()
cat("fastcluster based clustering complete", '\n')
if(minModuleSize==1) {
clus<-dynamicTreeCut::cutreeDynamicTree(tmp.ramclustObj, maxTreeHeight=hmax, deepSplit=deepSplit, minModuleSize=2)
sing<-which(clus==0)
clus[sing]<-max(clus)+1:length(sing)
}
if(minModuleSize>1) {
clus<-dynamicTreeCut::cutreeDynamicTree(
tmp.ramclustObj,
maxTreeHeight=hmax,
deepSplit=deepSplit,
minModuleSize=minModuleSize)
}
gc()
########
# build results into ramclustObj
ramclustObj$merger <- tmp.ramclustObj$merger
ramclustObj$height <- tmp.ramclustObj$height
ramclustObj$order <- tmp.ramclustObj$order
ramclustObj$labels <- tmp.ramclustObj$labels
ramclustObj$method <- tmp.ramclustObj$method
ramclustObj$call <- tmp.ramclustObj$call
ramclustObj$dist.method <- tmp.ramclustObj$dist.method
ramclustObj$featclus<-clus
clrt<-aggregate(ramclustObj$frt, by=list(ramclustObj$featclus), FUN="mean")
ramclustObj$clrt<-clrt[which(clrt[,1]!=0),2]
clrtsd<-aggregate(ramclustObj$frt, by=list(ramclustObj$featclus), FUN="sd")
ramclustObj$clrtsd<-clrtsd[which(clrtsd[,1]!=0),2]
ramclustObj$nfeat<-as.vector(table(ramclustObj$featclus)[2:max(ramclustObj$featclus)])
ramclustObj$nsing<-length(which(ramclustObj$featclus==0))
e<-Sys.time()
cat("dynamicTreeCut based pruning complete", '\n')
f<-Sys.time()
cat(paste("RAMClust has condensed", n, "features into", max(clus), "spectra", '\n'))
strl<-nchar(max(ramclustObj$featclus)) - 1
ramclustObj$cmpd<-paste("C", formatC(1:length(ramclustObj$clrt), digits = strl, flag = 0 ) , sep="")
# cat(ramclustObj$cmpd[1:10], '\n')
ramclustObj$ann<-ramclustObj$cmpd
ramclustObj$annconf<-rep(4, length(ramclustObj$clrt))
ramclustObj$annnotes<-rep("", length(ramclustObj$clrt))
########
# collapse feature dataset into spectrum dataset
if(collapse=="TRUE") {
cat("collapsing feature into spectral signal intensities", '\n')
wts<-colMeans(data1[], na.rm = TRUE)
ramclustObj$SpecAbund<-matrix(nrow=nrow(data1), ncol=max(clus))
for (ro in 1:nrow(ramclustObj$SpecAbund)) {
for (co in 1:ncol(ramclustObj$SpecAbund)) {
ramclustObj$SpecAbund[ro,co]<- weighted.mean(data1[ro,which(ramclustObj$featclus==co)], wts[which(ramclustObj$featclus==co)], na.rm = TRUE)
}
}
dimnames(ramclustObj$SpecAbund)[[2]]<-ramclustObj$cmpd
g<-Sys.time()
}
if(!is.null(ramclustObj$sample_names)) {
smp.names <- ramclustObj$sample_names
} else {
if(!is.null(ramclustObj$phenoData[,"sample_names"])) {
smp.names <- ramclustObj$sample_names
} else {
smp.names <- ramclustObj$sample.names
}
}
if(is.null(smp.names)) {
stop("sample names is null")
} else {
dimnames(ramclustObj$SpecAbund)[[1]] <- smp.names
}
if(is.null(dimnames(ramclustObj$SpecAbund)[[1]])) {
stop('this appears to be an older format ramclustR object and does not have a "phenoData" slot with sample names')
}
if(nrow(ramclustObj$MSdata) < 5 & rt.only.low.n) {
warning('\n', "too few samples to use correlational similarity, clustering by retention time only", '\n')
}
ramclustObj$history$ramclustr <- paste0(
"Features were clustered using the ramclustR algorithm (Broeckling 2014). Parameter settings were as follows: ",
"st = ", st,
", sr = ", sr,
", maxt = ", maxt,
", deepSplit = ", deepSplit,
", hmax = ", hmax,
", minModuleSize = ", minModuleSize,
", and cor.method = ", cor.method, ". ",
if(nrow(ramclustObj$MSdata) < 5 & rt.only.low.n) {
"There were too few samples to use correlational similarity, thus clustering was performed using retention time only. "
} else {""}
)
if(is.null(ramclustObj$params)) {ramclustObj$params <- list()}
ramclustObj$params$rc.ramclustr <- params
return(ramclustObj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.