#' pmfxcms
#'
#' A wrapper function for XCMS using common PMF platforms.
#' @details This function uses a template file as input to guide XCMS parameter selection.
#' @details Peak detection using centwave (LC-TOF) or matchedFilter (GC-Quad), density based peak grouping, loess rt correction, regrouping, and fillPeaks.
#' @details Additionally performs unsupervised removal of ultrawide chromatgoraphic peaks and outlier detection and removal after peak detection step.
#' @details an XCMS object is returned.
#'
#' @param ExpDes R object generated by RAMClustR::defineExperiment function.
#' @param MStag Character: "01.cdf" the text string used to recognize xmcs files as MS (low collision energy) when using indiscriminant MS/MS (MSE, AIF, ...)
#' @param idMSMStag Character: "02.cdf",the text string used to recognize xmcs files as MS/MS (high collision energy) when using indiscriminant MS/MS (MSE, AIF, ...)
#' @param filetype Character "cdf", The file extension - will be appended to file names in the csv 'factor file'. case insensitive.
#' @param factorfile Character "seq.csv", The csv file containing filename (column 1) and sample name (column 2), Sample names can contain a delimitor separating factors, but all samples must contain the same number of factors (delimitors)
#' @param cores Integer default = 4, how many computer cores to use in processing
#' @param minpw Numerica (3) minimum peak width in seconds
#' @param maxpw Numeric (30) maximum peak width in seconds
#' @param filtPeaks Logical - TRUE. Should a filtering step be performed after peak detection to remove peaks out of minpw-maxpw range?
#' @param ppmerr Numeric (35). If CentWave is used, what ppm error setting is used for feature detection?
#' @param minTIC Numeric (10). If outTIC = TRUE, total ion signal minTIC fold lower than the median total ion signal are removed.
#' @param outTIC Logical (TRUE). Should we remove injections based on total ion signal following peak detection? Uses outlier detection based on the distribution of signal values across samples and a cutoff defined by minTIC (on the low end of the distribution only)
#' @param outPCA Logical (TRUE). Should we remove injections based on binned mass distribution after peak detection? Uses outlier detection on PC1 scores.
#' @param pcut Numeric (0.05). p.value cutoff used to detect outliers. p values optionall corrected by p.adj (benhamini hochberg by default)
#' @param p.adj character ("BH"). what pvalue false discovery rate correction to use on outlier detection steps. see ?p.adjust for options.
#' @param snthresh numeric (10). signal to noise threshold used for feature detection (centWave or matchedFilter)
#' @param mzdiffquad (0.5) mz bin size used for matchedFilter algorithm.
#' @param bwpre numeric (3). bw parameter used in the feature grouping (correspondence) step BEFORE retention time adjustement
#' @param bwpost numeric (1.5). bw parameter used in the feature grouping (correspondence) step AFTER retention time adjustment. If rt adjustment works well, we should be able to use a lower value than bwpre.
#' @param minfrac numeric (0.15). minimum fraction of samples a feature must be found in (of full dataset) to be passed to final sample set.
#' @param fwhm numeric (3). peak width at half height, used in matchedFilter algorithm.
#' @param maxt numeric (NULL). retention time in seconds - peaks with retention times greater than maxt are excluded.
#' @param step ?? (NULL).
#' @param rtcor character("loess"). one of NULL, "loess", "linear", "obiwarp", "nearest" NULL
#' @param seqskip integer (0). Number of lines from top of factorfile to skip before starting. rarely used.
#' @param regroup logical (FALSE). If true, try to load saved xcms object from 'datasets/' directory in working directory, skipping peak detection.
#' @param rem.files vector of integer values. if regroup == TRUE, optional vector of sample numbers to remove.
#' @return returns an xcms object
#' @concept RAMClustR
#' @author Corey Broeckling
#' @export
pmfxcms<-function(
ExpDes = NULL,
MStag="01.cdf",
idMSMStag="02.cdf",
filetype="cdf",
factorfile="seq.csv",
cores=4,
minpw=3,
maxpw=30,
filtPeaks=TRUE,
ppmerr=35,
minTIC=10,
outTIC=TRUE,
outPCA=TRUE,
pcut=0.05,
p.adj="BH",
snthresh=10,
mzdiffquad=0.5,
bwpre=3,
bwpost=1,
minfrac=0.15,
fwhm=3,
maxt=NULL,
step=NULL,
rtcor="loess", #one of NULL, "loess", "linear", "obiwarp", "nearest" NULL
seqskip=0,
regroup=FALSE,
rem.files=NULL
) {
require(xcms)
require(pcaMethods)
## INTERNAL FUNCTIONS
minlTIC<-function(xcmsObj=NULL,
mintic=minTIC) {
filelist<-vector()
for(i in 1:length(xcmsObj@filepaths)) {
fpv<-unlist(strsplit(xcmsObj@filepaths[i], "[/]"))
filelist<-c(filelist, fpv[length(fpv)] )
}
TIC<-aggregate(xcmsObj@peaks[,"into"], by=list(xcmsObj@peaks[,"sample"]), FUN="sum")[,2]
if(grepl("Xevo", ExpDes$instrument["msinst",1]) & !is.null(ExpDes$instrument["CE2",1])) {
MSfiles<-filelist[grep(MStag, filelist, ignore.case=TRUE)]
MSMSfiles<-filelist[grep(idMSMStag, filelist, ignore.case=TRUE)]
TIC<-TIC[grep(MStag, filelist, ignore.case=TRUE)]}
if(grepl("GC", ExpDes[[1]]["platform",1])){
MSfiles<-filelist
}
outliers<- which(TIC<(median(TIC)/mintic))
if(length(outliers)>0){
if(is.null(MStag)==FALSE) {
outliers<-c(MSfiles[outliers], sub(MStag, idMSMStag, MSfiles[outliers], ignore.case=TRUE))
}
keep<-vector(length=length(filelist)); keep[]<-"keep"
keep[which(match(filelist, outliers)>0)]<-"outlier"
outliers<-split(xcmsObj, f=keep)$outlier
save(outliers, file="datasets/xcmsPeaks_minTICOutliers.Rdata")
xcmsObj<-split(xcmsObj, f=keep)$keep
}
return(xcmsObj)[[1]]
}
outlTIC<-function(xcmsObj=NULL) {
filelist<-vector()
for(i in 1:length(xcmsObj@filepaths)) {
fpv<-unlist(strsplit(xcmsObj@filepaths[i], "[/]"))
filelist<-c(filelist, fpv[length(fpv)] )
}
TIC<-aggregate(xcmsObj@peaks[,"into"], by=list(xcmsObj@peaks[,"sample"]), FUN="sum")[,2]
if(grepl("Xevo", ExpDes$instrument["msinst",1]) & !is.null(ExpDes$instrument["CE2",1])) {
MSfiles<-filelist[grep(MStag, filelist, ignore.case=TRUE)]
MSMSfiles<-filelist[grep(idMSMStag, filelist, ignore.case=TRUE)]
TIC<-TIC[grep(MStag, filelist, ignore.case=TRUE)]}
if(grepl("GC", ExpDes[[1]]["platform",1])){
MSfiles<-filelist
}
TIC<-as.vector(scale(TIC, center=TRUE, scale=TRUE))
pvals<-p.adjust(2*pnorm(abs(TIC), lower.tail=FALSE), method="BH")
outliers<- which(pvals<pcut)
if(length(outliers)>0){
if(is.null(MStag)==FALSE) {
outliers<-c(MSfiles[outliers], sub(MStag, idMSMStag, MSfiles[outliers], ignore.case=TRUE))
}
keep<-vector(length=length(filelist)); keep[]<-"keep"
keep[which(match(filelist, outliers)>0)]<-"outlier"
outliers<-split(xcmsObj, f=keep)$outlier
save(outliers, file="datasets/xcmsPeaksTICOutliers.Rdata")
xcmsObj<-split(xcmsObj, f=keep)$keep
}
return(xcmsObj)[[1]]
}
outlPCA<-function(
xcmsObj=xset
){
filelist<-vector()
for(i in 1:length(xcmsObj@filepaths)) {
fpv<-unlist(strsplit(xcmsObj@filepaths[i], "[/]"))
filelist<-c(filelist, fpv[length(fpv)] )
}
MSMSfiles<-filelist[grep(idMSMStag, filelist, ignore.case=TRUE)]
if(length(MSMSfiles) > 0 & ExpDes$instrument[which(row.names(ExpDes$instrument) == "MSlevs"),1] == 2) {
MSfiles<-filelist[grep(MStag, filelist, ignore.case=TRUE)]
} else {
MSfiles<-filelist
}
maxmz<-ceiling(max(xcmsObj@peaks[,"mz"])) + 2
minmz<-floor(min(xcmsObj@peaks[,"mz"])) - 2
range<-maxmz-minmz
binmat<-matrix(nrow=length(MSfiles), ncol=range)
colnames(binmat)<-as.character(minmz:(maxmz-1))
for(i in 1:length(MSfiles)){
targ<-which(filelist==MSfiles[i])
bin<-floor(xcmsObj@peaks[which(xcmsObj@peaks[,"sample"]==targ), "mz"])
binval<-by(xcmsObj@peaks[which(xcmsObj@peaks[,"sample"]==targ),"into"], bin, FUN=sum)
slots<-names(binval)
binval<-as.vector(binval)
binmat[i,slots]<-round(binval)
}
binmat[is.na(binmat)]<-0
binmat<-binmat[,which(colSums(binmat)>0)]
outpc<-pca(binmat, scale="pareto")
PC1<-as.vector(scale(outpc@scores[,1], center=TRUE, scale=TRUE))
PC2<-as.vector(scale(outpc@scores[,2], center=TRUE, scale=TRUE))
pvals1<-p.adjust(2*pnorm(abs(PC1), lower.tail=FALSE), method="BH")
pvals2<-p.adjust(2*pnorm(abs(PC2), lower.tail=FALSE), method="BH")
minpval<-pmin(pvals1, pvals2)
outliers<- which(minpval<pcut)
if(length(outliers)>0){
if(grepl("Xevo", ExpDes[[2]]["msinst",1]) & !is.null(ExpDes$instrument["CE2",1])) {
outliers<-c(MSfiles[outliers], sub(MStag, idMSMStag, MSfiles[outliers], ignore.case=TRUE))
}
keep<-vector(length=length(filelist)); keep[]<-"keep"
keep[which(match(filelist, outliers)>0)]<-"outlier"
ouliers<-split(xcmsObj, f=keep)$outlier
save(ouliers, file="datasets/xcmsPeaksPCOutliers.Rdata")
xcmsObj<-split(xcmsObj, f=keep)$keep
}
return(xcmsObj)
}
##setup sample information, experimental design, and check raw data files against seq file
dir.create("datasets")
dataset<-list.files(getwd(), pattern=filetype, recursive = FALSE, ignore.case=TRUE)
if(grepl("Waters", ExpDes[[2]]["msinst",1]) & (ExpDes$instrument[which(row.names(ExpDes$instrument) == "MSlevs"),1]==2)){
dataset<-dataset[union(grep(MStag, dataset, ignore.case=TRUE), grep(idMSMStag, dataset, ignore.case=TRUE) )]
}
if(grepl("Waters", ExpDes[[2]]["msinst",1]) & (ExpDes$instrument[which(row.names(ExpDes$instrument) == "MSlevs"),1]==1) &
!grepl("DsDA", ExpDes[[1]]["platform",1])){
dataset<-dataset[grep(MStag, dataset, ignore.case=TRUE)]
}
length(dataset)
seq<-read.csv(file=factorfile, header=TRUE, stringsAsFactors=FALSE)
files<-seq[,1]
if(grepl("thermo", ExpDes[[2]]["msinst",1], ignore.case=TRUE)) {
seq<-read.csv(file="seq.csv", header=TRUE, skip=seqskip, stringsAsFactors=FALSE)
#seq<-seq[order(seq[,1]),]
files<-paste(seq[,1], filetype, sep=".")
SampID<-paste(seq[,2], ExpDes$design["delim",1], "MS", sep="")
}
if((ExpDes$instrument["MSlevs",1]==2)) {
seq<-read.csv(file=factorfile, header=TRUE, stringsAsFactors=FALSE)
seq<-seq[order(seq[,1]),]
files<-c(paste(seq[,1], "01.", filetype, sep=""), paste(seq[,1], "02.", filetype, sep=""))
SampID<-as.factor(c(paste(seq[,2], ExpDes$design["delim",1], "MS", sep=""), paste(seq[,2], ExpDes$design["delim",1], "idMSMS", sep="")))
}
if(grepl("Waters", ExpDes[[2]]["msinst",1]) & (ExpDes$instrument["MSlevs",1]==1)) {
seq<-read.csv(file=factorfile, header=TRUE, stringsAsFactors=FALSE)
seq<-seq[order(seq[,1]),]
files<-seq[,1]
if(filetype == "cdf") {files<-paste(files, "01.", sep="")} else {files = paste(files, ".", sep="")}
files<-c(paste(files, filetype, sep=""))
SampID<-as.factor(c(paste(seq[,2], ExpDes$design["delim",1], "MS", sep="")))
}
# cat("dataset currently: ", '\n')
# cat(dataset)
# cat("dataset end: ", '\n')
dataset<-intersect(toupper(files), toupper(dataset))
missing<-setdiff(toupper(files), toupper(dataset))
if(length(missing)>0) {
cat("Files in", factorfile, "missing from working directory", '\n')
cat(missing)
stop()
}
##xcms section
if(!is.null(minTIC) & !is.numeric(minTIC)) stop("please choose a numeric value for 'minTIC'")
if(regroup) {
load("datasets/xcmsPeaks.Rdata")
if(grepl("Xevo", ExpDes$instrument["msinst",1])) {mzwid<-(300*ppmerr/1000000*4)} else {mzwid<-mzdiffquad}
if(!is.null(rem.files)) {
rem<-rep(0, length(xset@filepaths))
for(i in 1:length(rem.files)) {
rem[grep(rem.files[i], xset@filepaths)]<-1
}
xset<-xcms::split(xset, f=rem)[[1]]
}
} else {
if(grepl("Xevo", ExpDes$instrument["msinst",1]) & !is.null(ExpDes$instrument["CE2",1]) & !grepl("DsDA", ExpDes[[1]]["platform",1])) { ########for LC
xset<-xcmsSet(files, nSlaves=cores, method = "centWave",
ppm=ppmerr, peakwidth=c(minpw:maxpw), snthresh=snthresh, mzCenterFun="wMean",
integrate=2, mzdiff=(300*ppmerr/1000000*4), verbose.columns=TRUE, fitgauss=TRUE)
save(xset, file="datasets/xcmsPeaks.Rdata")
mzwid<-(300*ppmerr/1000000*4)
}
if(grepl("GC", ExpDes[[1]]["platform",1]) | grepl("GC", ExpDes[[2]]["chrominst",1]) ) { #######for GC
xset <- xcmsSet(dataset, nSlaves=cores, method = "matchedFilter", fwhm = fwhm,
max = 500, snthresh = snthresh, step = mzdiffquad, steps = 2, mzdiff = mzdiffquad,
index = FALSE, sleep = 0)
mzwid<-mzdiffquad
}
if(grepl("MICRO", ExpDes[[1]]["platform",1])) { #######for QTOFMicro data
mzdiffquad<-0.07
xset <- xcmsSet(dataset, nSlaves=cores, method = "matchedFilter", fwhm = fwhm,
max = 500, snthresh = snthresh, step = 0.1, steps = 2, mzdiff = mzdiffquad,
index = FALSE, sleep = 0)
mzwid<-mzdiffquad
}
if(grepl("TOF", ExpDes[[2]]["msinst",1]) & !grepl("MICRO", ExpDes[[1]]["platform",1]) & !grepl("Xevo", ExpDes$instrument["msinst",1]) ) { #######for QTOFMicro data
xset<-xcmsSet(files, nSlaves=cores, method = "centWave",
ppm=ppmerr, peakwidth=c(minpw:maxpw), snthresh=snthresh, mzCenterFun="wMean", integrate=2, mzdiff=(300*ppmerr/1000000*4), verbose.columns=TRUE, fitgauss=TRUE)
save(xset, file="datasets/xcmsPeaks.Rdata")
mzwid<-(300*ppmerr/1000000*4)
}
if(grepl("TOF", ExpDes[[2]]["msinst",1]) & grepl("DsDA", ExpDes[[1]]["platform",1]) ) { #######for QTOFMicro data
xset<-xcmsSet(files, nSlaves=cores, method = "centWave",
ppm=ppmerr, peakwidth=c(minpw:maxpw), snthresh=snthresh, mzCenterFun="wMean", integrate=2, mzdiff=(300*ppmerr/1000000*4), verbose.columns=TRUE, fitgauss=TRUE)
save(xset, file="datasets/xcmsPeaks.Rdata")
mzwid<-(300*ppmerr/1000000*4)
}
}
mzrange1<-round(range(xset@peaks[,c("mzmin", "mzmax")]))
xset@peaks[which(xset@peaks[,"mzmin"] < mzrange1[1]), "mzmin"]<-mzrange1[1]
xset@peaks[which(xset@peaks[,"mzmax"] > mzrange1[2]), "mzmax"]<-mzrange1[2]
xset@peaks[which(xset@peaks[,"mz"] < mzrange1[1]), "mz"]<-mzrange1[1]
xset@peaks[which(xset@peaks[,"mz"] > mzrange1[2]), "mz"]<-mzrange1[2]
if(!is.null(maxt)) {
orig<-xset@peaks
good<-which(orig[,"rt"]<maxt)
filt<-orig[good,]
xset@peaks<-filt
}
if(filtPeaks) {
orig<-xset@peaks
good<-which((orig[,"rtmax"]-orig[,"rtmin"])<(3*maxpw))
filt<-orig[good,]
xset@peaks<-filt
}
if(!is.null(minTIC) & is.numeric(minTIC)) xset<-minlTIC(xcmsObj=xset, mintic=minTIC)
if(outTIC) xset<-outlTIC(xcmsObj=xset)
if(outPCA) xset<-outlPCA(xcmsObj=xset)
xset <- group(xset, bw=bwpre, minfrac=minfrac, max = 50, mzwid=mzwid)
# save(xset, file="datasets/xcmsGroup1.Rdata")
if(!is.null(rtcor)) {
if(rtcor=="loess") {
xset <- retcor(xset, method="loess", family = "gaussian",
plottype = "mdevden", span=bwpre,
missing=round(length(dataset)*0.9, digits=0))
}
if(rtcor=="linear") {
xset <- retcor(xset, method="linear", family = "gaussian", plottype = "mdevden", span=bwpre, missing=round(length(dataset)*0.9, digits=0))
}
if(rtcor=="obiwarp") {
xset<-retcor.obiwarp(xset, profStep=2*mzwid, response=10)
}
if(rtcor=="nearest") {
xset<-group.nearest(xset, mzVsRTbalance=5, mzCheck=mzwid, rtCheck=maxpw*2)
}
# save(xset, file="datasets/xcmsRtCor.Rdata")
if(!is.null(rtcor)) xset <- group(xset, bw=bwpost, minfrac=minfrac, max= 100, mzwid=mzwid)
# save(xset, file="datasets/xcmsGroup2.Rdata")
}
xset <- fillPeaks.chrom(xset, nSlaves = cores)
if(grepl("Xevo", ExpDes$instrument["msinst",1]) & !is.null(ExpDes$instrument["CE2",1])) { ########for LC
kept<- gsub(MStag, "", basename(xset@filepaths), ignore.case=TRUE)
kept<- gsub(idMSMStag, "", kept, ignore.case=TRUE)
SampID<-vector(length=length(kept))
for(k in 1:length(SampID)) {
SampID[k]<-seq[which(toupper(kept[k]) == toupper(seq[,1])), 2]
}
xset@phenoData$class<-SampID
}
if(grepl("GC", ExpDes[[1]]["platform",1]) | grepl("GC", ExpDes[[2]]["chrominst",1]) ) { #######for GC
kept<- unlist(strsplit(basename(xset@filepaths), ".CDF"))
SampID<-vector(length=length(kept))
for(k in 1:length(SampID)) {
SampID[k]<-seq[which(toupper(kept[k]) == toupper(seq[,1])), 2]
}
xset@phenoData$class<-SampID
}
if(grepl("MICRO", ExpDes[[1]]["platform",1])) { #######for GC
kept<- gsub(MStag, "", basename(xset@filepaths), ignore.case=TRUE)
SampID<-vector(length=length(kept))
for(k in 1:length(SampID)) {
SampID[k]<-seq[which(toupper(kept[k]) == toupper(seq[,1])), 2]
}
xset@phenoData$class<-SampID
}
save(xset, file="datasets/xcmsFillPeaks.Rdata")
return(xset)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.