R/pmf-xcms.R

Defines functions pmfxcms

Documented in pmfxcms

#' pmfxcms
#'
#' A wrapper function for XCMS using common PMF platforms. 
#' @details This function uses a template file as input to guide XCMS parameter selection.  
#' @details Peak detection using centwave (LC-TOF) or matchedFilter (GC-Quad), density based peak grouping, loess rt correction, regrouping, and fillPeaks. 
#' @details Additionally performs unsupervised removal of ultrawide chromatgoraphic peaks and outlier detection and removal after peak detection step.
#' @details an XCMS object is returned.  
#' 
#' @param ExpDes R object generated by RAMClustR::defineExperiment function.
#' @param MStag Character: "01.cdf" the text string used to recognize xmcs files as MS (low collision energy) when using indiscriminant MS/MS (MSE, AIF, ...)
#' @param idMSMStag Character: "02.cdf",the text string used to recognize xmcs files as MS/MS (high collision energy) when using indiscriminant MS/MS (MSE, AIF, ...)
#' @param filetype Character "cdf",  The file extension - will be appended to file names in the csv 'factor file'.  case insensitive.
#' @param factorfile Character "seq.csv",  The csv file containing filename (column 1) and sample name (column 2),  Sample names can contain a delimitor separating factors, but all samples must contain the same number of factors (delimitors)
#' @param cores Integer default = 4, how many computer cores to use in processing 
#' @param minpw Numerica (3) minimum peak width in seconds
#' @param maxpw Numeric (30) maximum peak width in seconds
#' @param filtPeaks Logical - TRUE.  Should a filtering step be performed after peak detection to remove peaks out of minpw-maxpw range?  
#' @param ppmerr Numeric (35).  If CentWave is used, what ppm error setting is used for feature detection?
#' @param minTIC Numeric (10).  If outTIC = TRUE, total ion signal minTIC fold lower than the median total ion signal are removed.
#' @param outTIC Logical (TRUE). Should we remove injections based on total ion signal following peak detection? Uses outlier detection based on the distribution of signal values across samples and a cutoff defined by minTIC (on the low end of the distribution only)  
#' @param outPCA Logical (TRUE). Should we remove injections based on binned mass distribution after peak detection? Uses outlier detection on PC1 scores.
#' @param pcut Numeric (0.05).  p.value cutoff used to detect outliers.  p values optionall corrected by p.adj (benhamini hochberg by default)
#' @param p.adj character ("BH"). what pvalue false discovery rate correction to use on outlier detection steps.  see ?p.adjust for options.  
#' @param snthresh numeric (10). signal to noise threshold used for feature detection (centWave or matchedFilter)
#' @param mzdiffquad (0.5) mz bin size used for matchedFilter algorithm. 
#' @param bwpre numeric (3).  bw parameter used in the feature grouping (correspondence) step BEFORE retention time adjustement
#' @param bwpost numeric (1.5).  bw parameter used in the feature grouping (correspondence) step AFTER retention time adjustment.  If rt adjustment works well, we should be able to use a lower value than bwpre.
#' @param minfrac numeric (0.15).  minimum fraction of samples a feature must be found in (of full dataset) to be passed to final sample set.
#' @param fwhm numeric (3).  peak width at half height, used in matchedFilter algorithm.
#' @param maxt numeric (NULL).  retention time in seconds - peaks with retention times greater than maxt are excluded. 
#' @param step ?? (NULL).
#' @param rtcor character("loess"). one of NULL, "loess", "linear", "obiwarp", "nearest" NULL
#' @param seqskip integer (0).  Number of lines from top of factorfile to skip before starting.  rarely used. 
#' @param regroup logical (FALSE).  If true, try to load saved xcms object from 'datasets/' directory in working directory, skipping peak detection.  
#' @param rem.files vector of integer values. if regroup == TRUE, optional vector of sample numbers to remove.

#' @return returns an xcms object
#' @concept RAMClustR
#' @author Corey Broeckling

#' @export 
pmfxcms<-function(
  ExpDes = NULL,
  MStag="01.cdf",
  idMSMStag="02.cdf",
  filetype="cdf",
  factorfile="seq.csv",
  cores=4,
  minpw=3,
  maxpw=30,
  filtPeaks=TRUE,
  ppmerr=35,
  minTIC=10,
  outTIC=TRUE,
  outPCA=TRUE,
  pcut=0.05,
  p.adj="BH",
  snthresh=10,
  mzdiffquad=0.5,
  bwpre=3,
  bwpost=1,
  minfrac=0.15,
  fwhm=3,
  maxt=NULL,
  step=NULL,
  rtcor="loess",  		#one of NULL, "loess", "linear", "obiwarp", "nearest" NULL
  seqskip=0,
  regroup=FALSE,
  rem.files=NULL
) {
  
  require(xcms)
  require(pcaMethods)
  
  ## INTERNAL FUNCTIONS
  minlTIC<-function(xcmsObj=NULL,
                    mintic=minTIC) {
    filelist<-vector()
    for(i in 1:length(xcmsObj@filepaths)) {
      fpv<-unlist(strsplit(xcmsObj@filepaths[i], "[/]"))
      filelist<-c(filelist, fpv[length(fpv)] )
    }
    TIC<-aggregate(xcmsObj@peaks[,"into"], by=list(xcmsObj@peaks[,"sample"]), FUN="sum")[,2]
    if(grepl("Xevo", ExpDes$instrument["msinst",1]) & !is.null(ExpDes$instrument["CE2",1]))  {
      MSfiles<-filelist[grep(MStag, filelist, ignore.case=TRUE)]
      MSMSfiles<-filelist[grep(idMSMStag, filelist, ignore.case=TRUE)]
      TIC<-TIC[grep(MStag, filelist, ignore.case=TRUE)]}
    if(grepl("GC", ExpDes[[1]]["platform",1])){
      MSfiles<-filelist
    }
    
    outliers<- which(TIC<(median(TIC)/mintic))
    if(length(outliers)>0){
      if(is.null(MStag)==FALSE) {
        outliers<-c(MSfiles[outliers], sub(MStag, idMSMStag, MSfiles[outliers], ignore.case=TRUE))
      }
      keep<-vector(length=length(filelist)); keep[]<-"keep"
      keep[which(match(filelist, outliers)>0)]<-"outlier"
      outliers<-split(xcmsObj, f=keep)$outlier
      save(outliers, file="datasets/xcmsPeaks_minTICOutliers.Rdata")
      xcmsObj<-split(xcmsObj, f=keep)$keep
    }
    return(xcmsObj)[[1]]
  }
  
  outlTIC<-function(xcmsObj=NULL) {
    filelist<-vector()
    for(i in 1:length(xcmsObj@filepaths)) {
      fpv<-unlist(strsplit(xcmsObj@filepaths[i], "[/]"))
      filelist<-c(filelist, fpv[length(fpv)] )
    }
    TIC<-aggregate(xcmsObj@peaks[,"into"], by=list(xcmsObj@peaks[,"sample"]), FUN="sum")[,2]
    if(grepl("Xevo", ExpDes$instrument["msinst",1]) & !is.null(ExpDes$instrument["CE2",1]))  {
      MSfiles<-filelist[grep(MStag, filelist, ignore.case=TRUE)]
      MSMSfiles<-filelist[grep(idMSMStag, filelist, ignore.case=TRUE)]
      TIC<-TIC[grep(MStag, filelist, ignore.case=TRUE)]}
    if(grepl("GC", ExpDes[[1]]["platform",1])){
      MSfiles<-filelist
    }
    TIC<-as.vector(scale(TIC, center=TRUE, scale=TRUE))
    pvals<-p.adjust(2*pnorm(abs(TIC), lower.tail=FALSE), method="BH")
    outliers<- which(pvals<pcut)
    if(length(outliers)>0){
      if(is.null(MStag)==FALSE) {
        outliers<-c(MSfiles[outliers], sub(MStag, idMSMStag, MSfiles[outliers], ignore.case=TRUE))
      }
      keep<-vector(length=length(filelist)); keep[]<-"keep"
      keep[which(match(filelist, outliers)>0)]<-"outlier"
      outliers<-split(xcmsObj, f=keep)$outlier
      save(outliers, file="datasets/xcmsPeaksTICOutliers.Rdata")
      xcmsObj<-split(xcmsObj, f=keep)$keep
    }
    return(xcmsObj)[[1]]
  }
  outlPCA<-function(
    xcmsObj=xset
  ){
    filelist<-vector() 
    for(i in 1:length(xcmsObj@filepaths)) {
      fpv<-unlist(strsplit(xcmsObj@filepaths[i], "[/]"))
      filelist<-c(filelist, fpv[length(fpv)] )
    }
    
    
    MSMSfiles<-filelist[grep(idMSMStag, filelist, ignore.case=TRUE)]
    if(length(MSMSfiles) > 0 & ExpDes$instrument[which(row.names(ExpDes$instrument) == "MSlevs"),1] == 2) {
      MSfiles<-filelist[grep(MStag, filelist, ignore.case=TRUE)]
    } else {
      MSfiles<-filelist
    }
    maxmz<-ceiling(max(xcmsObj@peaks[,"mz"])) + 2
    minmz<-floor(min(xcmsObj@peaks[,"mz"])) - 2
    range<-maxmz-minmz
    binmat<-matrix(nrow=length(MSfiles), ncol=range)
    colnames(binmat)<-as.character(minmz:(maxmz-1))
    for(i in 1:length(MSfiles)){
      targ<-which(filelist==MSfiles[i])
      bin<-floor(xcmsObj@peaks[which(xcmsObj@peaks[,"sample"]==targ), "mz"])
      binval<-by(xcmsObj@peaks[which(xcmsObj@peaks[,"sample"]==targ),"into"], bin, FUN=sum)
      slots<-names(binval)
      binval<-as.vector(binval)
      binmat[i,slots]<-round(binval)
    }
    binmat[is.na(binmat)]<-0
    binmat<-binmat[,which(colSums(binmat)>0)]
    outpc<-pca(binmat, scale="pareto")
    PC1<-as.vector(scale(outpc@scores[,1], center=TRUE, scale=TRUE))
    PC2<-as.vector(scale(outpc@scores[,2], center=TRUE, scale=TRUE))
    pvals1<-p.adjust(2*pnorm(abs(PC1), lower.tail=FALSE), method="BH")
    pvals2<-p.adjust(2*pnorm(abs(PC2), lower.tail=FALSE), method="BH")
    minpval<-pmin(pvals1, pvals2)
    outliers<- which(minpval<pcut)
    if(length(outliers)>0){
      if(grepl("Xevo", ExpDes[[2]]["msinst",1]) & !is.null(ExpDes$instrument["CE2",1]))   {
        outliers<-c(MSfiles[outliers], sub(MStag, idMSMStag, MSfiles[outliers], ignore.case=TRUE))
      }
      keep<-vector(length=length(filelist)); keep[]<-"keep"
      keep[which(match(filelist, outliers)>0)]<-"outlier"
      ouliers<-split(xcmsObj, f=keep)$outlier
      save(ouliers, file="datasets/xcmsPeaksPCOutliers.Rdata")
      xcmsObj<-split(xcmsObj, f=keep)$keep
    }
    return(xcmsObj)
  }
  
  
  
  ##setup sample information, experimental design, and check raw data files against seq file  
  dir.create("datasets")
  dataset<-list.files(getwd(), pattern=filetype, recursive = FALSE, ignore.case=TRUE)
  if(grepl("Waters", ExpDes[[2]]["msinst",1]) & (ExpDes$instrument[which(row.names(ExpDes$instrument) == "MSlevs"),1]==2)){
    dataset<-dataset[union(grep(MStag, dataset, ignore.case=TRUE), grep(idMSMStag, dataset, ignore.case=TRUE) )]
  }
  if(grepl("Waters", ExpDes[[2]]["msinst",1]) & (ExpDes$instrument[which(row.names(ExpDes$instrument) == "MSlevs"),1]==1) & 
     !grepl("DsDA", ExpDes[[1]]["platform",1])){
    dataset<-dataset[grep(MStag, dataset, ignore.case=TRUE)]
  }
  
  
  length(dataset)
  seq<-read.csv(file=factorfile, header=TRUE, stringsAsFactors=FALSE)
  files<-seq[,1]
  if(grepl("thermo", ExpDes[[2]]["msinst",1], ignore.case=TRUE)) {
    seq<-read.csv(file="seq.csv", header=TRUE, skip=seqskip, stringsAsFactors=FALSE)
    #seq<-seq[order(seq[,1]),]
    files<-paste(seq[,1], filetype, sep=".")
    SampID<-paste(seq[,2], ExpDes$design["delim",1], "MS", sep="")
  }
  if((ExpDes$instrument["MSlevs",1]==2)) {
    seq<-read.csv(file=factorfile, header=TRUE, stringsAsFactors=FALSE)
    seq<-seq[order(seq[,1]),]
    files<-c(paste(seq[,1], "01.", filetype, sep=""), paste(seq[,1], "02.", filetype, sep=""))
    SampID<-as.factor(c(paste(seq[,2], ExpDes$design["delim",1], "MS", sep=""), paste(seq[,2], ExpDes$design["delim",1], "idMSMS", sep="")))
  }
  if(grepl("Waters", ExpDes[[2]]["msinst",1]) & (ExpDes$instrument["MSlevs",1]==1)) {
    seq<-read.csv(file=factorfile, header=TRUE, stringsAsFactors=FALSE)
    seq<-seq[order(seq[,1]),]
    files<-seq[,1]
    if(filetype == "cdf") {files<-paste(files, "01.", sep="")} else {files = paste(files, ".", sep="")}
    files<-c(paste(files, filetype, sep=""))
    SampID<-as.factor(c(paste(seq[,2], ExpDes$design["delim",1], "MS", sep="")))
  }
  # cat("dataset currently: ", '\n')
  # cat(dataset)
  # cat("dataset end: ", '\n')
  dataset<-intersect(toupper(files), toupper(dataset))
  missing<-setdiff(toupper(files), toupper(dataset))
  if(length(missing)>0) {
    cat("Files in", factorfile, "missing from working directory", '\n')
    cat(missing)
    stop()   
  }
  
  ##xcms section
  
  if(!is.null(minTIC) & !is.numeric(minTIC)) stop("please choose a numeric value for 'minTIC'")
  if(regroup) {
    load("datasets/xcmsPeaks.Rdata")
    if(grepl("Xevo", ExpDes$instrument["msinst",1])) {mzwid<-(300*ppmerr/1000000*4)} else {mzwid<-mzdiffquad}
    if(!is.null(rem.files)) {
      rem<-rep(0, length(xset@filepaths))
      for(i in 1:length(rem.files)) {
        rem[grep(rem.files[i], xset@filepaths)]<-1
      }
      xset<-xcms::split(xset, f=rem)[[1]]
    }
  } else {
    if(grepl("Xevo", ExpDes$instrument["msinst",1]) & !is.null(ExpDes$instrument["CE2",1]) & !grepl("DsDA", ExpDes[[1]]["platform",1])) {							########for LC
      xset<-xcmsSet(files, nSlaves=cores, method = "centWave",
                    ppm=ppmerr, peakwidth=c(minpw:maxpw), snthresh=snthresh, mzCenterFun="wMean", 
                    integrate=2, mzdiff=(300*ppmerr/1000000*4), verbose.columns=TRUE, fitgauss=TRUE)
      save(xset, file="datasets/xcmsPeaks.Rdata")
      mzwid<-(300*ppmerr/1000000*4)
    }
    
    if(grepl("GC", ExpDes[[1]]["platform",1]) | grepl("GC", ExpDes[[2]]["chrominst",1]) ) {															#######for GC
      xset <- xcmsSet(dataset, nSlaves=cores, method = "matchedFilter", fwhm = fwhm,
                      max = 500, snthresh = snthresh, step = mzdiffquad, steps = 2, mzdiff = mzdiffquad, 
                      index = FALSE, sleep = 0)
      mzwid<-mzdiffquad
    }
    if(grepl("MICRO", ExpDes[[1]]["platform",1])) {														#######for QTOFMicro data
      mzdiffquad<-0.07
      xset <- xcmsSet(dataset, nSlaves=cores, method = "matchedFilter", fwhm = fwhm, 
                      max = 500, snthresh = snthresh, step = 0.1, steps = 2, mzdiff = mzdiffquad, 
                      index = FALSE, sleep = 0)
      mzwid<-mzdiffquad
    }
    if(grepl("TOF", ExpDes[[2]]["msinst",1]) & !grepl("MICRO", ExpDes[[1]]["platform",1]) & !grepl("Xevo", ExpDes$instrument["msinst",1])  ) {														#######for QTOFMicro data
      xset<-xcmsSet(files, nSlaves=cores, method = "centWave", 
                    ppm=ppmerr, peakwidth=c(minpw:maxpw), snthresh=snthresh, mzCenterFun="wMean", integrate=2, mzdiff=(300*ppmerr/1000000*4), verbose.columns=TRUE, fitgauss=TRUE)
      save(xset, file="datasets/xcmsPeaks.Rdata")
      mzwid<-(300*ppmerr/1000000*4)
    }
    if(grepl("TOF", ExpDes[[2]]["msinst",1]) & grepl("DsDA", ExpDes[[1]]["platform",1])  ) {														#######for QTOFMicro data
      xset<-xcmsSet(files, nSlaves=cores, method = "centWave", 
                    ppm=ppmerr, peakwidth=c(minpw:maxpw), snthresh=snthresh, mzCenterFun="wMean", integrate=2, mzdiff=(300*ppmerr/1000000*4), verbose.columns=TRUE, fitgauss=TRUE)
      save(xset, file="datasets/xcmsPeaks.Rdata")
      mzwid<-(300*ppmerr/1000000*4)
    }
  }
  
  mzrange1<-round(range(xset@peaks[,c("mzmin", "mzmax")]))
  xset@peaks[which(xset@peaks[,"mzmin"] < mzrange1[1]), "mzmin"]<-mzrange1[1]
  xset@peaks[which(xset@peaks[,"mzmax"] > mzrange1[2]), "mzmax"]<-mzrange1[2]
  xset@peaks[which(xset@peaks[,"mz"] < mzrange1[1]), "mz"]<-mzrange1[1]
  xset@peaks[which(xset@peaks[,"mz"] > mzrange1[2]), "mz"]<-mzrange1[2]
  
  if(!is.null(maxt)) {
    orig<-xset@peaks
    good<-which(orig[,"rt"]<maxt)
    filt<-orig[good,]
    xset@peaks<-filt 
  }
  if(filtPeaks) {
    orig<-xset@peaks
    good<-which((orig[,"rtmax"]-orig[,"rtmin"])<(3*maxpw))
    filt<-orig[good,]
    xset@peaks<-filt
  }
  if(!is.null(minTIC) & is.numeric(minTIC)) xset<-minlTIC(xcmsObj=xset, mintic=minTIC)
  if(outTIC)  xset<-outlTIC(xcmsObj=xset)
  if(outPCA)  xset<-outlPCA(xcmsObj=xset)
  xset <- group(xset, bw=bwpre, minfrac=minfrac, max = 50, mzwid=mzwid)
  # save(xset, file="datasets/xcmsGroup1.Rdata")
  if(!is.null(rtcor)) {
    if(rtcor=="loess") { 
      xset <- retcor(xset,  method="loess", family = "gaussian", 
                     plottype = "mdevden", span=bwpre, 
                     missing=round(length(dataset)*0.9, digits=0)) 
    }
    if(rtcor=="linear") { 
      xset <- retcor(xset,  method="linear", family = "gaussian", plottype = "mdevden", span=bwpre, missing=round(length(dataset)*0.9, digits=0)) 
    }
    if(rtcor=="obiwarp") {
      xset<-retcor.obiwarp(xset, profStep=2*mzwid, response=10)
    }
    if(rtcor=="nearest") {
      xset<-group.nearest(xset, mzVsRTbalance=5, mzCheck=mzwid, rtCheck=maxpw*2)
    }
    
    # save(xset, file="datasets/xcmsRtCor.Rdata")
    if(!is.null(rtcor)) xset <- group(xset, bw=bwpost, minfrac=minfrac, max= 100, mzwid=mzwid)
    # save(xset, file="datasets/xcmsGroup2.Rdata")
  }
  xset <- fillPeaks.chrom(xset, nSlaves = cores)
  
  if(grepl("Xevo", ExpDes$instrument["msinst",1]) & !is.null(ExpDes$instrument["CE2",1]))  {         		########for LC
    kept<- gsub(MStag, "", basename(xset@filepaths), ignore.case=TRUE)
    kept<- gsub(idMSMStag, "", kept, ignore.case=TRUE)
    SampID<-vector(length=length(kept))
    for(k in 1:length(SampID)) {
      SampID[k]<-seq[which(toupper(kept[k]) == toupper(seq[,1])), 2]
    }
    xset@phenoData$class<-SampID
  }
  if(grepl("GC", ExpDes[[1]]["platform",1]) | grepl("GC", ExpDes[[2]]["chrominst",1]) ) {    										#######for GC
    kept<- unlist(strsplit(basename(xset@filepaths), ".CDF"))
    SampID<-vector(length=length(kept))
    for(k in 1:length(SampID)) {
      SampID[k]<-seq[which(toupper(kept[k]) == toupper(seq[,1])), 2]
    }
    xset@phenoData$class<-SampID
  }
  if(grepl("MICRO", ExpDes[[1]]["platform",1])) {    										#######for GC
    kept<- gsub(MStag, "", basename(xset@filepaths), ignore.case=TRUE)
    SampID<-vector(length=length(kept))
    for(k in 1:length(SampID)) {
      SampID[k]<-seq[which(toupper(kept[k]) == toupper(seq[,1])), 2]
    }
    xset@phenoData$class<-SampID
  }
  
  save(xset, file="datasets/xcmsFillPeaks.Rdata")
  return(xset)
}
cbroeckl/csu.pmf.tools documentation built on Jan. 26, 2024, 6:27 p.m.