R/pmf-xcms_new.R

Defines functions pmfxcms.2

Documented in pmfxcms.2

#' pmfxcms
#'
#' A wrapper function for XCMS using common PMF platforms. 
#' @details This function uses a template file as input to guide XCMS parameter selection.  
#' @details Peak detection using centwave (LC-TOF) or matchedFilter (GC-Quad), density based peak grouping, loess rt correction, regrouping, and fillPeaks. 
#' @details Additionally performs unsupervised removal of ultrawide chromatgoraphic peaks and outlier detection and removal after peak detection step.
#' @details an XCMS object is returned.  
#' 
#' @param filetype.ms1 File extension of raw data for MS level data.  This extention will be scanned in all raw data files in working directory and will be added to filenames in sequence .csv file
#' @param filetype.ms2 File extension of raw data for MSe level data.  If not MSe data, set to NA.  This extention will be scanned in all raw data files in working directory and will be added to filenames in sequence .csv file
#' @param sequence .csv file containing filenames (no extension) in first column, and sample name and/or factor names in remaining columns.
#' @param delim symbol used to delimit factors in sequence file.  "-" by default.  
#' @param ms.res numerical value representing mass resolution of the MS system.  if NA, assumed to be quad data (low res). if numeric, used to set mass tolerances for xcms.
#' @param peak.width numerical vector of length two giving peak width range in seconds i.e. c(3, 30)
#' @param sn signal to noise threshold for peak detection
#' @param minfrac minimum proportion of samples in which a feature must be present (at one MS level). i.e. 0.4 is 4 out of 10.
#' @param bw.pre bandwidth for pre-rt correction grouping.  suggested value is 3, but if there is a good deal of retention drift over the sample set a larger value should be used. 
#' @param bw.post bandwidth for post-rt correction grouping.  suggested values is 1.5.  
#' @param reprocess filepath/name - should we try reprocessing an existing xcmsSet?  provide file name.  Will reprocess all steps after peak detection. 
#' @return returns an xcms object
#' @concept RAMClustR xcms
#' @author Corey Broeckling

#' @export 
pmfxcms.2<-function(
  filetype.ms1 = "_01.mzML",
  filetype.ms2 = "_02.mzML",
  sequence = 'seq.csv',
  delim = "-",
  ms.res = 30000,
  peak.width = c(3,30),
  sn = 5,
  minfrac = 0.4,
  bw.pre = 3,
  bw.post = 1.2,
  n.cores = 4,
  reprocess = NULL
) {
  
  require(xcms)
  #  require(pcaMethods)
  
  ##setup sample information, experimental design, and check raw data files against seq file  
  
  dir.create("datasets")
  dataset<-list.files(getwd(), pattern=filetype.ms1, recursive = FALSE, ignore.case=TRUE)
  if(length(dataset) == 0) {
    stop("no .ms1 data files found")
  }
  if(!is.na(filetype.ms2)) {
    dataset.2 <- list.files(getwd(), pattern=filetype.ms2, recursive = FALSE, ignore.case=TRUE)
    if(length(dataset.2) == 0) {
      stop("no .ms1 data files found. If this is not MSe data, set filetype.ms2 = NA")
    }
    dataset <- c(dataset, dataset.2)
  }
  
  seq <- read.csv(sequence, header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
  
  # remove columns with no data, and remove leading and trailing whitespace
  r <- sapply(1:ncol(seq), FUN = function(x) {
    tmp <- which(is.na(seq[,x]))
    length(tmp)
  })
  r <- which(r == dim(seq)[[1]])
  if(length(r) > 0) {
    seq <- seq[,-r]
  }
  
  if(ncol(seq) < 2) {
    stop("sequence must contain at least two columns", '\n')
  }
  
  for(i in 1:ncol(seq)) {
    seq[,i] <- trimws(seq[,i])
  }
  
  ## ensure all files are present
  files <- paste0(seq[,1], filetype.ms1)
  if(!is.na(filetype.ms2)) {
    files <- c(files, paste0(seq[,1], filetype.ms2))
  }
  
  missing <- which(!files %in% dataset)
  if(length(missing)>0) {
    cat("missing files include: ", '\n')
    for(i in 1:length(missing)) {
      cat("  ", files[missing[i]], '\n')
    }
    stop("please correct missing file issue by either changing the sequence file or the filetype option", '\n')
  }
  
  ## if factors are provided in multiple columns, concatenate to single sample names
  if(ncol(seq) == 2) {
    sample.names <- seq[,2]
    factor.names <- names(seq)[2]
  } else {
    sample.names <- sapply(1:nrow(seq), FUN = function(x) {
      paste0(seq[x, 2:ncol(seq)], collapse = delim)
    })
    factor.names <- paste0(names(seq)[2:ncol(seq)], collapse = delim)
  }
  
  
  
  ########  xcms section
  
  ## generate OnDiskMSnExp MSn experiment
  ## using pd directly causes error
  if(is.na(filetype.ms2)) {
    pheno.data <- data.frame(sample.names)
  } else {
    pheno.data <- data.frame(c(sample.names, sample.names))
  }
  raw_data <- readMSData(files = files, 
                         pdata = new("NAnnotatedDataFrame", data.frame(sample.names)),
                         mode = "onDisk"
  )
  
  
  
  ## consider removing lockmass here so we do not have to do so in Pwiz
  
  ## which type of MS/MS data do we have (if any)?
  ## some basic data structure summary stats
  
  mse <- FALSE
  dda <- FALSE
  sonar <- FALSE
  hdmse <- FALSE
  swath <- FALSE
  
  h <- header(raw_data)
  mslevs <- table(h$msLevel)
  ms3 <- mslevs['3']; if(is.na(ms3)) {ms3 <- 0}
  ms2 <- mslevs['2']; if(is.na(ms2)) {ms2 <- 0}
  ms1 <- mslevs['1']; if(is.na(ms1)) {ms1 <- 1}  ## added if.is na to fix issue
  
  is.ms2 <- which(h$msLevel == 2)
  is.ms1 <- which(h$msLevel == 1)
  
  file.1 <- which(h[,"fileIdx"] == 1)
  ms1.time.range <- range(h[intersect(file.1, is.ms1),"retentionTime"])
  ms1.scan.interval <- (ms1.time.range[2] - ms1.time.range[1])/length(intersect(file.1, is.ms1))
  ms.fwhm <- round(550/ms.res, digits = 3)
  ppm <- round(1.5*(1000000*ms.fwhm)/(550*2.355))
  
  if(!is.na(ms2)) {
    set.masses <- unique(round(h$precursorMZ[is.ms2]))
  } else {set.masses <- NA}
  
  ## mse rules
  if(
    round(ms2/ms1, digits = 2) == 1
  ) {
    mse <- TRUE
  }
  
  ## dda/dsda rules
  if(
    !mse & 
    ms2 > 0 & 
    length(set.masses) > 205
  ) {
    dda <- TRUE
  }
  
  
  
  ### XCMS feature detection
  ## trick XCMS time
  orig.msLevel <- raw_data@featureData@data$msLevel
  raw_data@featureData@data$msLevel <- rep(1, length(orig.msLevel))
  
  ## set up parallel parameters
  mcpar <- SnowParam(workers = n.cores, type = "SOCK")
  
  if(is.na(ms.res)) {
    stop("not set up for matched.filter just yet", '\n')
  } else {
    ## centwave peak detection
    ## start XCMS processing: 
    ## peak detection with centWave
    cwp <- CentWaveParam(peakwidth = peak.width, 
                         ppm = ppm,
                         snthresh = sn,
                         mzdiff = ms.fwhm,
                         fitgauss = TRUE,
                         verboseColumns = TRUE)
    if(is.null(reprocess)) {
      xdata <- findChromPeaks(raw_data, param = cwp, msLevel = 1, BPPARAM = mcpar)
      save(xdata, file = "datasets/xcms.1.Rdata")
    } else {
      if(is.logical(reprocess)) {stop("reprocess should be a file name, not a logical statement, '\n'")}
      if(!file.exists(reprocess)) {stop("file", reprocess, "does not exist", '\n')}
      tmp <- load(reprocess)
      xdata <- get(tmp)
    }
    
    # if(mse) {
    #   xdata <- findChromPeaks(xdata, param = cwp, msLevel = 2, add = TRUE)
    # }
    # head(chromPeaks(xdata)) 
  }
  # table(orig.msLevel)
  # raw_data@featureData@data$msLevel <- rep(1, length(orig.msLevel))
  
  ## consider additionally defining sample groups based on experimental design factor levels
  if(is.na(filetype.ms2)) {
    sample.groups <- rep(1, length(xdata@.processHistory[[1]]@fileIndex))
  } else {
    sample.groups <- c(
      rep(1, length(xdata@.processHistory[[1]]@fileIndex)/2),
      rep(2, length(xdata@.processHistory[[1]]@fileIndex)/2)
    )
  }


  
  
  
  ### XCMS feature grouping, pre-RT correction
  
  pdp <- PeakDensityParam(sampleGroups = sample.groups, binSize = ms.fwhm,
                          minFraction = minfrac, bw = bw.pre)
  xdata <- groupChromPeaks(xdata, param = pdp) 
  save(xdata, file = "datasets/xcms.2.Rdata")
  
  
  
  ### XCMS retention time adjustment, peak density
  
  pgp <- PeakGroupsParam(
    minFraction = max(0.5, minfrac)
  )
  xdata <- adjustRtime(xdata, param = pgp) 
  save(xdata, file = "datasets/xcms.3.Rdata")
  
  ### XCMS feature grouping, pre-RT correction
  
  pdp <- PeakDensityParam(sampleGroups = sample.groups, binSize = ms.fwhm,
                          minFraction = minfrac, bw = bw.post)
  xdata <- groupChromPeaks(xdata, param = pdp) 
  save(xdata, file = "datasets/xcms.4.Rdata")
  
  ### XCMS fillPeaks
  
  fpp <- FillChromPeaksParam(expandMz = 0, expandRt = 0, ppm = 0)
  xdata <- fillChromPeaks(xdata, param = fpp, BPPARAM = mcpar)
  
  raw_data@featureData@data$msLevel <- orig.msLevel
  
  save(xdata, file="datasets/xcms.5.Rdata")
  return(xdata)
}




#' 
#' 
#' 
#' 
#' 
#' #' pmfxcms
#' #'
#' #' A wrapper function for XCMS using common PMF platforms. 
#' #' @details This function uses a template file as input to guide XCMS parameter selection.  
#' #' @details Peak detection using centwave (LC-TOF) or matchedFilter (GC-Quad), density based peak grouping, loess rt correction, regrouping, and fillPeaks. 
#' #' @details Additionally performs unsupervised removal of ultrawide chromatgoraphic peaks and outlier detection and removal after peak detection step.
#' #' @details an XCMS object is returned.  
#' #' 
#' #' @param ExpDes R object generated by RAMClustR::defineExperiment function.
#' #' @param filetype File extension of raw data.  This extention will be scanned in all raw data files in working directory and will be added to filenames in sequence .csv file
#' #' @param sequence .csv file containing filenames (no extension) in first column, and sample name and/or factor names in remaining columns.
#' #' @param delim symbol used to delimit factors in sequence file.  "-" by default.  
#' #' @param ms.res numerical value representing mass resolution of the MS system.  if NA, assumed to be quad data (low res). if numeric, used to set mass tolerances for xcms.
#' #' @param peak.width numerical vector of length two giving peak width range in seconds i.e. c(3, 30)
#' #' @param sn signal to noise threshold for peak detection
#' #' @param minfrac minimum proportion of samples in which a feature must be present (at one MS level). i.e. 0.4 is 4 out of 10.
#' #' @param bw.pre bandwidth for pre-rt correction grouping.  suggested value is 3, but if there is a good deal of retention drift over the sample set a larger value should be used. 
#' #' @param bw.post bandwidth for post-rt correction grouping.  suggested values is 1.5.  
#' #' @return returns an xcms object
#' #' @concept RAMClustR xcms
#' #' @author Corey Broeckling
#' 
#' #' @export 
#' pmfxcms.2<-function(
#'   ExpDes = NULL,
#'   filetype = "_012.mzML",
#'   sequence = 'seq.csv',
#'   delim = "-",
#'   ms.res = 30000,
#'   peak.width = c(3,30),
#'   sn = 10,
#'   minfrac = 0.4,
#'   bw.pre = 3,
#'   bw.post = 1.5
#' ) {
#'   
#'   require(xcms)
#'   #  require(pcaMethods)
#'   
#'   ##setup sample information, experimental design, and check raw data files against seq file  
#'   dir.create("datasets")
#'   dataset<-list.files(getwd(), pattern=filetype, recursive = FALSE, ignore.case=TRUE)
#'   
#'   seq <- read.csv(sequence, header = TRUE, check.names = FALSE, stringsAsFactors = FALSE)
#'   
#'   # remove columns with no data, and remove leading and trailing whitespace
#'   r <- sapply(1:ncol(seq), FUN = function(x) {
#'     tmp <- which(is.na(seq[,x]))
#'     length(tmp)
#'   })
#'   r <- which(r == dim(seq)[[1]])
#'   if(length(r) > 0) {
#'     seq <- seq[,-r]
#'   }
#'   
#'   if(ncol(seq) < 2) {
#'     stop("sequence must contain at least two columns", '\n')
#'   }
#'   
#'   for(i in 1:ncol(seq)) {
#'     seq[,i] <- trimws(seq[,i])
#'   }
#'   
#'   ## ensure all files are present
#'   files <- paste0(seq[,1], filetype)
#'   missing <- which(!files %in% dataset)
#'   if(length(missing)>0) {
#'     cat("missing files include: ", '\n')
#'     for(i in 1:length(missing)) {
#'       cat("  ", files[missing[i]], '\n')
#'     }
#'     stop("please correct missing file issue by either changing the sequence file or the filetype option", '\n')
#'   }
#'   
#'   ## if factors are provided in multiple columns, concatenate to single sample names
#'   if(ncol(seq) == 2) {
#'     sample.names <- seq[,2]
#'     factor.names <- names(seq)[2]
#'   } else {
#'     sample.names <- sapply(1:nrow(seq), FUN = function(x) {
#'       paste0(seq[x, 2:ncol(seq)], collapse = delim)
#'     })
#'     factor.names <- paste0(names(seq)[2:ncol(seq)], collapse = delim)
#'   }
#'   
#'   
#'   
#'   ########  xcms section
#'   
#'   ## generate OnDiskMSnExp MSn experiment
#'   ## using pd directly causes error
#'   raw_data <- readMSData(files = files, 
#'                          pdata = new("NAnnotatedDataFrame", data.frame(sample.names)),
#'                          mode = "onDisk"
#'   )
#'   
#'   ## consider removing lockmass here so we do not have to do so in Pwiz
#'   
#'   ## which type of MS/MS data do we have (if any)?
#'   ## some basic data structure summary stats
#'   
#'   mse <- FALSE
#'   dda <- FALSE
#'   sonar <- FALSE
#'   hdmse <- FALSE
#'   swath <- FALSE
#'   
#'   h <- header(raw_data)
#'   mslevs <- table(h$msLevel)
#'   ms3 <- mslevs['3']; if(is.na(ms3)) {ms3 <- 0}
#'   ms2 <- mslevs['2']; if(is.na(ms2)) {ms2 <- 0}
#'   ms1 <- mslevs['1']
#'   
#'   is.ms2 <- which(h$msLevel == 2)
#'   is.ms1 <- which(h$msLevel == 1)
#'   
#'   file.1 <- which(h[,"fileIdx"] == 1)
#'   ms1.time.range <- range(h[intersect(file.1, is.ms1),"retentionTime"])
#'   ms1.scan.interval <- (ms1.time.range[2] - ms1.time.range[1])/length(intersect(file.1, is.ms1))
#'   ms.fwhm <- round(550/ms.res, digits = 3)
#'   ppm <- round((1000000*ms.fwhm)/(550*2.355))
#'   
#'   if(!is.na(ms2)) {
#'     set.masses <- unique(round(h$precursorMZ[is.ms2]))
#'   } else {set.masses <- NA}
#'   
#'   ## mse rules
#'   if(
#'     round(ms2/ms1, digits = 2) == 1
#'   ) {
#'     mse <- TRUE
#'   }
#'   
#'   ## dda/dsda rules
#'   if(
#'     !mse & 
#'     ms2 > 0 & 
#'     length(set.masses) > 205
#'   ) {
#'     dda <- TRUE
#'   }
#'   
#'   
#'   
#'   ### XCMS feature detection
#'   
#'   if(is.na(ms.res)) {
#'     stop("not set up for matched.filter just yet", '\n')
#'   } else {
#'     ## centwave peak detection
#'     ## start XCMS processing: 
#'     ## peak detection with centWave
#'     cwp <- CentWaveParam(peakwidth = peak.width, 
#'                          ppm = ppm,
#'                          snthresh = 5,
#'                          mzdiff = ms.fwhm,
#'                          fitgauss = TRUE,
#'                          verboseColumns = TRUE)
#'     xdata <- findChromPeaks(raw_data, param = cwp, msLevel = 1)
#'     if(mse) {
#'       xdata <- findChromPeaks(xdata, param = cwp, msLevel = 2, add = TRUE)
#'       }
#'         # head(chromPeaks(xdata)) 
#'   }
#'   
#'   ## consider additionally defining sample groups based on experimental design factor levels
#'   sample.groups <- rep(1, length(xdata@.processHistory[[1]]@fileIndex))
#'   # if(mse) {
#'   #   sample.groups <- c(sample.groups, rep(2, length(xdata@.processHistory[[2]]@fileIndex)))
#'   # }
#'   
#'   
#'   
#'   ### XCMS feature grouping, pre-RT correction
#'   
#'   pdp <- PeakDensityParam(sampleGroups = sample.groups,
#'                           minFraction = minfrac, bw = bw.pre)
#'   xdata <- groupChromPeaks(xdata, param = pdp) 
#'   
#'   
#'   
#'    
#'   ### XCMS retention time adjustment, peak density
#'   
#'   pgp <- PeakGroupsParam(
#'     minFraction = max(0.5, minfrac)
#'   )
#'   xdata <- adjustRtime(xdata, param = pgp) 
#'   
#'   
#'   ### XCMS feature grouping, pre-RT correction
#'   
#'   pdp <- PeakDensityParam(sampleGroups = sample.groups,
#'                           minFraction = minfrac, bw = bw.post)
#'   xdata <- groupChromPeaks(xdata, param = pdp) 
#'   
#'   
#'   ### XCMS fillPeaks
#'   
#'   fpp <- FillChromPeaksParam(expandMz = 0, expandRt = 0, ppm = 0)
#'   xdata <- fillChromPeaks(xdata, param = fpp)
#'   
#'   save(xdata, file="datasets/xcmsObject.Rdata")
#'   return(xdata)
#' }
cbroeckl/csu.pmf.tools documentation built on June 19, 2024, 6:37 p.m.