R/getTargetFeatureStatistic.R
In peakPantheR: Peak Picking and Annotation of High Resolution Experiments

Documented in getTargetFeatureStatistic

#' Calculate chromatographic peak properties
#'
#' Calculate the ppm error, retention time deviation, tailing factor and asymmetry factor for each measured feature.
#'
#' @param fittedCurves (list) A list (one entry per ROI window) of \code{peakPantheR_curveFit} or NA
#' @param targetFeatTable a \code{\link{data.frame}} of compounds to target as rows. Columns: \code{cpdID} (str), \code{cpdName} (str), \code{rtMin} (float in seconds), \code{rt} (float in seconds, or \emph{NA}), \code{rtMax} (float in seconds), \code{mzMin} (float), \code{mz} (float or \emph{NA}), \code{mzMax} (float).
#' @param foundPeakTable a \code{data.frame} as generated by \code{\link{findTargetFeatures}}, with features as rows and peak properties as columns. The following columns are mandatory: \code{mzMin}, \code{mz}, \code{mzMax}, \code{rtMin}, \code{rt}, \code{rtMax}
#' @param verbose (bool) if TRUE message when NA scans are removed
#'
#' @return A \code{data.frame} with measured compounds as rows and measurements and properties as columns (see Details).
#'
#' \subsection{Details:}{
#'   The returned \code{data.frame} is structured as follow:
#'   \tabular{ll}{
#'     found \tab was the peak found\cr
#'     rt \tab retention time of peak apex (sec)\cr
#'     rtMin \tab leading edge of peak retention time (sec) determined at 0.5\% of apex intensity\cr
#'     rtMax \tab trailing edge of peak retention time (sec) determined at 0.5\% of apex intensity\cr
#'     mz \tab weighted (by intensity) mean of peak m/z across scans\cr
#'     mzMin \tab m/z peak minimum (between rtMin, rtMax)\cr
#'     mzMax \tab m/z peak maximum (between rtMin, rtMax)\cr
#'     peakArea \tab integrated peak area\cr
#'     maxIntMeasured \tab maximum peak intensity in raw data\cr
#'     maxIntPredicted \tab maximum peak intensity based on curve fit\cr
#'     ppm_error \tab difference in ppm between the expected and measured m/z\cr
#'     rt_dev_sec \tab difference in seconds between the expected and measured rt\cr
#'     tailingFactor \tab the tailing factor is a measure of peak tailing. It is defined as the distance from the front slope of the peak to the back slope divided by twice the distance from the center line of the peak to the front slope, with all measurements made at 5\% of the maximum peak height. The tailing factor of a peak will typically be similar to the asymmetry factor for the same peak, but the two values cannot be directly converted\cr
#'    asymmetryFactor \tab the asymmetry factor is a measure of peak tailing. It is defined as the distance from the center line of the peak to the back slope divided by the distance from the center line of the peak to the front slope, with all measurements made at 10\% of the maximum peak height. The asymmetry factor of a peak will typically be similar to the tailing factor for the same peak, but the two values cannot be directly converted\cr
#'   }
#' }
#'
#' @examples
#' \dontrun{
#' # fittedCurve
#' cFit1           <- list(amplitude=162404.8057918259, center=3341.888,
#'                         sigma=0.078786133031045896, gamma=0.0018336101984172684,
#'                         fitStatus=2, curveModel="skewedGaussian")
#' class(cFit1)    <- 'peakPantheR_curveFit'
#' cFit2           <- list(amplitude=199249.10572753669, center=3382.577,
#'                         sigma=0.074904415304607966, gamma=0.0011471899372353885,
#'                         fitStatus=2, curveModel="skewedGaussian")
#' class(cFit2)    <- 'peakPantheR_curveFit'
#' input_fitCurves <- list(cFit1, cFit2)
#' 
# # ROI
#' input_ROI       <- data.frame(matrix(vector(), 2, 8, dimnames=list(c(), c("cpdID", 
#'                      "cpdName", "rtMin", "rt", "rtMax", "mzMin", "mz", "mzMax"))),
#'                      stringsAsFactors=F)
#' input_ROI[1,] 	<- c("ID-1", "testCpd 1", 3310., 3344.88, 3390., 522.19, 522.2, 522.21)
#' input_ROI[2,] 	<- c("ID-2", "testCpd 2", 3280., 3385.58, 3440., 496.19, 496.2, 496.21)
#' input_ROI[,3:8] <- sapply(input_ROI[,3:8], as.numeric)
#'
#' # foundPeakTable
#' input_foundPeakTable      <- data.frame(matrix(vector(), 2, 10, dimnames=list(c(), 
#'                                c("found", "rtMin", "rt", "rtMax", "mzMin", "mz", "mzMax",
#'                                "peakArea", "maxIntMeasured", "maxIntPredicted"))),
#'                                stringsAsFactors=F)
#' input_foundPeakTable[1,]  <- c(TRUE, 3309.758, 3346.827, 3385.410, 522.19, 522.2, 522.21, 
#'                                26133726, 889280, 901015)
#' input_foundPeakTable[2,]  <- c(TRUE, 3345.376, 3386.529, 3428.279, 496.19, 496.2, 496.21,
#'                                35472141, 1128960, 1113576)
#' input_foundPeakTable[,1]  <- sapply(input_foundPeakTable[,c(1)], as.logical)
#'
#' # Run peak statistics
#' peakStatistics <- getTargetFeatureStatistic(input_fitCurves, input_ROI, input_foundPeakTable)
#' peakStatistics   
#' #   found    rtMin       rt    rtMax  mzMin    mz  mzMax peakArea maxIntMeasured maxIntPredicted
#' # 1  TRUE 3309.758 3346.827 3385.410 522.19 522.2 522.21 26133726         889280          901015 
#' # 2  TRUE 3345.376 3386.529 3428.279 496.19 496.2 496.21 35472141        1128960         1113576
#' #   ppm_error rt_dev_sec tailingFactor asymmetryFactor
#' # 1        0      1.947      1.015385        1.026886
#' # 2        0      0.949      1.005372        1.009304
#' }
getTargetFeatureStatistic <- function(fittedCurves, targetFeatTable, foundPeakTable, verbose=FALSE) {

  ## Define the peak_shape_stat() -----------------------
  #
  # Calculate tailing factor and asymmetry factor
  #
  # Tailing Factor and Asymmetry Factor following the equations from http://www.chromforum.org/viewtopic.php?t=20079
  #
  # @param fittedCurve A \code{peakPantheR_curveFit} for the feature of interest or NA
  # @param apexRT (float) retention time in seconds of the measured peak apex
  # @param rtMin (float) leading edge retention time in seconds (at 0.5%)
  # @param rtMax (float) tailing edge retention time in seconds (at 0.5%)
  # @param statistic (str) either \code{tailingFactor} or \code{asymmetryFactor}
  # @param sampling (int) Number of points to employ when subsampling the fittedCurve between rtMin and rtMax
  #
  # @return Tailing factor or Asymmetry factor value
  #
  peak_shape_stat <- function(fittedCurve, apexRT, rtMin, rtMax, statistic="tailingFactor", sampling=250){

    # Check input
    if (is.na(apexRT)) { return(as.numeric(NA)) }
    if (is.na(rtMin)) { return(as.numeric(NA)) }
    if (is.na(rtMax)) { return(as.numeric(NA)) }
    if (all(is.na(fittedCurve))) { return(as.numeric(NA)) }
    
    # A B and C are retention time in seconds. C is RT of the apex, A is left side as x%, B is right side at x%. x% (cutoff) depends of the statistic employed.
    
    ## Find position of A and B
    cutoff      <- switch(statistic, tailingFactor=0.05, asymmetryFactor=0.1)
    cutoff_int  <- cutoff * predictCurve(fittedCurve, x=apexRT) # intensity at cutoff pt of interest
    
    ## A side
    # reverse order for up slope
    A_grid_rt   <- seq(from=apexRT, to=rtMin, by=((rtMin-apexRT)/(sampling-1)))
    A_slope_int <- predictCurve(fittedCurve, x=A_grid_rt)
    # pos of 1st point past cutoff
    A_cutoff_pt <- match(-1,sign(A_slope_int - cutoff_int))
    # should not be NA as rtMin Max is at 0.5% while cutoff is 5 or 10%
    if (is.na(A_cutoff_pt)) { return(NA) }                    
    # points left and right from pt of interest
    A_key_pt    <- c(A_cutoff_pt-1, A_cutoff_pt)                
    # linear interpolation of exact rt
    A           <- stats::approx(x=A_slope_int[A_key_pt], y=A_grid_rt[A_key_pt], xout=cutoff_int)$y
  
    ## B side
    B_grid_rt   <- seq(from=apexRT, to=rtMax, by=((rtMax-apexRT)/(sampling-1)))
    B_slope_int <- predictCurve(fittedCurve, x=B_grid_rt)
    # pos of 1st point past cutoff
    B_cutoff_pt <- match(-1,sign(B_slope_int - cutoff_int))
    # should not be NA as rtMin Max is at 0.5% while cutoff is 5 or 10%
    if (is.na(B_cutoff_pt)) { return(NA) }                    
    # points left and right from pt of interest
    B_key_pt    <- c(B_cutoff_pt-1, B_cutoff_pt)                
    # linear interpolation of exact rt
    B           <- stats::approx(x=B_slope_int[B_key_pt], y=B_grid_rt[B_key_pt], xout=cutoff_int)$y

    ## C is apexRT
    C <- apexRT
    
    ## Peak statistic
    if(statistic=="tailingFactor") {
      return( (B-A)/(2*(C-A)) )
    }
    if(statistic=="asymmetryFactor") {
      return( (B-C)/(C-A) )
    }
  }
  ## ----------------------------------------------------

	stime <- Sys.time()

	## Check input
  if (dim(targetFeatTable)[1] != dim(foundPeakTable)[1]) {
    stop('Number of features in "targetFeatTable" (', dim(targetFeatTable)[1],') and "foundPeakTable" (', dim(foundPeakTable)[1], ') do not match!')
  }
	if (length(fittedCurves) != dim(foundPeakTable)[1]) {
	  stop('Number of fitted curves in "fittedCurves" (', length(fittedCurves),') and number of features in "foundPeakTable" (', dim(foundPeakTable)[1], ') do not match!')
	}

  ## Calculate the statistics
  peakStat	<- data.frame(matrix(vector(), dim(targetFeatTable)[1], 4, dimnames=list(c(), c("ppm_error", "rt_dev_sec", "tailingFactor", "asymmetryFactor"))), stringsAsFactors=F)

  for (i in 1:dim(targetFeatTable)[1]) {
    # If the feature wasn't found we cannot work with it
    if (foundPeakTable$found[i]) {
      # ppm_error
      if (!is.na(targetFeatTable$mz[i])) {
        peakStat$ppm_error[i]     <- (abs(foundPeakTable$mz[i] - targetFeatTable$mz[i])/targetFeatTable$mz[i])*1E6
      }
      # rt_dev_sec
      if (!is.na(targetFeatTable$rt[i])) {
        peakStat$rt_dev_sec[i]    <- abs(foundPeakTable$rt[i] - targetFeatTable$rt[i])
      }
      # Tailing Factor
      peakStat$tailingFactor[i]   <- peak_shape_stat(fittedCurves[[i]], foundPeakTable$rt[i], foundPeakTable$rtMin[i], foundPeakTable$rtMax[i], statistic = "tailingFactor")
      # Asymmetry Factor
      peakStat$asymmetryFactor[i] <- peak_shape_stat(fittedCurves[[i]], foundPeakTable$rt[i], foundPeakTable$rtMin[i], foundPeakTable$rtMax[i], statistic = "asymmetryFactor")
    }
  }

  ## group the results and return
  finalTable  <- cbind.data.frame(foundPeakTable, peakStat)

	etime <- Sys.time()
	if (verbose) {
    message('Peak statistics done in: ', round(as.double(difftime(etime,stime)),2),' ',units( difftime(etime,stime)))
  }

  return( finalTable )
}