R/getTargetFeatureStatistic.R
In peakPantheR: Peak Picking and Annotation of High Resolution Experiments

Documented in getTargetFeatureStatistic

#' @title Calculate chromatographic peak properties
#'
#' @description Calculate the ppm error, retention time deviation, tailing
#' factor and asymmetry factor for each measured feature.
#'
#' @param fittedCurves (list) A list (one entry per ROI window) of
#' \code{peakPantheR_curveFit} or NA
#' @param targetFeatTable a \code{\link{data.frame}} of compounds to target as
#' rows. Columns: \code{cpdID} (str), \code{cpdName} (str), \code{rtMin} (float
#' in seconds), \code{rt} (float in seconds, or \emph{NA}), \code{rtMax} (float
#' in seconds), \code{mzMin} (float), \code{mz} (float or \emph{NA}),
#' \code{mzMax} (float).
#' @param foundPeakTable a \code{data.frame} as generated by
#' \code{\link{findTargetFeatures}}, with features as rows and peak properties
#' as columns. The following columns are mandatory: \code{mzMin}, \code{mz},
#' \code{mzMax}, \code{rtMin}, \code{rt}, \code{rtMax}
#' @param verbose (bool) if TRUE message when NA scans are removed
#'
#' @return A \code{data.frame} with measured compounds as rows and measurements
#' and properties as columns (see Details).
#'
#' \subsection{Details:}{
#' The returned \code{data.frame} is structured as follow:
#' \tabular{ll}{
#' found \tab was the peak found\cr
#' rt \tab retention time of peak apex (sec)\cr
#' rtMin \tab leading edge of peak retention time (sec) determined at 0.5\% of
#' apex intensity\cr
#' rtMax \tab trailing edge of peak retention time (sec) determined at 0.5\% of
#' apex intensity\cr
#' mz \tab weighted (by intensity) mean of peak m/z across scans\cr
#' mzMin \tab m/z peak minimum (between rtMin, rtMax)\cr
#' mzMax \tab m/z peak maximum (between rtMin, rtMax)\cr
#' peakArea \tab integrated peak area\cr
#' maxIntMeasured \tab maximum peak intensity in raw data\cr
#' maxIntPredicted \tab maximum peak intensity based on curve fit\cr
#' ppm_error \tab difference in ppm between the expected and measured m/z\cr
#' rt_dev_sec \tab difference in seconds between the expected and measured rt\cr
#' tailingFactor \tab the tailing factor is a measure of peak tailing. It is
#' defined as the distance from the front slope of the peak to the back slope
#' divided by twice the distance from the center line of the peak to the front
#' slope, with all measurements made at 5\% of the maximum peak height. The
#' tailing factor of a peak will typically be similar to the asymmetry factor
#' for the same peak, but the two values cannot be directly converted\cr
#' asymmetryFactor \tab the asymmetry factor is a measure of peak tailing. It is
#' defined as the distance from the center line of the peak to the back slope
#' divided by the distance from the center line of the peak to the front slope,
#' with all measurements made at 10\% of the maximum peak height. The asymmetry
#' factor of a peak will typically be similar to the tailing factor for the same
#' peak, but the two values cannot be directly converted\cr
#' }
#' }
#'
#' @details
#' ## Examples cannot be computed as the function is not exported:
#' # fittedCurve
#' cFit1 <- list(amplitude=162404.8057918259, center=3341.888,
#'                 sigma=0.078786133031045896, gamma=0.0018336101984172684,
#'                 fitStatus=2, curveModel='skewedGaussian')
#' class(cFit1) <- 'peakPantheR_curveFit'
#' cFit2        <- list(amplitude=199249.10572753669, center=3382.577,
#'                     sigma=0.074904415304607966, gamma=0.0011471899372353885,
#'                     fitStatus=2, curveModel='skewedGaussian')
#' class(cFit2)    <- 'peakPantheR_curveFit'
#' input_fitCurves <- list(cFit1, cFit2)
#' 
#' # ROI
#' input_ROI <- data.frame(matrix(vector(), 2, 8, dimnames=list(c(), c('cpdID',
#'                 'cpdName', 'rtMin', 'rt', 'rtMax', 'mzMin', 'mz', 'mzMax'))),
#'                 stringsAsFactors=FALSE)
#' input_ROI[1,] <- c('ID-1', 'testCpd 1', 3310., 3344.88, 3390., 522.19, 522.2,
#'                     522.21)
#' input_ROI[2,] <- c('ID-2', 'testCpd 2', 3280., 3385.58, 3440., 496.19, 496.2,
#'                     496.21)
#' input_ROI[,3:8] <- vapply(input_ROI[,3:8], as.numeric, FUN.VALUE=numeric(2))
#'
#' # foundPeakTable
#' input_foundPeakTable <- data.frame(matrix(vector(), 2, 10, dimnames=list(c(),
#'                     c('found', 'rtMin', 'rt', 'rtMax', 'mzMin', 'mz',
#'                     'mzMax','peakArea','maxIntMeasured','maxIntPredicted'))),
#'                     stringsAsFactors=FALSE)
#' input_foundPeakTable[1,]  <- c(TRUE, 3309.758, 3346.827, 3385.410, 522.19,
#'                                 522.2, 522.21, 26133726, 889280, 901015)
#' input_foundPeakTable[2,]  <- c(TRUE, 3345.376, 3386.529, 3428.279, 496.19,
#'                                 496.2, 496.21, 35472141, 1128960, 1113576)
#' input_foundPeakTable[,1]  <- vapply(input_foundPeakTable[,c(1)], as.logical,
#'                                 FUN.VALUE=logical(1))
#'
#' # Run peak statistics
#' peakStatistics <- getTargetFeatureStatistic(input_fitCurves, input_ROI,
#'                                             input_foundPeakTable)
#' peakStatistics   
#' #   found    rtMin       rt    rtMax  mzMin    mz  mzMax peakArea
#' # 1  TRUE 3309.758 3346.827 3385.410 522.19 522.2 522.21 26133726
#' # 2  TRUE 3345.376 3386.529 3428.279 496.19 496.2 496.21 35472141
#' #    maxIntMeasured maxIntPredicted ppm_error rt_dev_sec tailingFactor
#' # 1          889280          901015         0      1.947      1.015385
#' # 2         1128960         1113576         0      0.949      1.005372
#' #    asymmetryFactor
#' # 1         1.026886
#' # 2         1.009304
getTargetFeatureStatistic <- function(fittedCurves, targetFeatTable,
                                        foundPeakTable, verbose = FALSE) {
    stime <- Sys.time()
    # Check input
    if (dim(targetFeatTable)[1] != dim(foundPeakTable)[1]) {
        stop("Number of features in \"targetFeatTable\" (",
            dim(targetFeatTable)[1], ") and \"foundPeakTable\" (",
            dim(foundPeakTable)[1], ") do not match!") }
    if (length(fittedCurves) != dim(foundPeakTable)[1]) {
        stop("Number of fitted curves in \"fittedCurves\" (",
            length(fittedCurves),
            ") and number of features in \"foundPeakTable\" (",
            dim(foundPeakTable)[1], ") do not match!") }
    # Calculate the statistics
    peakStat <- data.frame(matrix(vector(), dim(targetFeatTable)[1], 4,
                            dimnames = list(c(), c("ppm_error", "rt_dev_sec",
                                                    "tailingFactor",
                                                    "asymmetryFactor"))),
                            stringsAsFactors = FALSE)
    # only work with found features
    for (i in seq_len(dim(targetFeatTable)[1])) {
        if (foundPeakTable$found[i]) {
            # ppm_error
            if (!is.na(targetFeatTable$mz[i])) {
                peakStat$ppm_error[i] <- (abs(foundPeakTable$mz[i] -
                        targetFeatTable$mz[i])/targetFeatTable$mz[i]) * 1e+06 }
            # rt_dev_sec
            if (!is.na(targetFeatTable$rt[i])) {
                peakStat$rt_dev_sec[i] <- foundPeakTable$rt[i] -
                        targetFeatTable$rt[i] }
            # Tailing Factor
            peakStat$tailingFactor[i] <- peak_shape_stat(fittedCurves[[i]],
                                                    foundPeakTable$rt[i],
                                                    foundPeakTable$rtMin[i],
                                                    foundPeakTable$rtMax[i],
                                                    statistic = "tailingFactor")
            # Asymmetry Factor
            peakStat$asymmetryFactor[i] <- peak_shape_stat(fittedCurves[[i]],
                                                foundPeakTable$rt[i],
                                                foundPeakTable$rtMin[i],
                                                foundPeakTable$rtMax[i],
                                                statistic = "asymmetryFactor")}}
    # group the results and return
    finalTable <- cbind.data.frame(foundPeakTable, peakStat)
    etime <- Sys.time()
    if (verbose) { message("Peak statistics done in: ",
                            round(as.double(difftime(etime, stime)), 2),
                            " ", units(difftime(etime, stime))) }
    return(finalTable)
}


# -----------------------------------------------------------------------------
# getTargetFeatureStatistic helper functions

## Define the peak_shape_stat()
#
# Calculate tailing factor and asymmetry factor
#
# Tailing Factor and Asymmetry Factor following the equations from
# http://www.chromforum.org/viewtopic.php?t=20079
#
# @param fittedCurve A \code{peakPantheR_curveFit} for the feature of
#   interest or NA
# @param apexRT (float) retention time in seconds of the measured peak apex
# @param rtMin (float) leading edge retention time in seconds (at 0.5%)
# @param rtMax (float) tailing edge retention time in seconds (at 0.5%)
# @param statistic (str) either \code{tailingFactor} or
#   \code{asymmetryFactor}
# @param sampling (int) Number of points to employ when subsampling the
#   fittedCurve between rtMin and rtMax @return Tailing factor or Asymmetry
#   factor value
peak_shape_stat <- function(fittedCurve, apexRT, rtMin, rtMax,
                            statistic = "tailingFactor", sampling = 250) {
    # Check input
    if (is.na(apexRT)) { return(as.numeric(NA)) }
    if (is.na(rtMin)) { return(as.numeric(NA)) }
    if (is.na(rtMax)) { return(as.numeric(NA)) }
    if (all(is.na(fittedCurve))) { return(as.numeric(NA)) }

    # A B and C are retention time in seconds. C is RT of the apex, A is
    # left side at x%, B is right side at x%.
    # x% (cutoff) depends of the statistic employed.
    ## Find position of A and B
    cutoff <- switch(statistic, tailingFactor = 0.05, asymmetryFactor = 0.1)
    cutoff_int <- cutoff * predictCurve(fittedCurve, x = apexRT)
        # intensity at cutoff pt of interest
    ## A side
    # reverse order for up slope
    A_grid_rt   <- seq(from = apexRT, to = rtMin,
                        by = ((rtMin - apexRT)/(sampling - 1)))
    A_slope_int <- predictCurve(fittedCurve, x = A_grid_rt)
    # pos of 1st point past cutoff
    A_cutoff_pt <- match(-1, sign(A_slope_int - cutoff_int))
    # should not be NA as rtMin Max is at 0.5% while cutoff is 5 or 10%
    if (is.na(A_cutoff_pt)) { return(NA) }
    # points left and right from pt of interest
    A_key_pt    <- c(A_cutoff_pt - 1, A_cutoff_pt)
    # linear interpolation of exact rt
    A           <- stats::approx(x = A_slope_int[A_key_pt],
                                y = A_grid_rt[A_key_pt],
                                xout = cutoff_int)$y
    ## B side
    B_grid_rt   <- seq(from = apexRT, to = rtMax,
                        by = ((rtMax - apexRT)/(sampling - 1)))
    B_slope_int <- predictCurve(fittedCurve, x = B_grid_rt)
    # pos of 1st point past cutoff
    B_cutoff_pt <- match(-1, sign(B_slope_int - cutoff_int))
    # should not be NA as rtMin Max is at 0.5% while cutoff is 5 or 10%
    if (is.na(B_cutoff_pt)) { return(NA) }
    # points left and right from pt of interest
    B_key_pt    <- c(B_cutoff_pt - 1, B_cutoff_pt)
    # linear interpolation of exact rt
    B           <- stats::approx(x = B_slope_int[B_key_pt],
                                y = B_grid_rt[B_key_pt],
                                xout = cutoff_int)$y
    ## C is apexRT
    C <- apexRT
    ## Peak statistic
    if (statistic == "tailingFactor") { return((B - A)/(2 * (C - A))) }
    if (statistic == "asymmetryFactor") { return((B - C)/(C - A)) }
}