Nothing
#' Find and integrate target features in each ROI
#'
#' For each ROI, fit a curve and integrate the largest feature in the box. Each entry in \code{ROIsDataPoints} must match the corresponding row in \code{ROI}. The curve shape to employ for fitting can be changed with \code{curveModel} while fitting parameters can be changed with \code{params} (list with one param per ROI window). \code{rtMin} and \code{rtMax} are established at 0.5% of apex intensity using a moving window from the apex outward (the window is the ROI width); if after 8 iterations \code{rtMin} or \code{rtMax} is not found, NA is returned and the peak fit rejected. \code{peakArea} is calculated from \code{rtMin} to \code{rtMax}. \code{mz} is the weighted (by intensity) average mz of datapoints falling into the \code{rtMin} to \code{rtMax} range, \code{mzMin} and \code{mzMax} are the minimum and maxmimum mass in these range. If \code{rtMin} or \code{rtMax} falls outside of ROI (extracted scans), \code{mzMin} or \code{mzMax} are returned as the input ROI limits and \code{mz} is an approximation on the datapoints available (if no scan of the ROI fall between rtMin/rtMax, mz would be NA, the peak is rejected). If any of the two following ratio are superior to \code{maxApexResidualRatio}, the fit is rejected: 1) ratio of fit residuals at the apex (predicted apex fit intensity vs measured apex intensity: fit overshoots the apex), 2) ratio of predicted apex fit intensity vs maximum measured peak intensity (fit misses the real apex in the peak).
#'
#' @param ROIsDataPoints (list) A list (one entry per ROI window) of data.frame with signal as row and retention time ("rt"), mass ("mz") and intensity ("int) as columns. Must match each row of ROI.
#' @param ROI (data.frame) A data.frame of compounds to target as rows. Columns: \code{rtMin} (float in seconds), \code{rtMax} (float in seconds), \code{mzMin} (float), \code{mzMax} (float)
#' @param curveModel (str) Name of the curve model to fit (currently \code{skewedGaussian})
#' @param params (list or str) Either 'guess' for automated parametrisation or list (one per ROI windows) of "guess" or list of curve fit parameters
#' @param sampling (int) Number of points to employ when subsampling the fittedCurve (rt, rtMin, rtMax, integral calculation)
#' @param maxApexResidualRatio (float) Ratio of maximum allowed fit residual at the peak apex, compared to the fit max intensity. (e.g. 0.2 for a maximum residual of 20\% of apex intensity)
#' @param verbose (bool) If TRUE message the time taken and number of features found
#' @param ... Passes arguments to \code{fitCurve} to alter peak fitting (\code{params})
#'
#' @return A list: \code{list()$peakTable} (\emph{data.frame}) with targeted features as rows and peak measures as columns (see Details), \code{list()$curveFit} (\emph{list}) a list of \code{peakPantheR_curveFit} or NA for each ROI.
#'
#' \subsection{Details:}{
#' The returned \code{data.frame} is structured as follow:
#' \tabular{ll}{
#' found \tab was the peak found\cr
#' rt \tab retention time of peak apex (sec)\cr
#' rtMin \tab leading edge of peak retention time (sec) determined at 0.5\% of apex intensity\cr
#' rtMax \tab trailing edge of peak retention time (sec) determined at 0.5\% of apex intensity\cr
#' mz \tab weighted (by intensity) mean of peak m/z across scans\cr
#' mzMin \tab m/z peak minimum (between rtMin, rtMax)\cr
#' mzMax \tab m/z peak maximum (between rtMin, rtMax)\cr
#' peakArea \tab integrated peak area\cr
#' maxIntMeasured \tab maximum peak intensity in raw data\cr
#' maxIntPredicted \tab maximum peak intensity based on curve fit (at apex)\cr
#' }
#' }
#'
#' @examples
#' \dontrun{
#' ## Load data
#' library(faahKO)
#' library(MSnbase)
#' netcdfFilePath <- system.file('cdf/KO/ko15.CDF', package = "faahKO")
#' raw_data <- MSnbase::readMSData(netcdfFilePath, centroided=TRUE, mode='onDisk')
#'
#' ## targetFeatTable
#' targetFeatTable <- data.frame(matrix(vector(), 2, 8, dimnames=list(c(), c("cpdID",
#' "cpdName", "rtMin", "rt", "rtMax", "mzMin", "mz", "mzMax"))),
#' stringsAsFactors=F)
#' targetFeatTable[1,] <- c("ID-1", "Cpd 1", 3310., 3344.888, 3390., 522.194778, 522.2, 522.205222)
#' targetFeatTable[2,] <- c("ID-2", "Cpd 2", 3280., 3385.577, 3440., 496.195038, 496.2, 496.204962)
#' targetFeatTable[,3:8] <- sapply(targetFeatTable[,3:8], as.numeric)
#'
#' ROIsPt <- extractSignalRawData(raw_data, rt=targetFeatTable[,c('rtMin','rtMax')],
#' mz=targetFeatTable[,c('mzMin','mzMax')], verbose=TRUE)
#' # Reading data from 2 windows
#'
#' foundPeaks <- findTargetFeatures(ROIsPt, targetFeatTable, verbose=T)
#' # Warning: rtMin/rtMax outside of ROI; datapoints cannot be used for mzMin/mzMax calculation,
#' # approximate mz and returning ROI$mzMin and ROI$mzMax for ROI #1
#' # Found 2/2 features in 0.07 secs
#'
#' foundPeaks
#' # $peakTable
#' # found rtMin rt rtMax mzMin mz mzMax peakArea maxIntMeasured maxIntPredicted
#' # 1 TRUE 3309.759 3346.828 3385.410 522.1948 522.2 522.2052 26133727 889280 901015.8
#' # 2 TRUE 3345.377 3386.529 3428.279 496.2000 496.2 496.2000 35472141 1128960 1113576.7
#' #
#' # $curveFit
#' # $curveFit[[1]]
#' # $amplitude
#' # [1] 162404.8
#' #
#' # $center
#' # [1] 3341.888
#' #
#' # $sigma
#' # [1] 0.07878613
#' #
#' # $gamma
#' # [1] 0.00183361
#' #
#' # $fitStatus
#' # [1] 2
#' #
#' # $curveModel
#' # [1] "skewedGaussian"
#' #
#' # attr(,"class")
#' # [1] "peakPantheR_curveFit"
#' #
#' # $curveFit[[2]]
#' # $amplitude
#' # [1] 199249.1
#' #
#' # $center
#' # [1] 3382.577
#' #
#' # $sigma
#' # [1] 0.07490442
#' #
#' # $gamma
#' # [1] 0.00114719
#' #
#' # $fitStatus
#' # [1] 2
#' #
#' # $curveModel
#' # [1] "skewedGaussian"
#' #
#' # attr(,"class")
#' # [1] "peakPantheR_curveFit"
#' }
findTargetFeatures <- function(ROIsDataPoints, ROI, curveModel='skewedGaussian', params='guess', sampling=250, maxApexResidualRatio=0.2, verbose=FALSE, ...){
stime <- Sys.time()
## Check inputs
nROI <- nrow(ROI)
# ROIsDataPoints match ROI
if (length(ROIsDataPoints) != nROI) {
stop('Check input, number of ROIsDataPoints entries must match the number of rows of ROI')
}
# Check all data points fall into the corresponding ROI
for(r in 1:nROI) {
if(!all((ROIsDataPoints[[r]]$rt >= ROI[r,c('rtMin')]) & (ROIsDataPoints[[r]]$rt <= ROI[r,c('rtMax')]))) {stop('Check input not all datapoints for window #', r, ' are into the corresponding ROI (rt)')}
if(!all((ROIsDataPoints[[r]]$mz >= ROI[r,c('mzMin')]) & (ROIsDataPoints[[r]]$mz <= ROI[r,c('mzMax')]))) {stop('Check input not all datapoints for window #', r, ' are into the corresponding ROI (mz)')}
}
# Check params input
if (!(is.character(params) | is.list(params))) {
stop('Check input, "params" must be "guess" or list')
}
# params is 'guess' if character
if (is.character(params)){
if (params != 'guess') {
stop('Check input, "params" must be "guess" if not list')
}
}
# length if params is list
if (is.list(params)){
if (length(params) != nROI) {
stop('Check input, number of parameters must match number of rows of ROI')
}
}
## Init output
outTable <- data.frame(matrix(vector(), nROI, 10, dimnames=list(c(),c('found','rtMin','rt','rtMax','mzMin','mz','mzMax','peakArea','maxIntMeasured','maxIntPredicted'))), stringsAsFactors=F)
outTable$found <- rep(FALSE, nROI) # set found to FALSE
outCurveFit <- rep(list(NA), nROI)
# use input params or guess
if (any(params != 'guess')) {
useParams <- TRUE
if (verbose) {message('Curve fitting parameters passed as input employed')}
} else {
useParams <- FALSE
}
## Iterate over ROIs
for (i in 1:nROI) {
# set params for fitting
new_params <- 'guess'
if (useParams) {
new_params <- params[[i]]
}
## EIC to fit
tmp_EIC <- generateIonChromatogram(ROIDataPoint=ROIsDataPoints[[i]], aggregationFunction='sum')
## resample to a standardised grid of size the maximum between 2*sampling and raw number of data points (double sampling used for integration)
#rawData_EIC <- generateIonChromatogram(ROIDataPoint=ROIsDataPoints[[i]], aggregationFunction='sum')
#tmp_gridMin <- min(rawData_EIC$rt)
#tmp_gridMax <- max(rawData_EIC$rt)
#tmp_gridSampling <- max(2*sampling, dim(rawData_EIC)[1])
#grid_rt_EIC <- seq(from=tmp_gridMin, to=tmp_gridMax, by=((tmp_gridMax-tmp_gridMin)/(tmp_gridSampling-1)))
#raw_approx_fun <- stats::approxfun(x=rawData_EIC$rt, y=rawData_EIC$int)
#tmp_EIC <- data.frame(rt=grid_rt_EIC, int=raw_approx_fun(grid_rt_EIC))
## fit curve to EIC (store output). If failure move to next window
fittedCurve <- tryCatch(
{
## try
fittedCurve <- fitCurve(x=tmp_EIC$rt, y=tmp_EIC$int, curveModel=curveModel, params=new_params, ...)
},
error=function(cond) {
## catch
return(NA)
}
)
# catch fit failure
if (all(is.na(fittedCurve))) {
if (verbose) {message('Fit of ROI #', i,' is unsuccessful (try error)')}
# move to next window (empty df row was already initialised)
next
}
# discard fit if nls.lm fit status indicates unsuccessful completion
if ((fittedCurve$fitStatus == 0) | (fittedCurve$fitStatus ==5) | (fittedCurve$fitStatus ==-1)) {
if (verbose) {message('Fit of ROI #', i,' is unsuccessful (fit status)')}
# move to next window (empty df row was already initialised)
next
}
## rt (search on same bounds as peak fit +/-3s)
rt_EICmax <- tmp_EIC$rt[which.max(tmp_EIC$int)]
grid_rt <- seq(from=rt_EICmax-3, to=rt_EICmax+3, by=(6/(sampling-1)))
close_apex_int <- predictCurve(fittedCurve, x=grid_rt)
rt <- grid_rt[which.max(close_apex_int)]
## maxIntPredicted (fit apex intensity)
maxIntPredicted <- predictCurve(fittedCurve=fittedCurve, x=rt)
## rtMin, rtMax (look for 0.5% from max int, by rolling away from apex until match or too many iterations)
peakLim_int <- 0.005 * maxIntPredicted
deltaRt <- ROI$rtMax[i] - ROI$rtMin[i]
## Up slope
# init
rtMin <- as.numeric(NA)
boxMin <- rt
cntr <- 0
# search rtMin
while (is.na(rtMin) & cntr<=8) {
# box moves earlier in rt each time
boxMax <- boxMin
boxMin <- boxMax - deltaRt
cntr <- cntr+1
grid_rt <- seq(from=boxMax, to=boxMin, by=((boxMin-boxMax)/(sampling-1))) # reverse order for up slope
slope_int <- predictCurve(fittedCurve, x=grid_rt)
cutoff_pt <- match(-1,sign(slope_int - peakLim_int)) # pos of 1st point past cutoff
if (is.na(cutoff_pt)) {
rtMin <- as.numeric(NA)
next
}
key_pt <- c(cutoff_pt-1, cutoff_pt) # points left and right from rtMin
rtMin <- stats::approx(x=slope_int[key_pt], y=grid_rt[key_pt], xout=peakLim_int)$y # linear interpolation of exact rt
}
if (is.na(rtMin) & verbose) {message('Warning: rtMin cannot be determined for ROI #',i)}
## Down slope
# init
rtMax <- as.numeric(NA)
boxMax <- rt
cntr <- 0
# search rtMax
while (is.na(rtMax) & cntr<=8) {
# box moves later in rt each time
boxMin <- boxMax
boxMax <- boxMin + deltaRt
cntr <- cntr+1
grid_rt <- seq(from=boxMin, to=boxMax, by=((boxMax-boxMin)/(sampling-1)))
slope_int <- predictCurve(fittedCurve, x=grid_rt)
cutoff_pt <- match(-1,sign(slope_int - peakLim_int)) # pos of 1st point past cutoff
if (is.na(cutoff_pt)) {
rtMax <- as.numeric(NA)
next
}
key_pt <- c(cutoff_pt-1, cutoff_pt) # points left and right from rtMax
rtMax <- stats::approx(x=slope_int[key_pt], y=grid_rt[key_pt], xout=peakLim_int)$y # linear interpolation of exact rt
}
if (is.na(rtMax) & verbose) {message('Warning: rtMax cannot be determined for ROI #',i)}
# if rtMin or rtMax cannot be determined the fit is not successful
if (is.na(rtMin) | is.na(rtMax)) {
message('Fit of ROI #', i,' is unsuccessful (cannot determine rtMin/rtMax)')
# move to next window (empty df row was already initialised)
next
}
## maxIntMeasured (max raw data intensity; 0 in case of no scans)
maxIntMeasured <- max(0, tmp_EIC$int[(tmp_EIC$rt < rtMax) & (tmp_EIC$rt > rtMin)])
## mz, mzMin, mzMax
# if rtMin, rtMax are outside of ROI, we cannot calculate mzMin, mzMax and mz precisely:
# default to ROI$mzMIn, ROI$mzMax as a safe choice, and approximate mz
tmpRtMin <- rtMin
tmpRtMax <- rtMax
tmpMzMin <- ROI$mzMin[i]
tmpMzMax <- ROI$mzMax[i]
tmpROIData <- ROIsDataPoints[[i]]
isValid <- TRUE
# deal with rtMin rtMax outside of ROI (warning and no)
if ((tmpRtMin < ROI$rtMin[i]) | (tmpRtMax > ROI$rtMax[i])) {
isValid <- FALSE
if (verbose) { message('Warning: rtMin/rtMax outside of ROI; datapoints cannot be used for mzMin/mzMax calculation, approximate mz and returning ROI$mzMin and ROI$mzMax for ROI #',i) }
}
# subset datapoints mz to rtMin/rtMax range
tmpPt <- tmpROIData[(tmpROIData$rt > tmpRtMin) & (tmpROIData$rt < tmpRtMax), ]
# rtMin rtMax range can be used for mzMin mzMax calculation (otherwise init to ROI mzMin/Max)
if (isValid) {
tmpMzMin <- min(tmpPt$mz)
tmpMzMax <- max(tmpPt$mz)
}
# calculate mz (might be an approx) (weighted average of total intensity across all rt for each unique mz)
mzRange <- unique(tmpPt$mz)
mzTotalIntensity <- sapply(mzRange, function(x) {sum(tmpPt$int[tmpPt$mz == x])})
if (length(mzTotalIntensity) > 0) { # make sure weighted.mean doesn't crash if no scan match
mz <- stats::weighted.mean(mzRange, mzTotalIntensity)
} else {
mz <- NA
}
# tidy
mzMin <- tmpMzMin
mzMax <- tmpMzMax
# if mz, mzMin or mzMax cannot be determined the fit is not successful (mzMin/mzMax default to ROI)
if (is.na(mz) | is.na(mzMin) | is.na(mzMax)) {
if (verbose) {message('Fit of ROI #', i,' is unsuccessful (cannot determine mz/mzMin/mzMax)')}
# move to next window (empty df row was already initialised)
next
}
## integrate curve
tmpRtMin <- rtMin
tmpRtMax <- rtMax
# __a__
# /| \
# / h \
# /____|__b____\
# \ | /
# \ h /
# \___|__c_/
#
# Area = (a+b)/2 * h + (b+c)/2 * h
# = (a+2b+c)/2 * h
h <- (tmpRtMax-tmpRtMin)/(sampling-1)
grid_rt <- seq(from=tmpRtMin, to=tmpRtMax, by=h)
val_int <- predictCurve(fittedCurve, x=grid_rt)
dist <- sum( c(val_int, val_int[2:(sampling-1)]) )/2
peakArea <- dist * h
## Check quality of fit (residuals at apex, and max predicted to max measured)
# maxIntMeasured: peak max value in raw data
# maxIntPredicted: fit apex value
# IntRawApex: raw data apex value
raw_approx_fun <- stats::approxfun(x=tmp_EIC$rt, y=tmp_EIC$int)
IntRawApex <- raw_approx_fun(rt)
if (is.na(IntRawApex)) {IntRawApex <- 0}
# residual at apex
apex_residuals <- abs(maxIntPredicted - maxIntMeasured)
# residual between maximums (raw vs fit)
max_residuals <- abs(maxIntPredicted - IntRawApex)
# compare residuals to max fit intensity
if(((apex_residuals / maxIntPredicted) > maxApexResidualRatio) | ((max_residuals / maxIntPredicted) > maxApexResidualRatio)) {
if (verbose) {message('Fit of ROI #', i,' is unsuccessful (apex residuals is ', round(apex_residuals / maxIntPredicted, 2),' of max fit intensity, max intensity residuals is ', round(max_residuals / maxIntPredicted, 2),' of max fit intensity)')}
# move to next window (empty df row was already initialised)
next
}
# ! Unused filtering based on residuals !
# @param maxResidualRatio (float) Ratio of maximum allowed residual area compared to the fit area. (e.g. 0.20 for a maximum residual area of 20% of fit area)
### Check quality of fit (residual area compared to peakArea)
## use the common range of ROI(raw data) and curve fit
#tmpRtMin <- max(rtMin, min(tmpROIData$rt))
#tmpRtMax <- min(rtMax, max(tmpROIData$rt))
#h_res <- (tmpRtMax-tmpRtMin)/(sampling-1)
#grid_rt_res <- seq(from=tmpRtMin, to=tmpRtMax, by=h_res)
## project raw data approx and curve fit on common grid
#raw_approx_fun <- stats::approxfun(x=tmp_EIC$rt, y=tmp_EIC$int)
#raw_proj <- raw_approx_fun(grid_rt_res)
#fit_proj <- predictCurve(fittedCurve, x=grid_rt_res)
## calculate residuals area on section considered
#fit_residuals <- abs(raw_proj - fit_proj)
#dist_residuals <- sum( c(fit_residuals, fit_residuals[2:(sampling-1)]) )/2
#area_residuals <- dist_residuals * h_res
## calculate peak area on section considered
#dist_subfit <- sum( c(fit_proj, fit_proj[2:(sampling-1)]) )/2
#area_subfit <- dist_subfit * h_res
## compare residual area to fit area
#if (area_residuals > (area_subfit * maxResidualRatio)) {
# if (verbose) {message('Fit of ROI #', i,' is unsuccessful (fit residuals is ', round(area_residuals / area_subfit, 2),' of peak area)')}
# # move to next window (empty df row was already initialised)
# next
#}
## Set all values
# curveFit
outCurveFit[[i]] <- fittedCurve
# peaktable
outTable$found[i] <- TRUE
outTable$rt[i] <- rt
outTable$rtMin[i] <- as.numeric(rtMin)
outTable$rtMax[i] <- as.numeric(rtMax)
outTable$mz[i] <- mz
outTable$mzMin[i] <- mzMin
outTable$mzMax[i] <- mzMax
outTable$peakArea[i] <- peakArea
outTable$maxIntMeasured[i] <- maxIntMeasured
outTable$maxIntPredicted[i] <- maxIntPredicted
}
## output
etime <- Sys.time()
if (verbose) {
message('Found ', sum(outTable$found), '/', nROI, ' features in ', round(as.double(difftime(etime,stime)),2),' ',units( difftime(etime,stime)))
}
return(list(peakTable=outTable, curveFit=outCurveFit))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.