# modelPEAKS -------------------------------------------------------------------------------------------------------
#' @title Function models intensity integration values for missing peaks.
#'
#' @description Function models intensity integration values for each of the missing peaks and for each of the samples in which it was missed.
#' Modelling relies on a smooth spline fit to the values of the corresponding detected peaks.
#'
#' @param p \code{numeric} indicating peak IDs.
#' @param vars \code{character} with centWave values to be modelled, e.g. c("mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax").
#' @param object \code{massFlowTemplate} class object.
#'
#' @return Function returns a list of data.frames - a data.frame for each peak supplied with the argument p.
#'
modelPEAKS <- function(p, vars, object) {
peak <- object@peaks[[match(p, names(object@peaks))]]
pred_vals <- lapply(vars,
FUN = smoothVALUE,
x = 1:nrow(peak),
peak = peak
)
peak_vals <- setNames(as.data.frame(do.call("cbind", pred_vals)),
nm = vars
)
return(peak_vals)
}
# smoothVALUE -------------------------------------------------------------------------------------------------------
#' @title Helper function to model intensity integration values for missing peaks.
#'
#' @description Helper function is used by the \code{\link{modelPEAKS}} function to model intensity integration values for missing peaks.
#' Function fit a smoothing spline to centWave values reported for the peak-of-interest in all samples in the study.
#' For samples in which peak was not detected, centWave values are predicted.
#'
#' @param var \code{character} with a single centWave value to be modelled, e.g. "mz"
#' @param x \code{numeric} representing indeces of peaks for the supplied 'peak' data.frame.
#' @param peak \code{data.frame} with all centWave values (columns) for the peak-of-interest for all samples (rows). For samples in which peak as not detected, NAs must be listed.
#' @param lambda \code{numeric} representing smoothing parameter lambda. Set to 10 by default, which corresponds to number of study samoples in between two quality control samples in standard experimental protocols.
#'
#' @return Function returns centWave values for the peak-of-interest, which includes predicted values for samples in which it was not originally detected.
#'
smoothVALUE <- function(var, x, peak, lambda = 10) {
y <- peak[, match(var, names(peak))]
present <- which(!is.na(y))
if (length(x[present]) < 4 ) {
## spline will fail with an error message otherwise
return(y)
}
mod <- stats::smooth.spline(
x = x[present],
y = y[present],
lambda = lambda,
cv = TRUE
)
predicted <- stats::predict(mod, x = x[-present])$y
y[is.na(y)] <- predicted
return(y)
}
# extractMISS -------------------------------------------------------------------------------------------------------
#' @title Extract samples with missed peaks, for which intensity have to be re-integrated.
#'
#' @description Function builds a list with mz and rt regions for intensity re-integration.
#' This is a data-reduction step, helping to reduce the size of the variables being imputed to \code{\link{fillSAMPLE}} function for intensity re-integration.
#'
#' @param s \code{numeric} corresponding to sample number in the study.
#' @param values \code{list} with centWave values for each sample.
#' @param peaks_mod \code{list} with real and modelled centWave values for each peak.
#' @param valid \code{data.frame} with validated aligned peaks.
#'
#' @return Function returns a list with a data.frame for each sample in the study.
#' Each data.frame contains predicted mz and rt regions only for those peaks that were missed in the corresponding sample.
#'
extractMISS <- function(s, values, peaks_mod, valid) {
## for each missing peak, take the modelled mz/rt values and return in a new df
sample <- values[[s]]
peaks_miss <- which(is.na(sample$mz))
if (length(peaks_miss) == 0) {
return(data.frame())
}
peaks_mod_miss <- lapply(peaks_mod[peaks_miss], function(p) {
p[s, ]
})
peaks_mod_miss <- do.call("rbindCLEAN", peaks_mod_miss)
peaks_mod_miss$peakid <- valid$peakid[peaks_miss]
return(peaks_mod_miss)
}
# fillSAMPLE -------------------------------------------------------------------------------------------------
#' @title Fill intensity values for peaks missed in the sample-of-interest.
#'
#' @param s \code{numeric} corresponding to the number of the sample-of-interest in the study.
#' @param sname \code{character} with full path to the raw LC-MS file for the sample-of-interest.
#' @param sdata \code{data.frame} with all centWave values for the sample-of-interest.
#' @param values \code{data.frame} with predicted mz and rt regions only for those peaks that were missed in the corresponding sample.
#'
#' @return Function returns updated \code{data.frame} with the centWave values for the sample-of-interest.
#'
fillSAMPLE <- function(s, sname, sdata, values) {
if (validFILE(sname) == TRUE) {
## if missing peaks are present for this sample
if (nrow(values) > 0) {
raw <- readDATA(sname)
## extract intensity of scans corresponding to mz&rt regions for every peak
peaks_int <- extractINT(raw,
mz = values[, c("mzmin", "mzmax")],
rt = values[, c("rtmin", "rtmax")]
)
## get into and maxo values for each peak
values_filled <- as.data.frame(setNames(replicate(n = ncol(values) + 2, numeric(), simplify = FALSE), nm = c(names(values), "maxo", "into")))
for (p in 1:length(peaks_int)) {
peak <- peaks_int[[p]]
values_filled[p, names(values)] <- values[p, ]
## if scans were found for this peak
if (nrow(peak) > 0) {
## get rt width
rts <- unique(peak$rt)
rt_range <- range(peak$rt)
rt_width <- (rt_range[2] - rt_range[1]) / length(rts)
## get max int for each scan
rt_max <- sapply(rts, function(rt) {
max(peak[peak$rt == rt, "int"])
})
maxo <- max(rt_max)
## get into integration
mz_max <- sapply(unique(peak$mz), function(mz) {
max(peak[peak$mz == mz, "int"])
})
into <- sum(mz_max) * rt_width
values_filled[p, c("maxo", "into")] <- c(maxo, into)
} else {
## return default values - 0s
values_filled[p, c("maxo", "into")] <- 0
}
}
## bind with centWave results
## retain original peakid order in sdata
sdata[match(values_filled$peakid, sdata$peakid), colnames(values_filled)] <- values_filled
}
## cleanup garbage
gc(verbose = FALSE)
return(sdata)
} else {
return(list(nsample = s, status = "FAILED", error = validFILE(sname)))
}
}
# extractINT --------------------------------------------------------------
#' @title Extract raw signal values for defined mz and rt region.
#'
#' @description Helper function used by \code{\link{fillSAMPLE}} function to extract spectrum intensity values for a defined mz and rt window.
#'
#' @param raw \code{OnDiskMSnExp} class object.
#' @param rt \code{data.frame} with rtmin and rtmax values (columns) for peaks (rows) that have to be re-integrated in the sample-of-interest.
#' @param mz \code{data.frame} with mzmin and mzmax values (columns) for peaks (rows) that have to be re-integrated in the sample-of-interest.
#'
#' @return Function returns a \code{list} of data.frames with extracted raw LC-MS signal values.
#'
extractINT <- function(raw, rt, mz) {
## which scans are required to be filled
raw_rt <- MSnbase::rtime(raw)
scans_to_fill <- sort(unique(unlist(apply(rt, 1, function(r) {
which(raw_rt >= r[1] &
raw_rt <= r[2])
}))))
# if no scans
if (length(scans_to_fill) == 0) {
return(data.frame())
}
## extract spectra and rt values for each scan
raw_rt_to_fill <- raw[scans_to_fill]
raw_spec <- MSnbase::spectra(raw_rt_to_fill)
raw_spec_rt <- MSnbase::rtime(raw_rt_to_fill)
## filter by mz for each peak in the list
peaks_int <- vector("list", nrow(rt))
for (p in 1:nrow(rt)) {
## extract scans for the single peak
peak_scans <- which(raw_spec_rt >= rt[p, 1] &
raw_spec_rt <= rt[p, 2])
if (length(peak_scans) == 0) {
peak_int <- data.frame()
} else {
## extract corresponding spectra values
peak_scans <- raw_spec[peak_scans]
peak_mz <- mz[p, ]
peak_int <- lapply(peak_scans, function(scan) {
suppressWarnings(
scan_mz <- MSnbase::filterMz(scan, peak_mz)
)
if (!scan_mz@peaksCount) {
NULL
} else {
data.frame(rt = scan_mz@rt, mz = scan_mz@mz, int = scan_mz@intensity)
}
})
peak_int <- Filter(length, peak_int)
if (length(peak_int) > 0) {
peak_int <- do.call("rbindCLEAN", peak_int)
} else {
peak_int <- data.frame()
}
}
peaks_int[[p]] <- peak_int
}
return(peaks_int)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.