Nothing
#' @title Correct Overloaded Peaks from GC-MS data.
#'
#' @description \code{CorrectOverloadedPeaks} will take an xcmsRaw data structure
#' (or any imported mzXML) and search for overloaded peaks within the mass traces.
#' It will correct overloaded peaks automatically using an Gaussian or
#' IsotopicRatio approach, generate QC plots and write the corrected data
#' back into the original xcmsRaw.
#'
#' @details This is a high level function to batch pre-process metabolomics data
#' which are partially overloaded before continuing with the standard workflow
#' of peak identification etc.. It relies internally on \code{\link{FitGaussPeak}}
#' and \code{\link{FitPeakByIsotopicRatio}} to modify data of individual intensity
#' signal. Basically the function aims to identify automatically overloaded regions
#' and extracts base peak chromatograms for all overloaded m/z traces within these
#' regions, which are corrected and put back into the original data structure.
#' For simplicity some potentially interesting parameters are hidden at the top of
#' the function definition. They have been set to values determined empirically to
#' be working for a Bruker impact II MS (high-res QTOF) coupled to GC and LC via
#' APCI and ESI respectively. For more details please
#' see \doi{10.1021/acs.analchem.6b02515}.
#'
#' @references \doi{10.1021/acs.analchem.6b02515}
#'
#' @param data An xcmsRaw-object or an mzXML-object as imported by \code{\link{read.mzXML}}.
#' @param method Either Gauss or EMG (usually better results) or Isoratio (more robust for non-Gaussian peak shapes).
#' @param detection_limit If=1 only peaks hitting detector saturation (ds) will be corrected, can be lowered to 0.95 to catch also peaks going into saturation.
#' @param ds Detector saturation. Will be determined based on data if not specified explicitly.
#' @param silent QC-plots will be generated if silent=FALSE and additional Warnings() will be generated.
#' @param testing Will automatically set silent=FALSE and store all extracted regions with overloaded peaks in the working directory as \code{cor_df_all.RData}.
#' @param attotwm All-the-Time-of-the-World-Mode. If calculation time doesn't matter try this out. :)
#' @param region From an initial QC-Plot file you may reprocess a specific overloaded region. Don't forget to specify the ds parameter explicitly.
#' @param peak You may further restrict the reprocessing to a specific peak within the region.
#'
#' @return
#' An corrected xcmsRaw- or mzXML-object which can be exported to file. Additionally a QC-plot pdf-file if silent=FALSE.
#'
#' @examples
#' \dontrun{
#' # load mzXML test data
#' data(mzXML_data)
#' CorrectOverloadedPeaks(data = mzXML_data, method = "EMG", silent = FALSE)
#' }
#'
#' @seealso \code{\link{ModelGaussPeak}}
#' @seealso \code{\link{FitGaussPeak}}
#' @seealso \code{\link{FitPeakByIsotopicRatio}}
#' @seealso \code{\link{read.mzXML}}
#' @seealso \code{\link{write.mzXML}}
#'
#' @export
#'
#' @importFrom grDevices dev.off
#' @importFrom grDevices pdf
#' @importFrom graphics legend
#' @importFrom graphics par
#' @importFrom stats median
#'
CorrectOverloadedPeaks <- function(
data = NULL,
method = c("Isoratio", "Gauss", "EMG"),
detection_limit = 1,
ds = NULL,
silent = TRUE,
testing = FALSE,
attotwm = FALSE,
region = NULL,
peak = NULL)
{
# POTENTIAL PARAMETERS that could be allowed for the user to modify
# mz_gap_allowed : what is the MS resolution? Mass accuracy in case of overloading? And, more important, expected mass error in case of overloading (0.13==130mDa)
mz_gap_allowed <- 0.015
# allowed_scans_missing : get number and position of overloaded regions as well as overloaded masses
allowed_scans_missing <- 4
# mz_dev : to test for mass stability [in ppm]
mz_dev <- 15
# corr_scan_offset : what is the upstream/downstream offset used for mass/int correction
# depending on the scan rate and the separation quality/slope of peak front/tail
corr_scan_offset <- c(-10, 10)
# ds_cutoff : lower level for iso-ratio determination; upper level for non-linearity/correction-assumption
ds_cutoff <- c(0.005, 0.9)
# cor_fac_front : for 'isoratio': use only front (TRUE) or tail (FALSE) information to correct data
cor_fac_front <- TRUE
# limit_of_correction [time unit of sample]: peaks which are overloaded for more than this should not attempted to be corrected
limit_of_correction <- 8
# allowed_front_tail_ratio : what is the maximum ratio allowed in min(front)/min(tail); if aftr=10 than fronting is assumed for 10 fold difference; set to Inf to avaoid peak shape testing
aftr <- 2.5
# try to correct for ion suppression
correct_ion_suppression <- FALSE
# in Gauss method cut off baseline to improve fitting
account_for_baseline_offset <- TRUE
if (testing) silent <- FALSE
method <- match.arg(method)
# internal functions
DetermineDetectorSaturation <- function(ds, pks, inf) {
if (is.null(ds)) {
ds <- max(sapply(pks, function(x) {
max(x[, 2])
}))
c1 <- sum(sapply(pks, function(x) {
sum(x[, 2] == ds)
})) >= 3
c2 <- sum(table(unlist(sapply(pks, function(x) {
x[x[, 2] > (ds / 2), 2]
}))) >= sum(sapply(pks, function(x) {
sum(x[, 2] == ds)
})) / 2) == 1
c3 <- all(sapply(pks, function(x) {
max(x[, 2]) == ds
}) == (inf[, "basePeakIntensity"] == ds))
overloaded_peaks_present <- c1 && c2 && c3
} else {
overloaded_peaks_present <- any(sapply(pks, function(x) {
any(x[, 2] >= ds)
}))
}
attr(ds, "opp") <- overloaded_peaks_present
return(ds)
}
GetOverloadedRegions <- function(inf, pks, lim, allowed_scans_missing, mz_gap_allowed) {
tmp <- GroupByGaps(x = inf[which(inf[, "basePeakIntensity"] >= lim), "retentionTime"], gap = allowed_scans_missing * median(diff(inf[, "retentionTime"])))
tmp <- split(inf[which(inf[, "basePeakIntensity"] >= lim), c("seqNum", "retentionTime", "basePeakMZ")], tmp) # ; length(ovreg)
mzaff_all <- lapply(1:length(tmp), function(i) {
sapply(tmp[[i]][, "seqNum"], function(j) {
pks[[j]][which(pks[[j]][, 2] >= lim), 1]
})
})
mzovld <- lapply(mzaff_all, function(x) {
sapply(split(unlist(x), GroupByGaps(unlist(x), mz_gap_allowed)), median)
})
for (i in 1:length(mzovld)) attr(mzovld[[i]], "seqNum") <- range(tmp[[i]][, "seqNum"])
return(mzovld)
}
ExtractRawData <- function(mz, scans, pks, lim, mz_gap_allowed) {
# extract intensity of M+0 and M+x until the first isotope no exceeding ds in wide window
idx_iso <- 1
cor_df <- data.frame(
inf[scans, c("seqNum", "retentionTime")],
t(sapply(scans, function(j) {
pks[[j]][which.min(abs(mz - pks[[j]][, 1])), , drop = F]
})),
t(sapply(scans, function(j) {
pks[[j]][which.min(abs(mz + idx_iso * 1.0033 - pks[[j]][, 1])), , drop = F]
}))
)
colnames(cor_df) <- c("Scan", "RT", "mz0", "int0", "mz1", "int1")
check_iso_int <- any(cor_df[, "int1"] >= detection_limit * ds)
while (check_iso_int) {
idx_iso <- idx_iso + 1
cor_df <- cbind(cor_df, data.frame(t(sapply(scans, function(j) {
pks[[j]][which.min(abs(mz + idx_iso * 1.0033 - pks[[j]][, 1])), , drop = F]
}))))
colnames(cor_df)[ncol(cor_df) - c(1, 0)] <- paste0(c("mz", "int"), idx_iso)
check_iso_int <- any(cor_df[, paste0("int", idx_iso)] >= detection_limit * ds)
}
cor_df <- cbind(cor_df, "modified" = FALSE)
# restrict to overloaded area for this specific M0
idx <- range(which(cor_df[, "int0"] >= (ds_cutoff[2] * detection_limit * ds)))
if ((idx[1] + corr_scan_offset[1]) >= 1) {
idx[1] <- idx[1] + corr_scan_offset[1]
} else {
idx[1] <- 1
}
if ((idx[2] + corr_scan_offset[2]) <= nrow(cor_df)) {
idx[2] <- idx[2] + corr_scan_offset[2]
} else {
idx[2] <- nrow(cor_df)
}
idx <- seq(idx[1], idx[2])
cor_df <- cor_df[idx, ]
# test if this area contains two peaks (e.g. isoforms) of same mz and limit to larger overloaded and give a warning
# tmp <- cor_df[,"int0"]>(ds_cutoff[2]*detection_limit*ds)
# diff(cumsum(tmp))
# remove scans with strong mz deviation for M0
f <- abs(cor_df[, "mz0"] - mz) < mz_gap_allowed
if (any(!f)) cor_df <- cor_df[cumsum(f) > 0, ]
# limit to interesting area
idx <- range(which(cor_df[, "int0"] > lim))
cor_df <- cor_df[seq(idx[1], idx[2]), ]
return(cor_df)
}
PutDataBack <- function(pks, cor_df) {
if (any(cor_df[, "modified"])) {
for (j in cor_df$Scan[cor_df[, "modified"]]) {
idx <- which(cor_df$Scan == j)
pks[[j]][which.min(abs(pks[[j]][, 1] - cor_df[idx, "mz0"])), 2] <- cor_df[idx, "int0"]
pks[[j]][which.min(abs(pks[[j]][, 1] - cor_df[idx, "mz1"])), 2] <- cor_df[idx, "int1"]
pks[[j]][which.min(abs(pks[[j]][, 1] - cor_df[idx, "mz2"])), 2] <- cor_df[idx, "int2"]
}
}
return(pks)
}
# extract peak and scan infos
if (inherits(data, "xcmsRaw")) {
scannum <- as.integer(diff(c(data@scanindex, length(data@env$mz))))
pks <- sapply(split(matrix(c(data@env$mz, data@env$intensity), ncol = 2), factor(rep(1:length(scannum), times = scannum))), matrix, ncol = 2)
# $$JL: fixed in V1.2.17$$
# inf <- data.frame("seqNum"=data@acquisitionNum,"retentionTime"=data@scantime,"basePeakMZ"=sapply(pks,function(x){x[which.max(x[,2])[1],1]}),"basePeakIntensity"=sapply(pks,function(x){max(x[,2])}))
inf <- data.frame("seqNum" = 1:length(data@scanindex), "retentionTime" = data@scantime, "basePeakMZ" = sapply(pks, function(x) {
x[which.max(x[, 2])[1], 1]
}), "basePeakIntensity" = sapply(pks, function(x) {
max(x[, 2])
}))
}
if (inherits(data, "mzXML")) {
pks <- lapply(data[["scan"]], function(x) {
as.matrix(data.frame("mz" = as.numeric(x$mass), "intensity" = as.numeric(x$peaks)))
})
tmp.rt <- sapply(data[["scan"]], function(x) {
x <- strsplit(x$scanAttr, " ")[[1]]
x[grep("retentionTime", x)]
})
tmp.rt <- as.numeric(gsub("S\"", "", gsub("retentionTime=\"PT", "", tmp.rt)))
inf <- data.frame("seqNum" = 1:length(pks), "retentionTime" = tmp.rt, "basePeakMZ" = sapply(pks, function(x) {
x[which.max(x[, 2])[1], 1]
}), "basePeakIntensity" = sapply(pks, function(x) {
max(x[, 2])
}))
}
# if an attribute 'file_in' is attached to 'data' it will be used for warnings and plot names
if (!is.null(attr(data, "file_in"))) file_in <- attr(data, "file_in") else file_in <- ""
# determine value of detector saturation (if not specified explicitly) and presence of overloaded peaks
ds <- DetermineDetectorSaturation(ds = ds, pks = pks, inf = inf)
if (!silent) cat(paste("\nProcessing...", file_in, "\n"))
# apply correction if overloaded peaks are present
if (!attr(ds, "opp")) {
if (!silent) print("Found no overloaded peaks in sample. Return data as is.")
} else {
# file name for plot output
if (!silent) grDevices::pdf(ifelse(file_in == "", paste0(format(Sys.time(), "%Y-%m-%d_%H%M%S"), "_CorrectOverloadedPeaks_ControlPlot.pdf"), paste0(file_in, ".pdf")))
# search rt-regions and masses above detector saturation
mzovld <- GetOverloadedRegions(inf = inf, pks = pks, lim = detection_limit * ds, allowed_scans_missing = allowed_scans_missing, mz_gap_allowed = mz_gap_allowed)
if (!silent) cat(paste("\nTrying to correct", length(mzovld), "overloaded regions.\n"))
# write all extracted raw data into list for individual file for testing purposes
if (testing) {
cor_df_all <- lapply(1:length(mzovld), function(i) {
vector("list", length = length(mzovld[[i]]))
})
}
# correct overloaded intensity data in 'i' regions by extrapolating based on isotopic ratio or gaussian approximation
for (i in 1:length(mzovld)) {
tmp_rng <- attr(mzovld[[i]], "seqNum")
if ((tmp_rng[1] + corr_scan_offset[1]) >= 1) tmp_rng[1] <- tmp_rng[1] + corr_scan_offset[1]
if ((tmp_rng[2] + corr_scan_offset[2]) <= nrow(inf)) tmp_rng[2] <- tmp_rng[2] + corr_scan_offset[2]
scans <- seq(min(tmp_rng), max(tmp_rng))
# and all M0 within these regions
for (k in 1:length(mzovld[[i]])) {
# extract data/put M0-M2 infos together
cor_df <- ExtractRawData(mz = mzovld[[i]][k], scans = scans, pks = pks, lim = ds_cutoff[1] * ds, mz_gap_allowed = mz_gap_allowed)
# keep next line for testing
if (testing) cor_df_all[[i]][[k]] <- cor_df
# error checks/warnings
# [$$] is currently not used (to many warnings)
if (!silent & FALSE) {
# test for mass stability
if (!all(apply(cor_df[, c("mz0", "mz1", "mz2")], 2, function(x) {
diff(range(x)) < mz_dev * median(x) / 10^6
}))) {
warning(paste(basename(file_in), "region", i, "m/z", k, "Strong mass diff"))
}
# test if M1 and M2 are overloaded
if (all(apply(cor_df[, c("int1", "int2")], 2, function(x) {
max(x) > ds_cutoff[2] * ds
}))) {
warning(paste(basename(file_in), "region", i, "m/z", k, "Both isotopes approach detector saturation"))
}
# test if M1 and M2 are flatened
if (all(apply(cor_df[, c("int1", "int2")], 2, is.FlatTopPeak))) {
warning(paste(basename(file_in), "region", i, "m/z", k, "Both isotopes are flat-toped (while below detector saturation)"))
}
}
# get the scans which need to be corrected, this is usually determined by 'ds_cutoff' and 'detection_limit'
idx <- range(which(cor_df[, "int0"] >= (ds_cutoff[2] * detection_limit * ds)))
idx <- seq(idx[1], idx[2])
# however, in regions with several overloaded signals we observe ion suppression for 'smaller' peaks which could be isotopes, fragments
# but also M+H in case that a fragment is the base peak. Therefore, we may want to modify the cut_off parameter in FitGaussPeak to not include
if (correct_ion_suppression) {
# data points already affected by ion suppression. We can test that by:
test_ionsupp <- attr(mzovld[[i]], "seqNum") - range(cor_df[, "Scan"])
test_ionsupp <- test_ionsupp[1] < 4 | test_ionsupp[2] > -4
# test_ionsupp <- test_ionsupp[1]<4 & test_ionsupp[2]>-4
if (test_ionsupp) {
# and adjust idx accordingly
cutoff_modifier <- 0.4
idx <- range(which(cor_df[, "int0"] >= (cutoff_modifier * ds_cutoff[2] * detection_limit * ds)))
idx <- seq(idx[1], idx[2])
}
} else {
test_ionsupp <- FALSE
}
# check for various potential errors and drawbacks
err_code <- 0
# set a limit to meaningfull correction
if (diff(range(cor_df[idx, "RT"])) >= limit_of_correction) err_code <- 1
# check if front/tail is missing (may happen, if import window was to narrow)
if (min(idx) == 1) err_code <- 2
if (max(idx) == nrow(cor_df)) err_code <- 3
# temporary error code for testing
# specifying i and k as identified from a QC-plot file here allows to interrupt at this specific peak
if (!is.null(region)) {
err_code <- ifelse(i != region, 9, err_code)
}
if (!is.null(peak)) {
err_code <- ifelse(k != peak, 9, err_code)
}
# print some error messages
if (err_code > 0) {
tmp.rt <- round(mean(cor_df[idx, "RT"]), 2)
tmp.mz <- round(mean(cor_df[idx, "mz0"]), 4)
if (err_code == 1) print(paste("Did not correct a peak with overloading wider than", limit_of_correction, "at RT =", tmp.rt, "for mz =", tmp.mz))
if (err_code == 2) print(paste("Did not correct a peak without detectable front at RT =", tmp.rt, "for mz =", tmp.mz))
if (err_code == 3) print(paste("Did not correct a peak without detectable tail at RT =", tmp.rt, "for mz =", tmp.mz))
if (err_code == 9) print("Skip peak because 'region' and/or 'peak' are specified.")
} else {
# browser()
if (!silent) print(paste("Processing Region/Mass:", paste(i, "/", k)))
# calculate IsotopicRatioFit as basis or final, correct only M0 intensity because M+1 will be corrected independently if overloaded
cor_df_mod <- cor_df
cor_df_mod[, "int0"] <- FitPeakByIsotopicRatio(cor_df, idx = idx, silent = silent)[, "int0"]
# calculate gauss fit aditionally if specified
if (method == "Gauss" | method == "EMG") {
front_scans <- 1:ifelse(min(idx) <= 1, 1, min(idx) - 1)
tail_scans <- ifelse(max(idx) < nrow(cor_df), max(idx) + 1, nrow(cor_df)):nrow(cor_df)
# is peak tail or front skewed??
# assumption: intensity should start and fall back to baseline of similar vlaue, if not probably fronting/tailing occur
test_peak_shape <- min(cor_df[front_scans, "int0"]) / min(cor_df[tail_scans, "int0"])
gauss_scale <- c(0.8, length(idx) * 2)
gauss_steps <- length(idx) * 10
gauss_main <- paste("mz =", round(median(cor_df[, "mz0"]), 4), paste0("(Region ", i, ", Peak ", k, ")"))
# fronting
if (test_peak_shape > aftr) {
gauss_fit <- FitGaussPeak(x = cor_df[, "RT"], y = cor_df[, "int0"], scale_range = gauss_scale, steps = gauss_steps, idx = idx, strip_data = "front", account_for_baseline_offset = account_for_baseline_offset, method = method, silent = silent, xlab = "RT", ylab = "Int", main = gauss_main)
if (!silent) graphics::legend(x = "bottomleft", "fronting detected", bty = "n", text.col = 2)
}
# tailing
if (test_peak_shape < (1 / aftr)) {
gauss_fit <- FitGaussPeak(x = cor_df[, "RT"], y = cor_df[, "int0"], scale_range = gauss_scale, steps = gauss_steps, idx = idx, strip_data = "tail", account_for_baseline_offset = account_for_baseline_offset, method = method, silent = silent, xlab = "RT", ylab = "Int", main = gauss_main)
if (!silent) graphics::legend(x = "bottomright", "tailing detected", bty = "n", text.col = 2)
}
# more or less symmetric peak
if (test_peak_shape >= (1 / aftr) & test_peak_shape <= aftr) {
gauss_fit <- FitGaussPeak(x = cor_df[, "RT"], y = cor_df[, "int0"], scale_range = gauss_scale, steps = gauss_steps, idx = idx, method = method, account_for_baseline_offset = account_for_baseline_offset, silent = silent, xlab = "RT", ylab = "Int", main = gauss_main)
}
if (!silent & test_ionsupp) graphics::legend(x = "bottom", "ion suppression detected", bty = "n", text.col = 2)
# if isoratio-fit is not higher than gauss solution (should not happen) then substitute and keep iso-solution elsewise as a fallback
if (max(cor_df_mod[, "int0"]) < max(gauss_fit)) cor_df_mod[, "int0"] <- gauss_fit
}
if (attotwm) {
# re-initialize index
idx <- range(which(cor_df[, "int0"] >= (ds_cutoff[2] * detection_limit * ds)))
idx <- seq(idx[1], idx[2])
# generate an overview for results of various parameter combinations
attotwm_m <- rep(c("Gauss", "EMG"), each = 6)
attotwm_c <- rep(c(0.4, 0.9, 0.4, 0.9), each = 3)
attotwm_s <- rep(c("front", "none", "tail"), times = 4)
attotwm_r <- NULL
graphics::par(mfrow = c(4, 3))
graphics::par(cex = 0.4)
for (attotwm_i in 1:12) {
attotwm_r <- c(attotwm_r, max(FitGaussPeak(
x = cor_df[, "RT"], y = cor_df[, "int0"], scale_range = gauss_scale, steps = gauss_steps, account_for_baseline_offset = account_for_baseline_offset, silent = silent, xlab = "RT", ylab = "Int",
cutoff = attotwm_c[attotwm_i],
strip_data = attotwm_s[attotwm_i],
method = attotwm_m[attotwm_i],
main = paste(attotwm_m[attotwm_i], attotwm_c[attotwm_i], attotwm_s[attotwm_i], sep = ", ")
)))
}
graphics::par(mfrow = c(1, 1))
graphics::par(cex = 1)
gauss_fit <- FitGaussPeak(
x = cor_df[, "RT"], y = cor_df[, "int0"], scale_range = gauss_scale, steps = gauss_steps, account_for_baseline_offset = account_for_baseline_offset, silent = silent, xlab = "RT", ylab = "Int",
cutoff = attotwm_c[which.max(attotwm_r)],
strip_data = attotwm_s[which.max(attotwm_r)],
method = attotwm_m[which.max(attotwm_r)],
main = paste(attotwm_m[which.max(attotwm_r)], attotwm_c[which.max(attotwm_r)], attotwm_s[which.max(attotwm_r)], sep = ", ")
)
if (max(cor_df_mod[, "int0"]) < max(gauss_fit)) cor_df_mod[, "int0"] <- gauss_fit
}
cor_df_mod[idx, "modified"] <- TRUE
# put corrected data back into temporary peak list
pks <- PutDataBack(pks = pks, cor_df = cor_df_mod)
}
}
}
# close plotting device
if (!silent) grDevices::dev.off()
# assign to base
if (testing) {
print("Storing non-corrected data information in 'cor_df_all.RData'")
save(cor_df_all, file = "cor_df_all.RData")
}
# put corrected data back into file
if (inherits(data, "xcmsRaw")) {
# data@env$mz <- unlist(sapply(pks,function(x){x[,1]}))
data@env$intensity <- unlist(sapply(pks, function(x) {
x[, 2]
}))
}
if (inherits(data, "mzXML")) {
# for (i in 1:length(pks)) data[["scan"]][[i]][["mass"]] <- pks[[i]][,1]
# print(table(sapply(1:length(pks), function(i) {all(data[["scan"]][[i]][["peaks"]]==round(pks[[i]][,2]))})))
for (i in 1:length(pks)) data[["scan"]][[i]][["peaks"]] <- round(pks[[i]][, 2])
# print(table(sapply(1:length(pks), function(i) {all(data[["scan"]][[i]][["peaks"]]==round(pks[[i]][,2]))})))
}
}
invisible(data)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.