R/DeconvoluteSpectrum.R
In HiResTEC: Non-Targeted Fluxomics on High-Resolution Mass-Spectrometry Data

Documented in DeconvoluteSpectrum

#' @title Deconvolute a Mass Spectrum from a list of raw data files.
#'
#' @description \code{DeconvoluteSpectrum} will evaluate a list of `xcmsRaw` or
#'     `xcmsRawLike` objects at a given time (rt) and potential mass (mz1). The
#'     main purpose is to deconvolute the mass spectrum at rt including mz1.
#'
#' @details The specific advantage of \code{DeconvoluteSpectrum} is, that it does
#'     not deconvolute signals within a single measurement file but uses correlation
#'     tests over a set of measurements to improve statistical power. It will test
#'     all mz around a specified rt to co-apex with some mz1, have a low rt
#'     difference and consistent intensity ratio over all samples.
#'
#' @param dat A list of `xcmsRaw` or `xcmsRawLike` objects.
#' @param rt Retention time of the expected peak.
#' @param rt_dev Allowed retention time deviation.
#' @param mz1 If specified, ensure that this mass is included in the spectrum (assumed base peak). Can be NULL otherwise in which case the most intense peak at rt will be selected as mz1.
#' @param mz_dev Allowed mz deviation [Da].
#' @param use.mz.adjust Will adjust mz on an experiment wide basis.
#' @param ionization Either APCI or ESI. Choice will modify some internal parameters and checks performed.
#' @param smooth Smoothing parameter passed on to \link{getMultipleBPC}.
#'
#' @return A pseudo spectrum at rt (containing mz1 if specified). Effectively
#'     a 2-column matrix (mz, int) with rt as attribute.
#'
#' @examples
#' # The example measurement data provided with HiResTEC contain a peak at 1026s
#' raw <- HiResTEC::raw
#' HiResTEC::DeconvoluteSpectrum(raw, rt = 1026)
#'
#' @export
#'
#' @importFrom graphics par plot points legend
#' @importFrom stats median cor sd

DeconvoluteSpectrum <- function(dat = NULL, rt = NULL, rt_dev = 3, mz1 = NULL, mz_dev = 0.003, use.mz.adjust = FALSE, ionization = c("APCI", "ESI")[1], smooth = 0) {
  # putative parameters
  # rt_dev_max function tries to adjust rt_dev internally; however if this results in extrem values rt_dev_max is applied to limit this
  if (ionization == "APCI") {
    rt_dev_max <- 3 # how far may peaks deviate (between samples)
    rt_dev_min <- 0.5
    allowed_d_rt <- 0.15 # how far can co-apexes be apart from each other (intra sample)
    min_cor_rat <- 0.7 # how well have maxima to be correlated over samples
    nmax <- 1000 # number of peaks to consider in Deconvolution
  }
  if (ionization == "ESI") {
    rt_dev_max <- 6
    # rt_dev_min <- 1
    # allowed_d_rt <- 0.35
    # min_cor_rat <- 0.7
    rt_dev_min <- 2
    allowed_d_rt <- 0.5
    min_cor_rat <- 0.7
    nmax <- 1000
  }

  GetSpectrum <- function(x = NULL, rt = NULL, cutoff = 200, nmax = 150, se = 2, sort_int = FALSE) {
    # this is the scan where target mz1 is at max within all samples
    s <- which.min(abs(x@scantime - rt))
    if (s == length(x@scanindex)) s <- s - se
    if (s < se) s <- s + se + 1
    # these are potential co-eluting masses and their intensities
    m <- x@env$mz[(1 + x@scanindex[s]):x@scanindex[s + 1]]
    i <- x@env$intensity[(1 + x@scanindex[s]):x@scanindex[s + 1]]
    # m, which are considered for the spectrum should be
    # (i) above a cutoff
    flt <- which(i > cutoff)
    i <- i[flt]
    m <- m[flt]
    # # (ii) have there max in close proximity to s, i.e. they should not show higher int s+-se scans up/downstream
    # m_down <- x@env$mz[(1+x@scanindex[s-se]):x@scanindex[s-se+1]]
    # m_up <- x@env$mz[(1+x@scanindex[s+se]):x@scanindex[s+se+1]]
    # i_down <- x@env$intensity[(1+x@scanindex[s-se]):x@scanindex[s-se+1]]
    # i_up <- x@env$intensity[(1+x@scanindex[s+se]):x@scanindex[s+se+1]]
    # flt <- sapply(1:length(m), function(idx) {
    #   i_comp <- c(i_down[abs(m_down-m[idx])<mz_dev],i_up[abs(m_up-m[idx])<mz_dev])
    #   ifelse(length(i_comp)>=1, i_comp<i[idx], FALSE)
    # })
    # i <- i[flt]
    # m <- m[flt]
    # (iii) limited to a managable amount
    flt <- which(rank(-i) <= nmax)
    i <- i[flt]
    m <- m[flt]
    # return mz which fulfill above criteria
    if (sort_int) {
      return(m[order(i, decreasing = TRUE)])
    } else {
      return(m)
    }
  }

  if (is.null(mz1)) {
    # take the median of highest masses found in all provided samples
    mz1 <- median(sapply(dat, function(x) {
      GetSpectrum(x = x, rt = rt, sort_int = TRUE)[1]
    }))
  }

  # determine rt and int of max mz over exp
  if (!is.null(names(dat))) names(dat) <- NULL
  i_mz1 <- sapply(dat, function(x) {
    x <- getMultipleBPC(x = x, mz = mz1, mz_dev = mz_dev, rt = rt, rt_dev = rt_dev, zeroVal = 0, smooth = smooth)
    if (is.null(x)) {
      return(NA)
    } else {
      out <- x[attr(x, "maxBPC"), ]
      names(out) <- names(attr(x, "maxBPC"))
      return(out)
    }
  })

  # remove files which are empty in this region (e.g. no data recorded within time window or for defined mz-window)
  filt_files <- which(i_mz1 == 0 | is.na(i_mz1))
  if (length(filt_files) >= 1) {
    if (length(filt_files) == length(dat)) {
      spec <- matrix(NA, ncol = 2, dimnames = list(NULL, c("mz", "int")))[-1, ]
      attr(spec, "rt") <- rt
      return(spec)
    } else {
      i_mz1 <- i_mz1[-filt_files]
      dat <- dat[-filt_files]
    }
  }

  # readjust rt and rt_dev based on data
  if (!is.finite(median(as.numeric(names(i_mz1))[i_mz1 > 0], na.rm = T))) {
    #browser()
    message("ToDo: The rt should be adjusted.")
  } else {
    rt <- median(as.numeric(names(i_mz1))[i_mz1 > 0], na.rm = T)
  }

  # ==========================================================================================
  # [ToDo] think about readjusting rt and rt_dev based on this result
  # ==========================================================================================
  rt_dev <- ceiling(max(abs(as.numeric(names(i_mz1))[i_mz1 > 0] - rt)))

  # if rt_dev is still high (shouldn't necessary be larger than 2 for well aligned samples) try to fix
  if (rt_dev > rt_dev_max) {
    rt_dev <- rt_dev_max
  }
  if (rt_dev < rt_dev_min) {
    # happens if single samples are processed
    rt_dev <- rt_dev_min
  }

  # reevaluate mz1 with adjusted rt_dev and redo filtering
  i_mz1 <- sapply(dat, function(x) {
    x <- getMultipleBPC(x = x, mz = mz1, mz_dev = mz_dev, rt = rt, rt_dev = rt_dev, zeroVal = 0)
    if (is.null(x)) {
      return(NA)
    } else {
      out <- x[attr(x, "maxBPC"), ]
      names(out) <- names(attr(x, "maxBPC"))
      return(out)
    }
  })

  # remove files which are empty in this region (e.g. no data recorded within time window)
  filt_files <- which(i_mz1 == 0 | is.na(i_mz1))
  if (length(filt_files) >= 1) {
    if (length(filt_files) == length(dat)) {
      spec <- matrix(NA, ncol = 2, dimnames = list(NULL, c("mz", "int")))[-1, ]
      attr(spec, "rt") <- rt
      return(spec)
    } else {
      i_mz1 <- i_mz1[-filt_files]
      dat <- dat[-filt_files]
    }
  }

  # determine masses from spectrum
  bpm <- which.max(i_mz1)

  # ensure that mz1 is still contained
  # browser()
  mz2 <- GetSpectrum(x = dat[[bpm]], rt = as.numeric(names(i_mz1))[bpm], cutoff = min(c(200, 0.1 * max(i_mz1))), nmax = nmax)
  if (!any(abs(mz2 - mz1) < mz_dev)) mz2 <- sort(c(mz1, mz2))
  idx_mz1 <- which.min(abs(mz2 - mz1))

  # determine maxima for these masses from all files
  out <- lapply(dat, function(x) {
    getMultipleBPC(
      x = x, mz = mz2, mz_dev = mz_dev, rt = rt,
      ## rt_dev = if(smooth>0) rt_dev+diff(range(x@scantime))/length(x@scantime)*smooth else rt_dev, # adjust for loss due to smoothing
      rt_dev = rt_dev,
      zeroVal = NULL, smooth = smooth
    )
  })

  # get diffs for rt at max int between candidates (mz2 and base peak mz1)
  d_rt <- round(apply(sapply(1:length(out), function(i) {
    # attr(out[[i]],"rt")[apply(out[[i]],2,which.max)]-as.numeric(names(i_mz1)[i])
    attr(out[[i]], "rt")[apply(out[[i]], 2, which.max)] - attr(out[[i]], "rt")[which.max(out[[i]][, idx_mz1])]
  }), 1, median), 2)

  # get stable ratios and correlation between mz1 and all mz2's if more than 5 samples (as correlation is flawed otherwise)
  if (length(out) >= 5) {
    cor_rat <- round(apply(sapply(1:length(out), function(i) {
      flt <- rank(-out[[i]][, idx_mz1]) <= 10 & out[[i]][, idx_mz1] > 0
      if (sum(flt) >= 6) {
        suppressWarnings(
          cor(out[[i]][flt, , drop = F], out[[i]][flt, idx_mz1], use = "p")
        )
      } else {
        matrix(0, nrow = ncol(out[[i]]), ncol = 1)
      }
    }), 1, median, na.rm = T), 2)
    msg_warn <- NULL
  } else {
    # set cor_rat=1 artificially
    cor_rat <- rep(1, length(d_rt))
    msg_warn <- "Less than 5 samples provided, no correlation testing was performed."
  }

  mz_rat <- round(apply(sapply(1:length(out), function(i) {
    flt <- rank(-out[[i]][, idx_mz1]) <= 10 & out[[i]][, idx_mz1] > 0
    # this fails quite often
    apply(out[[i]][flt, , drop = F] / (out[[i]][flt, idx_mz1]), 2, median)
  }), 1, median), 4)

  # get mz intensity levels by scaling mz2 with median ratios
  mn_int <- round(i_mz1[bpm] * mz_rat)

  # combine these infos into dataframe
  tmp <- cbind(mz2, mn_int, d_rt, cor_rat)
  rownames(tmp) <- 1:nrow(tmp)

  # filter for mz2 belonging to base peak (=mz1) and return pseudo spectrum
  flt <- abs(d_rt) < allowed_d_rt & cor_rat > min_cor_rat
  # browser()
  if (any(flt, na.rm = T)) {
    spec <- matrix(c(mz2[which(flt)], mn_int[which(flt)]), ncol = 2, dimnames = list(NULL, c("mz", "int")))
    spec <- spec[is.finite(spec[, 2]), , drop = FALSE]
    spec <- spec[spec[, 2] > 0, , drop = FALSE]
  } else {
    spec <- matrix(NA, ncol = 2, dimnames = list(NULL, c("mz", "int")))[-1, ]
  }

  attr(spec, "rt") <- rt
  if (!is.null(msg_warn)) attr(spec, "warning") <- msg_warn
  return(spec)
}