R/53_ms2_spectra_matching_scores.R

Defines functions remove_noise match_ms2_fragments calculate_dotproduct calculate_ms2_matching_score

Documented in calculate_dotproduct calculate_ms2_matching_score match_ms2_fragments remove_noise

#' Calculate MS2 Spectra Matching Score
#'
#' This function computes the matching score between experimental and library MS2 spectra using the dot product method. The function allows for fragment matching based on mass-to-charge ratio (m/z) tolerance and intensity comparison, and calculates the score based on forward and reverse dot product and the fraction of matched fragments.
#'
#' @param experimental.spectrum A data frame representing the experimental spectrum, with columns `mz` for mass-to-charge ratios and `intensity` for corresponding intensities.
#' @param library.spectrum A data frame representing the library spectrum, with columns `mz` for mass-to-charge ratios and `intensity` for corresponding intensities.
#' @param ms2.match.ppm Numeric. The mass tolerance in parts per million (ppm) for matching MS2 fragments. Default is 30.
#' @param mz.ppm.thr Numeric. Threshold for m/z to use in ppm calculation. Default is 400.
#' @param method Character. The method to calculate the matching score. Currently, only "dotproduct" is supported. Default is "dotproduct".
#' @param fraction.weight Numeric. The weight of the fraction of matched fragments in the overall score. Default is 0.2.
#' @param dp.forward.weight Numeric. The weight for the forward dot product score in the overall score. Default is 0.7.
#' @param dp.reverse.weight Numeric. The weight for the reverse dot product score in the overall score. Default is 0.1.
#' @param remove_fragment_intensity_cutoff Numeric. A cutoff value to remove low-intensity fragments from both experimental and library spectra. Default is 0.
#'
#' @return A numeric value representing the MS2 matching score, which is a weighted combination of the forward dot product score, reverse dot product score, and the fraction of matched fragments.
#'
#' @details
#' The function normalizes the intensities of both the experimental and library spectra and filters out fragments below the specified intensity cutoff. It then calculates the matching score based on the dot product method. The forward dot product score is calculated using all matched fragments, while the reverse dot product score is computed using only the fragments that have non-zero intensity in the library spectrum.
#'
#' @examples
#' experimental.spectrum <- data.frame(mz = 1:10, intensity = 1:10)
#' library.spectrum <- data.frame(mz = 1:10, intensity = 1:10)
#' get_spectra_match_score(experimental.spectrum, library.spectrum)
#'
#' @author
#' Xiaotao Shen \email{xiaotao.shen@outlook.com}
#'
#' @export

calculate_ms2_matching_score <-
  function(experimental.spectrum,
           library.spectrum,
           ms2.match.ppm = 30,
           mz.ppm.thr = 400,
           method = c("dotproduct"),
           fraction.weight = 0.2,
           dp.forward.weight = 0.7,
           dp.reverse.weight = 0.1,
           remove_fragment_intensity_cutoff = 0) {
    method <- match.arg(method)
    experimental.spectrum <- as.data.frame(experimental.spectrum)
    library.spectrum <- as.data.frame(library.spectrum)
    
    experimental.spectrum$intensity <-
      experimental.spectrum$intensity / max(experimental.spectrum$intensity)
    library.spectrum$intensity <-
      library.spectrum$intensity / max(library.spectrum$intensity)
    
    experimental.spectrum <-
      experimental.spectrum %>%
      dplyr::filter(intensity > remove_fragment_intensity_cutoff)
    
    library.spectrum <-
      library.spectrum %>%
      dplyr::filter(intensity > remove_fragment_intensity_cutoff)
    
    match.matrix <-
      match_ms2_fragments(
        experimental.spectrum = experimental.spectrum,
        library.spectrum = library.spectrum,
        ms2.match.ppm = ms2.match.ppm,
        mz.ppm.thr = mz.ppm.thr
      )
    
    if (method == "dotproduct") {
      fraction <-
        sum(!is.na(match.matrix$Lib.index) &
              !is.na(match.matrix$Exp.index)) / nrow(match.matrix)
      
      dp_forward <- calculate_dotproduct(exp.int = match.matrix$Exp.intensity,
                                         lib.int = match.matrix$Lib.intensity)
      dp_reverse <-
        calculate_dotproduct(exp.int =
                               match.matrix$Exp.intensity[which(match.matrix$Lib.intensity > 0)],
                             lib.int =
                               match.matrix$Lib.intensity[which(match.matrix$Lib.intensity > 0)])
      dp_forward[is.na(dp_forward)] <- 0
      dp_reverse[is.na(dp_reverse)] <- 0
      ms2_match_score <-
        dp_forward * dp.forward.weight + dp_reverse * dp.reverse.weight +
        fraction * fraction.weight
    }
    
    return(ms2_match_score)
  }


#' Calculate Dot Product Score for MS2 Spectra
#'
#' This function calculates the dot product score between experimental and library MS2 spectra intensities. It weights the intensities and computes the similarity score using a modified dot product formula.
#'
#' @param exp.int A numeric vector of intensities from the experimental spectrum.
#' @param lib.int A numeric vector of intensities from the library spectrum.
#'
#' @return A numeric value representing the dot product similarity score between the experimental and library spectra.
#'
#' @details
#' The function applies a custom weighting scheme to the experimental and library intensities, where the weight for each intensity value is computed based on its proportion relative to the total sum of intensities in its respective spectrum. The weighted intensities are then used to calculate the dot product score, which is normalized to give a similarity score between the two spectra.
#'
#' @examples
#' calculate_dotproduct(exp.int = 1:10, lib.int = 1:10)
#'
#' @author
#' Xiaotao Shen \email{xiaotao.shen@outlook.com}
#'
#' @export

calculate_dotproduct <-
  function(exp.int, lib.int) {
    exp.weight <- lapply(exp.int, function(x) {
      1 / (1 + x / (sum(exp.int) - 0.5))
    }) %>%
      unlist()
    
    lib.weight <- lapply(lib.int, function(x) {
      1 / (1 + x / (sum(lib.int) - 0.5))
    }) %>%
      unlist()
    
    x <- exp.weight * exp.int
    y <- lib.weight * lib.int
    return(sum(x * y) ^ 2 / (sum(x ^ 2) * sum(y ^ 2)))
  }


#' Match MS2 Fragments Between Experimental and Library Spectra
#'
#' This function matches MS2 fragments between an experimental spectrum and a library spectrum based on m/z values, using a specified mass tolerance (ppm). It returns a matrix of matched fragments, including their m/z values and intensities.
#'
#' @param experimental.spectrum A data frame representing the experimental spectrum, with columns `mz` for mass-to-charge ratios and `intensity` for corresponding intensities.
#' @param library.spectrum A data frame representing the library spectrum, with columns `mz` for mass-to-charge ratios and `intensity` for corresponding intensities.
#' @param ms2.match.ppm Numeric. The mass tolerance in parts per million (ppm) for matching MS2 fragments. Default is 30.
#' @param mz.ppm.thr Numeric. A threshold for mass-to-charge ratio (m/z) to use in ppm calculation. Default is 400.
#'
#' @return A data frame with matched MS2 fragments, containing the following columns:
#' \describe{
#'   \item{Lib.index}{The index of the fragment in the library spectrum.}
#'   \item{Exp.index}{The index of the matched fragment in the experimental spectrum.}
#'   \item{Lib.mz}{The m/z value of the fragment in the library spectrum.}
#'   \item{Lib.intensity}{The intensity of the fragment in the library spectrum.}
#'   \item{Exp.mz}{The m/z value of the matched fragment in the experimental spectrum.}
#'   \item{Exp.intensity}{The intensity of the matched fragment in the experimental spectrum.}
#' }
#'
#' @details
#' The function first removes noisy fragments from both the experimental and library spectra based on the provided mass tolerance (ppm). It then performs matching by comparing the m/z values of the fragments. If multiple experimental fragments match a library fragment, the one with the highest intensity is selected. Any unmatched fragments are added with NA values for the respective library or experimental fragment.
#'
#'
#' @author
#' Xiaotao Shen \email{xiaotao.shen@outlook.com}
#'
#' @export

match_ms2_fragments <-
  function(experimental.spectrum,
           library.spectrum,
           ms2.match.ppm = 30,
           mz.ppm.thr = 400,
           direction = c("reverse", "forward"),
           remove.noise = TRUE) {
    direction <-
      match.arg(direction)
    ## remove noisy fragments
    if (remove.noise) {
      experimental.spectrum <-
        remove_noise(
          spectrum = experimental.spectrum,
          ms2.match.ppm = ms2.match.ppm,
          mz.ppm.thr = mz.ppm.thr
        )
      library.spectrum <-
        remove_noise(
          spectrum = library.spectrum,
          ms2.match.ppm = ms2.match.ppm,
          mz.ppm.thr = mz.ppm.thr
        )
    }
    
    ## for each fragment in library.spectrum,
    ## its matched fragments index in experimental.spectrum
    if (direction == "reverse") {
      match.idx <-
        lapply(library.spectrum$mz, function(x) {
          diff.mz <- abs(x - experimental.spectrum$mz)
          x[x < mz.ppm.thr] <- mz.ppm.thr
          mz.error <- diff.mz * 10 ^ 6 / x
          temp.idx <- which(mz.error < ms2.match.ppm)
          if (length(temp.idx) == 0) {
            return(NA)
          }
          if (length(temp.idx) > 1) {
            return(temp.idx[which.max(experimental.spectrum$intensity[temp.idx])])
          }
          return(temp.idx)
        })
      
      match.idx <- do.call(rbind, match.idx)
      match.idx <- cbind(seq_len(nrow(match.idx)), match.idx)
      colnames(match.idx) <- c("Lib", "Exp")
      
      non.idx2 <-
        setdiff(c(seq_len(nrow(
          experimental.spectrum
        ))), match.idx[, 2][!is.na(match.idx[, 2])])
      
      if (length(non.idx2) != 0) {
        match.idx2 <- data.frame(NA, non.idx2, stringsAsFactors = FALSE)
        colnames(match.idx2) <- c("Lib", "Exp")
      } else {
        match.idx2 <- NULL
      }
      
      match.matrix <-
        as.data.frame(rbind(match.idx, match.idx2), stringsAsFactors = FALSE)
      
      match.matrix <-
        data.frame(match.matrix, library.spectrum[match.matrix$Lib, c(1, 2)], experimental.spectrum[match.matrix$Exp, c(1, 2)])
      colnames(match.matrix) <-
        c("Lib.index",
          "Exp.index",
          "Lib.mz",
          "Lib.intensity",
          "Exp.mz",
          "Exp.intensity")
    } else{
      match.idx <-
        lapply(experimental.spectrum$mz, function(x) {
          diff.mz <- abs(x - library.spectrum$mz)
          x[x < mz.ppm.thr] <- mz.ppm.thr
          mz.error <- diff.mz * 10 ^ 6 / x
          temp.idx <- which(mz.error < ms2.match.ppm)
          if (length(temp.idx) == 0) {
            return(NA)
          }
          if (length(temp.idx) > 1) {
            return(temp.idx[which.max(library.spectrum$intensity[temp.idx])])
          }
          return(temp.idx)
        })
      
      match.idx <- do.call(rbind, match.idx)
      match.idx <- cbind(seq_len(nrow(match.idx)), match.idx)
      colnames(match.idx) <- c("Exp", "Lib")
      
      non.idx2 <-
        setdiff(c(seq_len(nrow(
          library.spectrum
        ))), match.idx[, 2][!is.na(match.idx[, 2])])
      
      if (length(non.idx2) != 0) {
        match.idx2 <- data.frame(NA, non.idx2, stringsAsFactors = FALSE)
        colnames(match.idx2) <- c("Exp", "Lib")
      } else {
        match.idx2 <- NULL
      }
      
      match.matrix <-
        as.data.frame(rbind(match.idx, match.idx2), stringsAsFactors = FALSE)
      
      match.matrix <-
        data.frame(match.matrix, library.spectrum[match.matrix$Lib, c(1, 2)], experimental.spectrum[match.matrix$Exp, c(1, 2)])
      colnames(match.matrix) <-
        c("Exp.index",
          "Lib.index",
          "Lib.mz",
          "Lib.intensity",
          "Exp.mz",
          "Exp.intensity")
    }
    
    match.matrix$Lib.intensity[is.na(match.matrix$Lib.intensity)] <- 0
    match.matrix$Exp.intensity[is.na(match.matrix$Exp.intensity)] <- 0
    rownames(match.matrix) <- NULL
    match.matrix
  }


#' Remove Noise from MS2 Spectrum
#'
#' This function removes noisy fragments from an MS2 spectrum based on mass-to-charge ratio (m/z) tolerance and intensity comparison, ensuring that only the most relevant peaks are retained.
#'
#' @param spectrum A data frame representing the MS2 spectrum, with columns `mz` for mass-to-charge ratios and `intensity` for corresponding intensities.
#' @param ms2.match.ppm Numeric. The mass tolerance in parts per million (ppm) for determining whether peaks are considered duplicates/noise. Default is 30.
#' @param mz.ppm.thr Numeric. A threshold for m/z values to use in ppm calculation. If the m/z difference is below this value, it is set to this threshold. Default is 400.
#'
#' @return A data frame with the noisy fragments removed. The data frame retains the columns `mz` and `intensity` for the cleaned spectrum.
#'
#' @details
#' The function sorts the spectrum by m/z values and calculates the differences between adjacent fragments. If the difference between any two fragments' m/z values is less than the specified ppm threshold, the fragment with the lower intensity is removed. This helps clean the spectrum by filtering out low-intensity fragments that are close to higher-intensity ones.
#'
#' @examples
#' \dontrun{
#' # Example spectrum
#' spectrum <- data.frame(mz = c(100, 100.001, 150, 200, 250), intensity = c(10, 5, 50, 100, 30))
#'
#' # Remove noise from the spectrum
#' clean_spectrum <- remove_noise(spectrum = spec, ms2.match.ppm = 30, mz.ppm.thr = 400)
#'
#' print(clean_spec)
#' }
#'
#' @author
#' Xiaotao Shen \email{xiaotao.shen@outlook.com}
#'
#' @export

remove_noise <-
  function(spectrum,
           ms2.match.ppm = 30,
           mz.ppm.thr = 400) {
    if (nrow(spectrum) == 1) {
      return(spectrum)
    }
    spectrum <- spectrum[order(spectrum[, 1]), ]
    mz <- spectrum[, 1]
    mz <- mz[-1]
    diff.mz <- diff(spectrum[, 1])
    mz[which(mz < mz.ppm.thr)] <- mz.ppm.thr
    mz.error <- diff.mz * 10 ^ 6 / mz
    temp.idx <- which(mz.error < ms2.match.ppm)
    if (length(temp.idx) > 0) {
      remove.idx <- lapply(temp.idx, function(idx) {
        c(idx, idx + 1)[which.min(spectrum[c(idx, idx + 1), 2])]
      })
      
      remove.idx <- unique(unlist(remove.idx))
      spectrum <- spectrum[-remove.idx, , drop = FALSE]
    } else {
      return(spectrum)
    }
  }
tidymass/metid documentation built on Oct. 8, 2024, 10:32 p.m.