R/assign_formulas.R

Defines functions assign_formulas

Documented in assign_formulas

#' @title Molecular Formula Assignment
#' @description
#' Assigns molecular formulas to molecular masses using a predefined library.
#' Input of the peaklist (`pl`) is internally checked [as_peaklist()],
#' converted to neutral masses [calc_neutral_mass()], and assigned with
#' molecular formulas based on the mass accuracy (`ma_dev`) provided [calc_ma_abs()].
#' The input can be either:
#' \itemize{
#'   \item A peaklist (`data.table`) containing m/z values or neutral masses
#'         and additional metadata
#'         .
#'   \item A numeric vector of m/z values or neutral masses without additional metadata
#'         (internally checked and standardized by [as_peaklist()]).
#' }
#'
#' @param pl Either a peaklist (`data.table`) with at least columns `mz`, `i_magnitude`,
#'        and `file_id`, or a numeric vector of masses. For numeric input, a minimal
#'        peaklist is constructed internally.
# @param ma_dev Mass accuracy in +/- parts per million (ppm)
# @param pol Polarity of the molecular ion ("neg", "pos", "neutral").
#' @inheritParams main_docu
#' @inheritDotParams calc_ma_abs
#' @inheritDotParams calc_neutral_mass
# @param pol Passed to [calc_neutral_mass()]
# @param ma_dev Passed to [calc_ma_abs()]
# @param memory_efficient (FALSE / TRUE) Splits large peaklists into chunks and
# calculates formulas in batches to be more memory efficient.
# @param chunk_n Number of peaks that are processed in one chunk
# when memory_efficient is st to TRUE (DEFAULT 5e5)
# This is slightly slower compared to formula assignment of the entire peaklist.
#' @import data.table
#' @details This function calculates the neutral mass of peaks in `pl` and
#' compares it to mass values in `formula_library`, assigning molecular formulas
#' based on mass accuracy thresholds. If 13C, 15N, or 34S isotope information
#' is missing, additional columns are added to the output table.
#' @return
#' A `data.table` where each row represents a molecular formula assigned to a
#' mass peak. The table contains:
#'
#' \itemize{
#'   \item All columns of the input peaklist `pl` (e.g. `mz`, `i_magnitude`, `file_id`).
#'   \item All columns of the input `formula_library` (e.g. `mf`, element counts).
#'   \item Calculated columns:
#'     \itemize{
#'       \item `m` — neutral mass.
#'       \item `m_cal` — exact mass of the assigned formula.
#'       \item `del` — absolute mass error (Da).
#'       \item `ppm` — mass error in parts per million.
#'       \item `mf_id` — unique ID for each (file_id, mf) combination.
#'     }
#'   \item Added isotope columns (`13C`, `15N`, `34S`) if missing in the library.
#' }
#'
#' One peak may receive zero, one, or multiple assigned formulas depending on the
#' mass accuracy threshold.
#' @author Boris P. Koch
#'
#' @examples
#' # Example using demo data
#' assign_formulas(pl = peaklist_demo,
#'                 formula_library = ume::lib_demo,
#'                 pol = "neg",
#'                 ma_dev = 0.2,
#'                 verbose = FALSE)
#' @export

assign_formulas <- function(pl,
                            formula_library,
                            verbose = FALSE,
                            #memory_efficient = FALSE,
                            #chunk_n = 5e5,
                            ...) {

  if (!is_ume_peaklist(pl)) {
    pl <- as_peaklist(pl)
  }

# Capture additional arguments from ellipsis
  inargs <- list(...)

  '13C'  <- '15N' <- '34S' <- NULL

# If the formula library is simply a character vector of molecular formulas
  if(is.vector(formula_library) & is.character(formula_library)){
    formula_library <- create_custom_formula_library(formula_library)
  }

# Make sure that library has no missing values and that "vkey" exists
# to do: only do this if a library different from ume.formulas is used
  formula_library <- check_formula_library(formula_library)

# Ensure required arguments are provided
  required_args <- c("pol", "ma_dev")
  missing_args <- setdiff(required_args, names(inargs))

  if (length(missing_args) > 0) {
    stop("Missing required argument(s): ", paste(missing_args, collapse = ", "))
  }

# Check peaklist, calculate neutral mass and mass accuracy thresholds
  pl[, m := calc_neutral_mass(mz = mz, ...)]

  # m_val <- calc_neutral_mass(mz = pl$mz, ...)
  # set(pl, j = "m", value = m_val)

  pl[, c("m_min", "m_max") := calc_ma_abs(m = m, ...)]

  .msg("Column names of peaklist:\n %s", paste0(names(pl), collapse = ", "))

  .msg("Assigning molecular formulas to masses using:
       Mass accuracy threshold: +/- %0.1f ppm
       Charge state: %s \n",
       inargs[["ma_dev"]], inargs[["pol"]])

# Check if the range of masses in pl are within the range of mass values in the library
  if (pl[, min(m_min)] < formula_library[, min(mass)-1] | pl[, max(m_max)] > formula_library[, max(mass)+1]) {
    if (verbose){
      message("  Peaklist mass range: ",
              round(pl[, min(m)], 5), " - ", round(pl[, max(m)], 5),
              " Da (", pl[, .N], " peak(s))")
      message("   Library mass range: ",
        round(formula_library[, min(mass)], 5), " - ", round(formula_library[, max(mass)], 5),
        " Da (", formula_library[, .N], " formula(s))\n"
        )}
    a <- pl[, .N]

  # This step truncates the peaklist. The truncation takes the min and max mass
    # in the library +/- 1 Da mass buffer
    pl <- pl[m_min >= min(formula_library$mass)-1 &
               m_max <= max(formula_library$mass)+1, ]
    pl[, max(m)]
    warning(
      "Mass range in peaklist is not completely covered by formula library.\n  ",
      a - pl[, .N],
      " peak(s) truncated from the original peak list.\n"
      )

    if (nrow(pl) == 0) {
      stop(
        "None of the masses in the peaklist (pl) are covered by the selected formula library.
           Stopping formula assignment."
      )
    }
  }

# This step truncates the formula library for performance reasons.
# The truncation takes the min and max mass in the peaklist (considering ma_de)
# and selects the mass range in the library accordingly.
  formula_library <- formula_library[mass %between% c(min(pl$m_min)-1, max(pl$m_max)+1)]

# Prefilter Peaklist ####
# If the formula library is very small (e.g. n<=100) the peaklist can be prefiltered.
# This is for the purpose of a targeted search based on a custom library or a
# single molecular formula.

  if(nrow(formula_library)<=10){
    .msg("Prefiltering peaklist.")

    prefiltered_pl <- data.table()  # Initialize empty table to store prefiltered results

  # Loop through the formulas in 'formula_library' and filter the peak list
  # based on each formula's mass

  for (i in 1:nrow(formula_library)) {
  # Filter `pl` to get only peaks that fall within a range around the formula mass
    filtered_pl_chunk <- pl[m_min >= min(formula_library[i,mass])-1 & m_max <= max(formula_library[i, mass])+1]

  # Add the filtered peaks to the prefiltered peak list
    prefiltered_pl <- rbind(prefiltered_pl, filtered_pl_chunk)
  }
  # Remove duplicates if any (e.g., if a peak appears for both formulas)
  prefiltered_pl <- unique(prefiltered_pl)
  pl <- prefiltered_pl
  }

# sort peaklist by mass
  setkeyv(pl, c("m_min", "m_max"))

# sort library by mass
  formula_library[, c("m_min", "m_max"):=mass]
  setkeyv(formula_library, c("m_min", "m_max"))
  cols <- c("m_min","m_max")

  # if(memory_efficient == TRUE) {
  #   mfd <- .foverlaps_chunked_lapply(
  #     lib = formula_library,
  #     pl  = pl,
  #     cols = cols,
  #     chunk_n = chunk_n,
  #     progress = isTRUE(verbose),
  #     show_ram = isTRUE(verbose),
  #     prefix = "Overlaps"
  #     # , deduplicate = TRUE  # enable ONLY if you confirm duplicates are spurious
  #   )
  # } else {  }
    mfd <- data.table::foverlaps(formula_library, pl, by.x = cols, by.y = cols, nomatch = 0L)


# remove helper columns 'm_min' and 'm_max'
  mfd[, c("m_min", "m_max", "i.m_min", "i.m_max"):=NULL]

# Create a unique ID for each molecular formula in a file
  mfd[, mf_id := .GRP, by = .(file_id, mf)]

# Rename column of calculated mass
  data.table::setnames(mfd, old = "mass", new = "m_cal", skip_absent = T)

# Add columns for heavy isotopes if not yet existing
  x <- c("13C", "15N", "34S")
  for (i in x) {
    if(!(i %in% names(mfd))){
      .msg("New column %s added having value 0\n", i)
      mfd<-mfd[, (i):=0]
    }
  }

# Calculate absolute and relative mass error
  mfd[, del:=round(m - m_cal, 5)]
  mfd[, ppm:=ume::calc_ma(m = m, m_cal = m_cal)]

  if(verbose) message(mfd[, .N], " formulas assigned to ", pl[, .N], " peak(s).\n")

# Check if identical formulas are assigned for different peaks of one spectrum
# to do: pretty slow as well
  x <- mfd[, .N, list(file_id, mf, `13C`, `34S`, `15N`)][order(N)][N>1]
  setkeyv(x, c("N", "mf"))

  if(verbose == TRUE){
  if(nrow(x) > 1){
    print_and_capture <- function(x) {paste(utils::capture.output(head(x[order(-N)])), collapse = "\n")}
    warning(nrow(x)
            , " formulas occur more than ones in a spectrum!\n  Maybe mass accuracy threshold set too wide? \n  Examples:\n"
            , print_and_capture(x)
            )
  }}

  setkeyv(mfd, c("file_id", "mz"))

  return(mfd)
}

Try the ume package in your browser

Any scripts or data that you put into this service are public.

ume documentation built on Dec. 13, 2025, 1:06 a.m.