# R/convert_molecular_formula_to_data_table.R
#
# Defines functions: convert_molecular_formula_to_data_table
#
# Documented in: convert_molecular_formula_to_data_table

#' @title Convert Molecular Formulas to a Data Table of Element Counts
#' @name convert_molecular_formula_to_data_table
#' @description
#' Parses a character vector of molecular formulas and returns a `data.table` where each row represents
#' a molecular formula, and each column corresponds to an element, showing the count of atoms of that element.
#' The resulting table follows the **Hill system order** for element arrangement.
#'
#' @details
#' This function extracts element counts from molecular formulas, including those with isotopic notation.
#' It ensures that only valid elements are included based on a reference table (`masses`) and flags invalid entries.
#' Duplicate molecular formulas are identified and processed only once, with a warning issued.
#'
#' The function internally creates an enriched `masses` table to account for isotopic symbols and standard element notation.
#'
#' @inheritParams main_docu
#' @param table_format A string selecting the output layout: `"wide"` (default; one row per formula with one count column per element/isotope, built with `dcast`) or `"long"` (one row per element/isotope occurring in each formula).
#'
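#' @return A `data.table`. With `table_format = "wide"` it contains:
#' \describe{
#'   \item{vkey}{Row index of the formula in the returned table.}
#'   \item{mf}{Standardized molecular formula following the Hill order.}
#'   \item{mf_iso}{Original input molecular formula.}
#'   \item{mass}{Exact molecular mass calculated from element masses.}
#'   \item{nm}{Nominal molecular mass, i.e. the sum of atom count times the `nm` value of each element/isotope in `masses`.}
#'   \item{elements}{Columns for each element present in the formulas, showing the atom count.}
#' }
#' With `table_format = "long"` the un-pivoted table is returned instead, with one row per
#' element/isotope of each formula.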
#'
#' @section Warnings and errors:
#' - If duplicate formulas are detected, only unique ones are processed, and a warning is issued.
#' - If invalid element symbols are found, the function stops with an error message.
#' - If a molecular formula contains the same isotope/element more than once, the function stops with an error.
#'
#' @section Notes:
#' - The function correctly handles isotopic notations such as `[13C]` and `[18O2]`.
#' - The output follows the **Hill order**, meaning **C, H first**, followed by other elements in alphabetical order.
#' - In the `wide` output, an atom count of one is written without the explicit `1` in `mf` (e.g., `C1H4` → `CH4`).
#'
#' @import data.table
#' @examples
#' # Example usage
#' molecular_formulas <- c("C10H23NO4", "C10H24N4O2S", "C6[13C2]H12[18O2]ONaCl")
#' convert_molecular_formula_to_data_table(molecular_formulas)
#'
#' @family molecular formula functions
#' @keywords chemistry molecular-formula
#' @export

convert_molecular_formula_to_data_table <- function(mf, masses = ume::masses, table_format = c("wide", "long")) {
  # Overview:
  #   1. validate `mf` and drop duplicated formulas (with a warning),
  #   2. tokenise every formula into element/isotope tokens with a count,
  #   3. look the tokens up in an enriched copy of `masses`,
  #   4. compute the exact and nominal mass per formula,
  #   5. return the long token table or a wide table (one row per formula).

  table_format <- match.arg(table_format)

  # Column names used through data.table's non-standard evaluation are bound
  # to NULL so that R CMD check does not report them as undefined globals.
  i2 <- mf_iso <- m_iso <- m_iso_nm <- nominal_mass <- NULL

  # ---- Input validation ----------------------------------------------------
  if (!is.character(mf)) {
    stop("'mf' must be a character vector or a string.")
  }

  # A zero-length vector contains no formula either; reject it here instead
  # of failing later with an unrelated data.table error.
  if (length(mf) == 0L || anyNA(mf) || any(mf == "")) {
    stop("'mf' must be provided.")
  }

  # Accept a plain data.frame for `masses` as well; the code below relies on
  # data.table syntax.
  masses <- as.data.table(masses)

  # ---- Duplicated formulas -------------------------------------------------
  duplicates <- sum(duplicated(mf))

  if (duplicates > 0L) {
    warning(duplicates, " duplicates identified in 'mf'. \nOnly one result will be returned for each duplicate.")
    mf <- unique(mf)
  }

  # ---- Tokenise ------------------------------------------------------------
  # One token = optional "[", optional leading digits (isotope number), an
  # upper-case letter, optional lower-case letters, optional count, optional "]".
  regex <- "\\[?[0-9]*[A-Z][a-z]*\\d*\\]?"  # Matches elements and isotopes

  # Vectorised over all formulas; yields one character vector per formula
  matches <- regmatches(mf, gregexpr(regex, mf, perl = TRUE))

  # A formula without a single token would silently vanish from the result
  unparsed <- mf[lengths(matches) == 0L]
  if (length(unparsed) > 0L) {
    stop(
      "No element/isotope symbols could be parsed from: '",
      paste0(unparsed, collapse = "', '"), "'"
    )
  }

  # Long table: one row per (formula, token)
  dt <- data.table(mf_iso = rep(mf, lengths(matches)), match = unlist(matches))

  # Remove square brackets from isotopic elements
  dt[, match := gsub("\\[|\\]", "", match)]

  # Split each token into symbol and count. The trailing digits are the count;
  # everything before them is the element/isotope symbol. This also works when
  # no token in the whole input carries a count, a case in which a split-based
  # approach returns a single column only.
  dt[, isotope := sub("[0-9]+$", "", match)]
  dt[, count := substring(match, nchar(isotope) + 1L)]

  # Lookup key for the join against the enriched masses table
  dt[, i2 := isotope]

  # ---- Enriched masses table -----------------------------------------------
  # m1: lookup by the isotope label of `masses`
  # m2: lookup by bare element symbol, mapped to the row with the smallest
  #     exact mass of that symbol
  # m3: lower-case variants of all keys collected so far
  m1 <- masses[, .(i2 = label, symbol, exact_mass, nm, hill_order)]
  m2 <- masses[, .SD[which.min(exact_mass)], by = symbol][, .(i2 = symbol, symbol, exact_mass, nm, hill_order)]
  masses_new <- unique(rbind(m1, m2))
  m3 <- masses_new[, .(i2 = tolower(i2), symbol, exact_mass, nm, hill_order)]
  masses_new <- unique(rbind(masses_new, m3))

  # ---- Validate tokens -----------------------------------------------------
  # Every token must be a known key; report each unknown symbol only once
  false_elements <- unique(dt[!i2 %in% masses_new$i2, i2])

  if (length(false_elements) > 0L) {
    stop(
      "Some formulas contain invalid element/isotope symbols: '",
      paste0(false_elements, collapse = "', '"), "'"
    )
  }

  # The same element/isotope must not occur twice within one formula
  repeated <- dt[, .N, by = .(mf_iso, isotope)][N > 1]
  if (nrow(repeated) > 0L) {
    .msg(repeated)
    stop("Incorrect formula(s) identified")
  }

  # ---- Counts and masses ---------------------------------------------------
  # An empty count string converts to NA, which stands for an implicit 1
  dt[, count := as.integer(count)]
  dt[is.na(count), count := 1L]

  # Add information: mass, nominal mass, element symbol
  dt <- masses_new[, .(i2, exact_mass, nm, symbol)][dt, on = "i2"]

  dt[, m_iso := count * exact_mass]
  dt[, m_iso_nm := count * nm]
  dt[, mass := sum(m_iso), by = mf_iso]
  dt[, nominal_mass := sum(m_iso_nm), by = mf_iso]

  # Sum formula per input formula: isotopes of one element are added up.
  # tapply() groups by the sorted unique symbols, so that is the order in which
  # the elements appear in `mf`.
  # NOTE(review): sorted-symbol order is not obviously the Hill order promised
  # by the documentation (e.g. "Cl" sorts before "H") - confirm.
  dt[, mf := {
    atoms <- tapply(count, symbol, sum)
    paste0(names(atoms), atoms, collapse = "")
  }, by = mf_iso]

  # NOTE(review): the long output keeps explicit "1" counts in `mf`; they are
  # only stripped for the wide output below - confirm this is intended.
  if (table_format == "long") {
    return(dt[])
  }

  # ---- Wide output ---------------------------------------------------------
  # One row per formula, one count column per lookup key
  dt <- dcast(dt, mf_iso + mf + mass + nominal_mass ~ i2, value.var = "count", fill = 0)

  # Drop a count of exactly 1 after an element symbol ("C1H4" -> "CH4"); the
  # look-ahead keeps counts such as 10 or 12 intact.
  dt[, mf := gsub("([A-Z][a-z]?)(1)(?![0-9])", "\\1", mf, perl = TRUE)]

  dt[, vkey := .I]

  # Project helper; its result supplies `orig_name` and `label`, which are
  # used to rename and order the element/isotope columns.
  iso_names <- get_isotope_info(dt)

  setnames(dt, iso_names$orig_name, iso_names$label, skip_absent = TRUE)
  setcolorder(dt, c("vkey", "mf", "mf_iso", "mass", "nominal_mass", iso_names$label))

  # Return final table
  setnames(dt, "nominal_mass", "nm")
  dt[]
}

# Try the ume package in your browser
#
# Any scripts or data that you put into this service are public.
#
# ume documentation built on Dec. 13, 2025, 1:06 a.m.