R/convert_data_table_to_molecular_formulas.R

Defines functions convert_data_table_to_molecular_formulas

Documented in convert_data_table_to_molecular_formulas

#' @title Convert Data Table with Element Counts to Molecular Formulas
#'
#' @description
#' Creates a character vector of molecular formulas and adds it as a column to the input `data.table`.
#' The molecular formula string follows the **Hill system order** for element arrangement.
#' If keep_element_sums == TRUE, a data.table is returned that also provides
#' the sum of atoms of each element in the molecular formula.
#'
#' @details
#' This function extracts element or isotope counts from a table with columns for each element of a molecular formula,
#' including those with isotopic notation.
#' It ensures that only valid elements are included based on a reference table (`masses`).
#'
#' The function internally uses the `ume::masses` table that contains element and isotopic symbols.
#'
#' @inheritParams main_docu
#' @param isotope_formulas Logical. If `TRUE` the output table will have an additional
#' molecular formula string that includes isotope information (e.g. "[12C5][13C1][1H12][16O6]")
#' @param keep_element_sums Logical. If `TRUE` the output table will have
#' additional columns containing the total count of atoms of an element (e.g. `S_tot`).
#' @return The original table `mfd` as data.table having additional columns:
#' \describe{
#'   \item{mf}{Standardized molecular formula following the Hill order.}
#'   \item{mf_iso}{If `isotope_formulas = TRUE`:
#'   Standardized molecular formula considering all isotopes of an element.}
#'   \item{C_tot}{If `keep_element_sums = TRUE`:
#'   The total count of all atoms that are carbon isotopes (analogous columns are added for all other elements).}
#' }
#'
#' @section Notes:
#' - The function correctly handles isotopic notations such as `[13C]` and `[18O2]`.
#' - The output follows the **Hill order**, meaning **C, H first**, followed by other elements in alphabetical order.
#' - Single-element counts (e.g., `C1H4` → `CH4`) are formatted without explicit `1`.
#'
#' @import data.table
#' @examples
#' convert_data_table_to_molecular_formulas(mf_data_demo[, .(`12C`, `1H`, `14N`, `16O`, `31P`, `32S`)])
#' @family molecular formula functions
#' @keywords chemistry molecular-formula
#' @export

convert_data_table_to_molecular_formulas <- function(mfd,
                                                     isotope_formulas = FALSE,
                                                     keep_element_sums = FALSE,
                                                     verbose = FALSE,
                                                     ...) {

  # Dummy bindings for the column names that are referenced through
  # data.table's non-standard evaluation below (keeps R CMD check quiet).
  # Inside `[.data.table` the columns take precedence over these locals.
  vkey <- count <- count_element <- hill_order <- label <- symbol <- NULL
  mf <- mf_iso <- NULL

  # Create a key for each row of mfd if not already existing.
  # NOTE: `:=` and setnames() work by reference, so the `vkey` column, the
  # renamed isotope columns and the integer coercion below are also applied
  # to the data.table object that was passed in.
  if (!"vkey" %in% names(mfd)) {
    mfd[, vkey := .I]
  }

  # Identify the columns that hold element or isotope counts.
  # The code below uses the columns `orig_name`, `label`, `symbol` and
  # `hill_order` of the table returned by get_isotope_info().
  iso_cols <- get_isotope_info(mfd, ...)

  # Rename isotope columns in mfd to the labels reported by get_isotope_info()
  setnames(mfd, iso_cols$orig_name, iso_cols$label, skip_absent = TRUE)

  # Make sure all isotope columns are integer type
  mfd[, (iso_cols$label) := lapply(.SD, as.integer), .SDcols = iso_cols$label]

  # Reshape to long format: one row per (formula, isotope) pair
  dt_long <- data.table::melt(
    mfd,
    id.vars = "vkey",
    measure.vars = iso_cols$label,
    variable.name = "label",
    value.name = "count",
    variable.factor = FALSE
  )

  # Keep only isotopes that are present in a formula
  # (rows with a zero or NA count are dropped by this filter)
  dt_long <- dt_long[count > 0]

  # Attach element symbol and Hill order to every isotope row
  dt_long <- iso_cols[, .(hill_order, label, symbol)][dt_long, on = "label"]

  # Sort by formula and Hill order; the grouped operations below keep the
  # groups in order of first appearance, so the sub-strings are pasted
  # together in this order
  setkeyv(dt_long, c("vkey", "hill_order"))

  # Sum up all isotopes of the same element within each formula
  df_mf <- dt_long[, .(count_element = sum(count)), by = .(vkey, symbol)]

  # Wide table with the per-element totals (e.g. `C_tot`, `S_tot`);
  # it has to be built before `symbol` and `count_element` are dropped
  if (keep_element_sums) {
    df_mf_sums <- dcast(df_mf, vkey ~ symbol,
                        value.var = "count_element", fill = 0)
    setnames(df_mf_sums, names(df_mf_sums)[-1],
             paste0(names(df_mf_sums)[-1], "_tot"))
  }

  # Sub-string per element: the count is omitted if it equals 1
  df_mf[count_element == 1, mf := symbol]
  df_mf[count_element > 1, mf := paste0(symbol, count_element)]

  # Remove columns that are no longer needed
  df_mf[, c("symbol", "count_element") := NULL]

  # This concatenates the sub-strings and takes most of the time!!
  if (verbose) message("Creating molecular formula string...")
  df_mf <- df_mf[, lapply(.SD, paste0, collapse = ""), by = .(vkey)]

  # Build formula strings that list every isotope separately,
  # e.g. "[12C5][13C][1H12][16O6]"
  if (isotope_formulas) {
    df_mf_iso <-
      dt_long[, .(count_element = sum(count)), by = .(vkey, label)]
    df_mf_iso[count_element == 1, mf_iso := paste0("[", label, "]")]
    df_mf_iso[count_element > 1,
              mf_iso := paste0("[", label, count_element, "]")]

    # Remove columns that are no longer needed
    df_mf_iso[, c("label", "count_element") := NULL]

    if (verbose) {
      message("Creating molecular formula string with isotope information...")
    }
    df_mf_iso <- df_mf_iso[, lapply(.SD, paste0, collapse = ""), by = .(vkey)]

    # Combine standard and isotope formula strings
    df_mf <- df_mf_iso[df_mf, on = "vkey"]
  }

  # Join the formula strings with the original table; rows of mfd without
  # any atoms are kept and get NA in the formula column(s)
  mfd <- df_mf[mfd, on = "vkey"]

  # Add the per-element totals. The result of this join must be assigned,
  # otherwise the `*_tot` columns never reach the returned table.
  if (keep_element_sums) {
    mfd <- df_mf_sums[mfd, on = "vkey"]
  }

  if (verbose) message("Molecular formula strings created.")

  mfd
}

Try the ume package in your browser

Any scripts or data that you put into this service are public.

ume documentation built on Dec. 13, 2025, 1:06 a.m.