R/calc_indeces.R

Defines functions calc_ideg

Documented in calc_ideg

# Calculate Ideg ####
#' @title Calculate Degradation Index (Ideg)
#' @family calculations
#' @description
#' This function calculates the degradation index ('Ideg') following Flerus et al. (2012). High Ideg values indicate 'older' marine DOM
#' (i.e., a higher contribution of peaks that correlate negatively with delta14C), while low values indicate 'younger' DOM
#' (i.e., a higher contribution of peaks that correlate positively with delta14C)./
#'
#' Ideg is computed as the ratio of summed magnitudes for five negative (NEG) molecular formulas to the total summed magnitudes of five positive (POS)
#' and five negative (NEG) molecular formulas:
#'
#'
#' \deqn{Ideg = \frac{\sum{NEG}}{\sum{NEG} + \sum{POS}}}
#'
#'
#' The index ranges from 0 to 1 and is valid only if all required formulas (n = 10) are present. Ideg depends strongly on
#' the type of sample preparation, ionization method, and instrument settings, and should only be interpreted for relative
#' changes within the same dataset.
#'
#' @inheritParams main_docu
#' @param mf_col Character. The name of the column containing molecular formulas. Default is "mf".
#' @param magnitude_col Character. The name of the column containing magnitude values (absolute or relative). Default is "i_magnitude".
#' @import data.table
#' @return A `data.table` with columns:
#' - `grp`: Grouping variable.
#' - `ideg`: Calculated degradation index (rounded to 3 decimals).
#' - `ideg_n`: Number of assigned formulas used in the calculation.
#' @examples
#' # Create a minimal dataset containing all required POS and NEG formulas
#' library(data.table)
#'
#' demo_ideg <- data.table(
#'   file_id = 1,
#'   mf = c(
#'     "C17H20O9", "C19H22O10", "C20H22O10", "C20H24O11", "C21H26O11",   # NEG
#'     "C13H18O7", "C14H20O7", "C15H22O7", "C15H22O8", "C16H24O8"        # POS
#'   ),
#'   i_magnitude = c(
#'     1200, 900, 1500, 700, 800,     # NEG intensities
#'     2000, 1800, 2200, 1600, 1900   # POS intensities
#'   )
#' )
#'
#' calc_ideg(
#'   mfd = demo_ideg,
#'   mf_col = "mf",
#'   magnitude_col = "i_magnitude",
#'   grp = "file_id"
#' )
#' @export

calc_ideg <- function(mfd, mf_col = "mf", magnitude_col = "i_magnitude", grp = "file_id", ...) {
  # Validate input arguments
  if (!inherits(mfd, "data.table")) stop("'mfd' must be a data.table.")
  if (!mf_col %in% names(mfd)) stop(paste("Column", mf_col, "not found in 'mfd'."))
  if (!magnitude_col %in% names(mfd)) stop(paste("Column", magnitude_col, "not found in 'mfd'."))
  if (!grp %in% names(mfd)) {
    warning(paste0("Grouping column '", grp, "' not found. Calculating Ideg without grouping."))
    grp <- NULL
  }

# Define NEG and POS molecular formulas
  NEG <- c("C17H20O9", "C19H22O10", "C20H22O10", "C20H24O11", "C21H26O11")
  POS <- c("C13H18O7", "C14H20O7", "C15H22O7", "C15H22O8", "C16H24O8")

  cols <- c(grp, mf_col, magnitude_col)
  t <- mfd[mf %in% c(POS, NEG), ..cols]
  t <- t[mf %in% POS, ideg:="pos"]
  t <- t[mf %in% NEG, ideg:="neg"]

  t <- data.table::dcast(t, get(grp)~ideg, fun.aggregate = list(length, sum), value.var = "i_magnitude")
  if("grp" %in% names(t)){data.table::setnames(t, "grp", grp)}

# Make sure that all necessary columns for index calculation exist
  cols2 <- c("i_magnitude_sum_neg", "i_magnitude_sum_pos", "i_magnitude_length_neg", "i_magnitude_length_pos")
  cols2 <- cols2[!cols2 %in% names(t)]

  if(length(cols2)>0) t[, c(cols2):=NA]

# Calculate Ideg index
  t <- t[, .(ideg=round(i_magnitude_sum_neg/(i_magnitude_sum_neg+i_magnitude_sum_pos), 3)
        , ideg_n=sum(i_magnitude_length_neg, i_magnitude_length_pos)), grp]

  return(t[])
}

# Calculate iterr ####
#' @title Calculate terrestrial indeces Iterr and Iterr2 (after Medeiros et al. 2016)
#'
#' @description Calculate a degradation index 'Iterr' and modified index 'iterr2' after Medeiros et al. (2016).
#' High Iterr values represent higher contribution of terrestrial material (i.e. higher contribution
#' of peaks that correlate positively with delta13C) while low values represent less terrestrial
#' material (i.e. higher contribution of peaks that correlate negatively with delta13C).
#' Iterr / Iterr2 are calculated from a peak magnitude ratio of 50 or 5 POS and NEG formulas, respectively:
#' sum(POS) / (sum(POS) + sum(NEG))
#' Therefore Iterr / Iterr2 range between 1 and 0. It should be noted that absolute values strongly depend
#' on factors such as type of solid phase extraction, ionization method, instrument settings etc.
#' Therefore values can only be interpreted as relative changes.
#' It should also be noted that for an appropriate evaluation ALL index formulas must be present.
#' @name calc_iterr
#' @family index calculations
#' @inheritParams main_docu
#' @param mf_col Name of the column containing molecular formulas (string)
#' @param magnitude_col Name of the column containing absolute or
#' relative mass peak magnitudes (string).
#' @import data.table
#' @keywords misc
#' @return Iterr and iterr2 values
#' @examples
#' library(data.table)
#'
#' # Create a minimal dataset containing all required
#' # POS, NEG, POS2, and NEG2 formulas for demonstration
#'
#' demo_iterr <- data.table(
#'   file_id = 1,
#'   mf = c(
#'     # NEG (Iterr)
#'     'C13H12O5','C15H14O4','C14H12O5','C14H14O5','C13H12O6',
#'     'C16H16O4','C15H14O5','C14H12O6','C15H16O5','C14H14O6',
#'     'C16H14O5','C16H16O5','C15H14O6','C15H16O6','C14H14O7',
#'     'C17H16O5','C16H14O6','C17H18O5','C16H16O6','C15H14O7',
#'     'C17H16O6','C16H14O7','C18H18O6','C17H16O7','C17H18O7',
#'     'C18H16O7','C18H18O7','C17H16O8','C19H18O7','C20H20O7',
#'     'C19H18O8','C20H18O9','C19H16O10','C21H20O9','C20H18O10',
#'     'C22H22O9','C21H20O10','C23H22O10','C24H24O10','C25H26O10',
#'
#'     # POS (Iterr)
#'     'C15H19NO6','C15H21NO6','C17H21NO7','C17H23NO7','C17H22O8',
#'     'C16H21NO8','C17H20N2O7','C17H19NO8','C18H23NO7','C17H21NO8',
#'     'C18H24O8','C16H19NO9','C17H23NO8','C17H22O9','C17H24O9',
#'     'C18H21NO8','C17H19NO9','C18H23NO8','C18H22O9','C17H21NO9',
#'     'C18H24O9','C18H20N2O8','C18H21NO9','C19H24O9','C18H23NO9',
#'     'C18H22O10','C18H24O10','C20H24O9','C19H22O10','C20H26O9',
#'     'C19H24O10','C19H26O10','C20H24O10','C20H26O10','C19H24O11',
#'     'C20H24O11','C20H26O11','C20H26O12','C22H28O11','C21H28O12',
#'
#'     # NEG2 (Iterr2)
#'     'C17H18O7','C18H18O7','C17H16O7','C17H16O8','C15H16O6',
#'
#'     # POS2 (Iterr2)
#'     'C20H24O9','C20H24O10','C19H22O10','C17H21NO8','C20H26O9'
#'   ),
#'
#'   # Assign magnitude values (arbitrary but valid)
#'   i_magnitude = c(
#'     rep(1000, 40),  # NEG
#'     rep(2000, 40),  # POS
#'     rep(1500, 5),   # NEG2
#'     rep(1800, 5)    # POS2
#'   )
#' )
#'
#' calc_iterr(
#'   mfd = demo_iterr,
#'   mf_col = "mf",
#'   magnitude_col = "i_magnitude",
#'   grp = "file_id"
#' )
#' @export

calc_iterr <- function(mfd, mf_col = "mf", magnitude_col = "i_magnitude", grp = "file_id", ...){

  if(grp %in% names(mfd) == FALSE){
    warning(paste0("There exists no column '", grp, "' in molecular formula table 'mfd'."
                   , "\nNo grouping will be applied to calculate 'Ideg'."))
    grp <- 1
  }

  NEG <- c('C13H12O5', 'C15H14O4', 'C14H12O5', 'C14H14O5', 'C13H12O6', 'C16H16O4'
           , 'C15H14O5', 'C14H12O6', 'C15H16O5', 'C14H14O6', 'C16H14O5', 'C16H16O5'
           , 'C15H14O6', 'C15H16O6', 'C14H14O7', 'C17H16O5', 'C16H14O6', 'C17H18O5'
           , 'C16H16O6', 'C15H14O7', 'C17H16O6', 'C16H14O7', 'C18H18O6', 'C17H16O7'
           , 'C17H18O7', 'C18H16O7', 'C18H18O7', 'C17H16O8', 'C19H18O7', 'C20H20O7'
           , 'C19H18O8', 'C20H18O9', 'C19H16O10', 'C21H20O9', 'C20H18O10', 'C22H22O9'
           , 'C21H20O10', 'C23H22O10', 'C24H24O10', 'C25H26O10')
  POS <- c('C15H19NO6', 'C15H21NO6', 'C17H21NO7', 'C17H23NO7', 'C17H22O8', 'C16H21NO8'
           , 'C17H20N2O7', 'C17H19NO8', 'C18H23NO7', 'C17H21NO8', 'C18H24O8', 'C16H19NO9'
           , 'C17H23NO8', 'C17H22O9', 'C17H24O9', 'C18H21NO8', 'C17H19NO9', 'C18H23NO8'
           , 'C18H22O9', 'C17H21NO9', 'C18H24O9', 'C18H20N2O8', 'C18H21NO9', 'C19H24O9'
           , 'C18H23NO9', 'C18H22O10', 'C18H24O10', 'C20H24O9', 'C19H22O10', 'C20H26O9'
           , 'C19H24O10', 'C19H26O10', 'C20H24O10', 'C20H26O10', 'C19H24O11', 'C20H24O11'
           , 'C20H26O11', 'C20H26O12', 'C22H28O11', 'C21H28O12')

  NEG2 <- c('C17H18O7', 'C18H18O7', 'C17H16O7', 'C17H16O8', 'C15H16O6')
  POS2 <- c('C20H24O9', 'C20H24O10', 'C19H22O10', 'C17H21NO8', 'C20H26O9')

  cols <- c(grp, mf_col, magnitude_col)
  t <- mfd[mf %in% c(POS, NEG), ..cols]
  t <- t[mf %in% POS2, iterr2:="pos2"]
  t <- t[mf %in% NEG2, iterr2:="neg2"]
  t <- t[mf %in% POS, iterr:="pos"]
  t <- t[mf %in% NEG, iterr:="neg"]

  dt2 <- data.table::dcast(t, get(grp)~iterr2, fun.aggregate = list(length, sum)
                         , value.var = "i_magnitude"
                         , )
  if("grp" %in% names(dt2)){data.table::setnames(dt2, "grp", grp)}

# Make sure that all necessary columns for index calculation exist
  cols2 <- c("i_magnitude_sum_neg2", "i_magnitude_sum_pos2",
             "i_magnitude_length_neg2", "i_magnitude_length_pos2")
  cols2 <- cols2[!cols2 %in% names(dt2)]
  if(length(cols2)>0) dt2[, c(cols2):=NA]

# Calculate Iterr2 index
  dt2 <- dt2[, .(iterr2=round(i_magnitude_sum_neg2/(i_magnitude_sum_neg2+i_magnitude_sum_pos2), 3)
             , iterr2_n=sum(i_magnitude_length_neg2, i_magnitude_length_pos2)), grp]

# Calculate Iterr2
  dt <- data.table::dcast(t, get(grp)~iterr, fun.aggregate = list(length, sum)
                          , value.var = "i_magnitude"
                          , )
  if("grp" %in% names(dt)){data.table::setnames(dt, "grp", grp)}

# Make sure that all necessary columns for index calculation exist
  cols2 <- c("i_magnitude_sum_neg", "i_magnitude_sum_pos", "i_magnitude_length_neg", "i_magnitude_length_pos")
  cols2 <- cols2[!cols2 %in% names(dt)]
  if(length(cols2)>0) dt[, c(cols2):=NA]

# Calculate Iterr index
  dt <- dt[, .(iterr=round(i_magnitude_sum_neg/(i_magnitude_sum_neg+i_magnitude_sum_pos), 3)
               , iterr_n=sum(i_magnitude_length_neg, i_magnitude_length_pos)), grp]

# Combine Iterr and Iterr2
  x <- dt2[dt, on = grp]
  x[]
  return(x)
}


# Calculate the Simpson diversity index ####

#' @title Calculate the Simpson Diversity Index
#'
#' @description
#' The Simpson diversity index is calculated to measure the probability that two randomly selected individuals
#' (e.g., molecular formulas) belong to the same category. It quantifies the dominance or evenness within a dataset.
#'
#' The Simpson index is defined as:
#' \deqn{D = \sum (p_i^2)}
#' where:
#' - \eqn{p_i} is the relative abundance of the \eqn{i}-th molecular formula.
#'
#' The index ranges between 0 and 1:
#' - A value near 0 indicates high diversity (even distribution of abundances).
#' - A value of 1 indicates no diversity (one molecular formula dominates).
#'
#' @param mf Character vector. A list of unique molecular formulas.
#' @param magnitude Numeric vector. A list of respective abundances (intensities) for each molecular formula.
#' Must be non-negative and have the same length as `mf`.
#'
#' @return A single numeric value representing the Simpson diversity index. Returns 0 if `magnitude` is all zeros.
#' @examples
#' calc_simpson_index(
#'   mf = c("C10H20O5", "C12H18O3", "C18H30O6"),
#'   magnitude = c(1982375, 2424, 312410)
#' )
#' @export

calc_simpson_index <- function(mf, magnitude) {
  # Input validation
  if (!is.character(mf) || length(mf) == 0) stop("'mf' must be a non-empty character vector.")
  if (!is.numeric(magnitude) || length(magnitude) != length(mf))
    stop("'magnitude' must be a numeric vector of the same length as 'mf'.")
  if (any(magnitude < 0)) stop("'magnitude' must contain non-negative values.")

  # Total abundance and relative abundances
  total_abundance <- sum(magnitude)
  if (total_abundance == 0) return(0) # Simpson index is zero when total abundance is zero

  relative_abundances <- magnitude / total_abundance

  # Calculate Simpson index
  simpson_index <- sum(relative_abundances^2)

  return(simpson_index)
}


# Calculate Shannon diversity index ####

#' @title Calculate the Shannon Diversity Index
#'
#' @description
#' The Shannon diversity index is calculated to quantify the diversity of molecular formulas based on
#' their relative abundances. This index considers both the richness (number of unique formulas)
#' and the evenness (distribution of abundances). Higher values indicate greater diversity.
#'
#' The Shannon index is defined as:
#' \deqn{H = -\sum (p_i \cdot \ln(p_i))}
#' where:
#' - \eqn{p_i} is the relative abundance of the \eqn{i}-th molecular formula.
#'
#' Zero-abundance formulas are excluded from the calculation.
#'
#' @param mf Character vector. A list of unique molecular formulas.
#' @param magnitude Numeric vector. A list of respective abundances (intensities) for each molecular formula.
#' Must be non-negative and have the same length as `mf`.
#'
#' @return A single numeric value representing the Shannon diversity index. Returns 0 if `magnitude` is all zeros.
#' @examples
#' calc_shannon_index(
#'   mf = c("C10H20O5", "C12H18O3", "C18H30O6"),
#'   magnitude = c(1982375, 2424, 312410)
#' )
#' @export

calc_shannon_index <- function(mf, magnitude) {
  # Input validation
  if (!is.character(mf) || length(mf) == 0) stop("'mf' must be a non-empty character vector.")
  if (!is.numeric(magnitude) || length(magnitude) != length(mf))
    stop("'magnitude' must be a numeric vector of the same length as 'mf'.")
  if (any(magnitude < 0)) stop("'magnitude' must contain non-negative values.")

  # Total abundance and relative abundances
  total_abundance <- sum(magnitude)
  if (total_abundance == 0) return(0) # Shannon index is zero when total abundance is zero

  relative_abundances <- magnitude / total_abundance
  non_zero_relative_abundances <- relative_abundances[relative_abundances > 0]

  # Calculate Shannon index
  shannon_index <- -sum(non_zero_relative_abundances * log(non_zero_relative_abundances))

  return(shannon_index)
}


# Calculate Pielou eveness ####

#' @title Calculate Pielou's Evenness
#'
#' @description
#' This function calculates Pielou's evenness index, a measure of the distribution of abundances across
#' molecular formulas. Evenness ranges from 0 (one molecular formula dominates) to 1 (all formulas are equally abundant).
#'
#' Evenness is derived using the Shannon index:
#' \deqn{E = \frac{H}{\log(S)}}
#' where:
#' - \eqn{H} is the Shannon diversity index.
#' - \eqn{S} is the number of unique molecular formulas.
#'
#' If there is only one molecular formula, evenness is defined as 1.
#'
#' @param mf Character vector. A list of unique molecular formulas.
#' @param magnitude Numeric vector. A list of respective intensities (abundances) for each molecular formula.
#' Must be non-negative and have the same length as `mf`.
#'
#' @return A single numeric value representing Pielou's evenness.
#' @examples
#' calc_pielou_evenness(
#'   mf = c("C10H20O5", "C12H18O3", "C18H30O6"),
#'   magnitude = c(1982375, 2424, 312410)
#' )
#' @export

calc_pielou_evenness <- function(mf, magnitude) {
  # Input validation
  if (!is.character(mf) || length(mf) == 0) stop("'mf' must be a non-empty character vector.")
  if (!is.numeric(magnitude) || length(magnitude) != length(mf))
    stop("'magnitude' must be a numeric vector of the same length as 'mf'.")
  if (any(magnitude < 0)) stop("'magnitude' must contain non-negative values.")

  # Calculate total abundance and relative abundances
  total_abundance <- sum(magnitude)
  if (total_abundance == 0) return(0) # Evenness is undefined if total abundance is zero

  relative_abundances <- magnitude / total_abundance

  # Calculate Shannon index
  shannon_index <- -sum(relative_abundances * log(relative_abundances), na.rm = TRUE)

  # Count of unique molecular formulas
  mf_count <- length(unique(mf))

  # Handle cases with fewer than 2 unique formulas
  if (mf_count <= 1) return(1)

  # Calculate maximum possible Shannon index and Pielou's evenness
  max_possible_shannon <- log(mf_count)
  evenness <- shannon_index / max_possible_shannon

  return(evenness)
}

Try the ume package in your browser

Any scripts or data that you put into this service are public.

ume documentation built on Dec. 13, 2025, 1:06 a.m.