R/misc.R

Defines functions remove_empty_cols calc_kmd calc_nominal_km calc_km formula_to_mass summary_featuretable_cat summary_featuretable_pull summary_featuretable join_metadata

Documented in calc_km calc_kmd calc_nominal_km formula_to_mass join_metadata remove_empty_cols summary_featuretable

#' Join a featuretable and sample metadata
#'
#' @description
#' Joins a featuretable and associated sample metadata. Basically a wrapper around \code{\link[dplyr]{left_join}} where `by = "Sample"`.
#'
#' @param data A feature table created with \code{\link[metamorphr]{read_featuretable}}
#' @param metadata Sample metadata created with \code{\link[metamorphr]{create_metadata_skeleton}}
#'
#' @return A tibble with added sample metadata.
#' @export
#'
#' @examples
#' toy_metaboscape %>%
#'   join_metadata(toy_metaboscape_metadata)
join_metadata <- function(data, metadata) {
  dplyr::left_join(data, metadata, by = "Sample")
}



#' General information about a feature table and sample-wise summary
#'
#' @description
#' Information about a feature table. Prints information to the console (number of samples, number of features and if applicable number of groups,
#' replicates and batches) and returns a sample-wise summary as a list.
#'
#' @param data A tidy tibble created by \code{\link[metamorphr]{read_featuretable}}.
#' @param n_samples_max How many Samples should be printed to the console?
#' @param n_features_max How many Features should be printed to the console?
#' @param n_groups_max How many groups should be printed to the console?
#' @param n_batches_max How many Batches should be printed to the console?
#'
#' @return A sample-wise summary as a list.
#' @export
#'
#' @examples
#' toy_metaboscape %>%
#'   join_metadata(toy_metaboscape_metadata) %>%
#'   summary_featuretable()
summary_featuretable <- function(data, n_samples_max = 5, n_features_max = 5, n_groups_max = 5, n_batches_max = 5) {
  column_names <- colnames(data)

  samples <- summary_featuretable_pull(data = data, select_what = "Sample")
  features <- summary_featuretable_pull(data = data, select_what = 2)

  n_samples <- length(samples)
  n_features <- length(features)

  summary_featuretable_cat(txt = samples, title = "Samples", n = n_samples, n_max = n_samples_max)
  summary_featuretable_cat(txt = features, title = "Features", n = n_features, n_max = n_samples_max)

  # is metadata present?
  if ("Group" %in% column_names) {
    groups <- summary_featuretable_pull(data = data, select_what = "Group")

    n_groups <- groups %>%
      length()

    summary_featuretable_cat(txt = groups, title = "Groups", n = n_groups, n_max = n_groups_max)
  }

  if ("Replicate" %in% column_names) {
    replicates <- summary_featuretable_pull(data = data, select_what = "Replicate")

    n_replicates <- length(replicates)

    if (n_replicates > 1) {
      rlang::inform(message = paste0(crayon::blue("Replicates detected: ", min(replicates), "...", max(replicates), "\n", sep = "")))
    }
  }

  if ("Batch" %in% column_names) {
    batches <- summary_featuretable_pull(data = data, select_what = "Batch")

    n_batches <- length(batches)

    summary_featuretable_cat(txt = batches, title = "Batches", n = n_batches, n_max = n_batches_max)
  }

  n_nas <- data %>%
    dplyr::filter(is.na(.data$Intensity)) %>%
    nrow()

  n_total <- nrow(data)

  rlang::inform(message = paste0(crayon::green(paste0(as.character(round((n_nas / n_total) * 100)), " % missing values (NA): ", sep = "")), paste0(as.character(n_nas), " out of ", as.character(n_total), "."), "\n", sep = ""))

  data <- data %>%
    dplyr::group_by(.data$Sample) %>%
    tidyr::nest() %>%
    dplyr::mutate(summary = purrr::map(.data$data, function(x) {
      summary(x$Intensity)
    })) %>%
    dplyr::pull(summary)

  names(data) <- samples

  data
}


summary_featuretable_pull <- function(data, select_what) {
  data %>%
    dplyr::select(dplyr::all_of(select_what)) %>%
    dplyr::distinct() %>%
    dplyr::pull()
}


summary_featuretable_cat <- function(txt, title, n, n_max) {
  if (n > n_max) {
    txt <- c(utils::head(txt, n = n_max))
  }

  rlang::inform(message = paste0(crayon::blue(as.character(n), " ", title, ": ", sep = ""), paste(txt, collapse = ", "), "\n", sep = ""))
  if (n > n_max) {
    rlang::inform(message = paste0(crayon::silver("# ", as.character(n - n_max), " more ", tolower(title), "\n", sep = "")))
    rlang::inform(message = paste0(crayon::silver("# ", "Use the n_", tolower(title), "_max", " argument to see more", "\n", sep = "")))
  }
}


#' Calculate the monoisotopic mass from a given formula
#'
#' @description
#' Calculates the monoisotopic mass from a given formula. If only the element symbols are provided, the calculated mass corresponds to that of a molecule made up from the most abundant isotopes.
#' Other isotopes can also be provided (e.g., <sup>13</sup>C, instead of the naturally most abundant <sup>12</sup>C). See the samples for details.
#'
#'
#' @param formula A formula as a string.
#'
#' @returns The monoisotopic mass of the formula.
#' @export
#'
#' @examples
#' # The monoisotopic mass is calculated with the most abundant isotopes
#' # if only the element symbols are provided:
#' formula_to_mass("CH4")
#' formula_to_mass("NH3")
#' formula_to_mass("C10H17N3O6S")
#'
#' # Other isotopes can be provided as follows:
#' formula_to_mass("[13C]H4")
#' formula_to_mass("[15N]H3")
#'
#' # Every isotope, including the most abundant ones, can be named explicitly.
#' # Compare:
#' formula_to_mass("[14N][1H]3")
#' formula_to_mass("NH3")
#'
#' # The function also supports brackets and nested brackets:
#' formula_to_mass("(CH3)2")
#' formula_to_mass("(((CH3)2N)3C)2")
#' formula_to_mass("((([13C]H3)2N)3C)2")
formula_to_mass <- function(formula) {
  internal_formula_to_mass(formula,
                           special_isos_lookup = exact_special_isos_lookup,
                           other_atoms_multi_lookup = exact_other_atoms_multi_lookup,
                           other_atoms_single_lookup = exact_other_atoms_single_lookup)
}


#' Calculate the Kendrick mass
#'
#' @description
#' Calculate the Kendrick mass for a given mass (or m/z) and repeating unit.
#' The Kendrick mass is a rescaled mass, that usually sets CH2 = 14 but other
#' repeating units can also be used. It is usefull for the visual identification
#' of potential homologues. See the References section for more information.
#' The Kendrick mass is not to be confused with the Kendrick mass defect
#' (KMD, \code{\link[metamorphr]{calc_kmd}}) and
#' the nominal Kendrick mass (\code{\link[metamorphr]{calc_nominal_km}}).
#'
#'
#' @param mass A molecular mass (or m/z).
#' @param repeating_unit The formula of the repeating unit, given as a string.
#'
#' @returns The Kendrick mass.
#' @export
#'
#' @references \itemize{
#'  \item \href{https://en.wikipedia.org/wiki/Kendrick_mass}{Kendrick mass on Wikipedia}
#'  \item Edward Kendrick, \emph{Anal. Chem.} \strong{1963}, \emph{35}, 2146–2154.
#'  \item C. A. Hughey, C. L. Hendrickson, R. P. Rodgers, A. G. Marshall, K. Qian, \emph{Anal. Chem.} \strong{2001}, \emph{73}, 4676–4681.
#' }
#'
#' @examples
#' # Calculate the Kendrick masses for two measured masses with
#' # CH2 as the repeating unit.
#' # See Hughey et al. in the References section above
#'
#' calc_km(c(351.3269, 365.3425))
#'
#' # Construct a KMD plot from m/z values.
#' # RT is mapped to color and the feature-wise maximum intensity to size.
#' # Note that in the publication by Hughey et al., the nominal Kendrick mass
#' # is used on the x-axis instead of the exact Kendrick mass.
#' # See ?calc_nominal_km.
#'
#' toy_metaboscape %>%
#'   dplyr::group_by(UID, `m/z`, RT) %>%
#'   dplyr::summarise(max_int = max(Intensity, na.rm = TRUE)) %>%
#'   dplyr::ungroup() %>%
#'   dplyr::mutate(KMD = calc_kmd(`m/z`),
#'                 KM = calc_km(`m/z`)) %>%
#'   ggplot2::ggplot(ggplot2::aes(x = KM,
#'                                y = KMD,
#'                                size = max_int,
#'                                color = RT)) +
#'     ggplot2::geom_point()
calc_km <- function(mass, repeating_unit = "CH2") {
  mass * internal_formula_to_mass(repeating_unit,
                                  iso_special_isos_lookup,
                                  iso_other_atoms_multi_lookup,
                                  iso_other_atoms_single_lookup) /
    internal_formula_to_mass(repeating_unit,
                             exact_special_isos_lookup,
                             exact_other_atoms_multi_lookup,
                             exact_other_atoms_single_lookup)
}

#' Calculate the nominal Kendrick mass
#'
#' @description
#' The nominal Kendrick mass is the Kendrick mass
#' (\code{\link[metamorphr]{calc_km}}), rounded up to the nearest
#' whole number. The nominal Kendrick mass and the Kendrick mass are both required
#' to calculate the Kendrick mass defect (KMD).
#' The nominal Kendrick mass is not to be confused with the Kendrick mass defect
#' (\code{\link[metamorphr]{calc_kmd}}) and
#' the Kendrick mass (\code{\link[metamorphr]{calc_km}}).
#'
#' @param mass A molecular mass (or m/z).
#' @param repeating_unit The formula of the repeating unit, given as a string.
#'
#' @returns The nominal Kendrick mass.
#' @export
#'
#' @references \itemize{
#'  \item \href{https://en.wikipedia.org/wiki/Kendrick_mass}{Kendrick mass on Wikipedia}
#'  \item Edward Kendrick, \emph{Anal. Chem.} \strong{1963}, \emph{35}, 2146–2154.
#'  \item C. A. Hughey, C. L. Hendrickson, R. P. Rodgers, A. G. Marshall, K. Qian, \emph{Anal. Chem.} \strong{2001}, \emph{73}, 4676–4681.
#' }
#'
#' @examples
#' # Calculate the nominal Kendrick masses for two measured masses with
#' # CH2 as the repeating unit.
#' # See Hughey et al. in the References section above
#'
#' calc_nominal_km(c(351.3269, 365.3425))
#'
#' # Construct a KMD plot from m/z values.
#' # RT is mapped to color and the feature-wise maximum intensity to size.
#'
#' toy_metaboscape %>%
#'   dplyr::group_by(UID, `m/z`, RT) %>%
#'   dplyr::summarise(max_int = max(Intensity, na.rm = TRUE)) %>%
#'   dplyr::ungroup() %>%
#'   dplyr::mutate(KMD = calc_kmd(`m/z`),
#'                 `nominal KM` = calc_nominal_km(`m/z`)) %>%
#'   ggplot2::ggplot(ggplot2::aes(x = `nominal KM`,
#'                                y = KMD,
#'                                size = max_int,
#'                                color = RT)) +
#'     ggplot2::geom_point()
calc_nominal_km <- function(mass, repeating_unit = "CH2") {
  ceiling(calc_km(mass, repeating_unit = repeating_unit))
}

#' Calculate the Kendrick mass defect (KMD)
#'
#' @description
#' The Kendrick mass defect (KMD) is calculated by subtracting the Kendrick mass
#' (\code{\link[metamorphr]{calc_km}}) from the nominal Kendrick mass
#' (\code{\link[metamorphr]{calc_nominal_km}}). See the References section for
#' more information.
#'
#'
#' @param mass A molecular mass (or m/z).
#' @param repeating_unit The formula of the repeating unit, given as a string.
#'
#' @returns The Kendrick mass defect (KMD)
#' @export
#'
#' @references \itemize{
#'  \item \href{https://en.wikipedia.org/wiki/Kendrick_mass}{Kendrick mass on Wikipedia}
#'  \item Edward Kendrick, \emph{Anal. Chem.} \strong{1963}, \emph{35}, 2146–2154.
#'  \item C. A. Hughey, C. L. Hendrickson, R. P. Rodgers, A. G. Marshall, K. Qian, \emph{Anal. Chem.} \strong{2001}, \emph{73}, 4676–4681.
#' }
#'
#' @examples
#' # Calculate the Kendrick mass defects for two measured masses with
#' # CH2 as the repeating unit.
#' # See Hughey et al. in the References section above
#'
#' calc_kmd(c(351.3269, 365.3425))
#'
#' # Construct a KMD plot from m/z values.
#' # RT is mapped to color and the feature-wise maximum intensity to size.
#'
#' toy_metaboscape %>%
#'   dplyr::group_by(UID, `m/z`, RT) %>%
#'   dplyr::summarise(max_int = max(Intensity, na.rm = TRUE)) %>%
#'   dplyr::ungroup() %>%
#'   dplyr::mutate(KMD = calc_kmd(`m/z`),
#'                 `nominal KM` = calc_nominal_km(`m/z`)) %>%
#'   ggplot2::ggplot(ggplot2::aes(x = `nominal KM`,
#'                                y = KMD,
#'                                size = max_int,
#'                                color = RT)) +
#'     ggplot2::geom_point()
calc_kmd <- function(mass, repeating_unit = "CH2") {
  calc_nominal_km(mass, repeating_unit = repeating_unit) - calc_km(mass, repeating_unit = repeating_unit)
}


#' Remove empty columns from a tibble or data frame
#'
#' @description
#' Remove empty columns (i.e., columns that _only_ contain `NA`) from a tibble or data frame.
#'
#' @param data A tibble or data frame in wide format.
#' @param show_removed_cols If `TRUE` prints a message that shows which columns were removed.
#' @param always_keep Specify columns that should *always* be kept, regardless if they only contain `NA` or not. Columns can be specified as a vector with column indices (e.g., c(1, 2)), column names as characters (e.g., c("a", "b")), as symbols (e.g., c(a, b)), or combinations thereof (e.g., c(1, b))
#'
#' @returns A tibble or data frame in wide format without empty columns.
#' @export
#'
#' @examples
#' # Columns `a` and `d` contains only `NA` and should be removed
#' na_tibble <- tibble::tibble(
#'   a = c(NA, NA, NA),
#'   b = c(1, 2, 3),
#'   c = c(NA, 2, 3),
#'   d = c(NA, NA, 3),
#'   e = c(NA, NA, NA)
#' )
#'
#' remove_empty_cols(na_tibble)
#'
#' # Columns `a` and `d` contains only `NA` but `a` should be kept anyways
#' remove_empty_cols(na_tibble, always_keep = a)
remove_empty_cols <- function(data, always_keep = NULL, show_removed_cols = TRUE) {
  col_order <- colnames(data)
  empty_cols <- purrr::map(data, function(x) all(is.na(x)))
  empty_cols <- unlist(empty_cols)
  empty_cols <- empty_cols[empty_cols]
  empty_col_names <- names(empty_cols)
  data <- data %>%
    dplyr::select(-dplyr::all_of(empty_col_names), {{ always_keep }}) %>%
    dplyr::relocate(dplyr::any_of(col_order))

  if (show_removed_cols == TRUE) {
    removed_cols <- setdiff(col_order, colnames(data))
    if(length(removed_cols) > 0) {
      if (length(removed_cols) > 1) {
        rlang::inform(paste0("The following columns were removed: ", paste(paste0("`", removed_cols, "`"), collapse = ", "), "."))
      } else {
        rlang::inform(paste0("The following column was removed: ", paste0("`", removed_cols, "`"), "."))
      }
    } else {
      rlang::inform(paste0("No columns were removed."))
    }
  }

  data
}

Try the metamorphr package in your browser

Any scripts or data that you put into this service are public.

metamorphr documentation built on June 10, 2026, 5:07 p.m.