R/utility-functions.R

Defines functions .extract_field_from_string expandMzIntensity .expand_spectrum_df .collapse_spectrum_df

Documented in expandMzIntensity

#' @description
#'
#' Function to collapse `mz` and `intensity` values from a `data.frame` per
#' spectrum (identified by column `"spectrum_id"`. The result will be a
#' `data.frame` with one row per spectrum and columns `"mz"` and `"intensity"`
#' being of type `list`.
#'
#' @param x `data.frame` with spectrum data for **one** spectrum.
#'
#' @return single row `data.frame` with columns `"mz"` and `"intensity"` being
#'     of type `list`.
#'
#' @noRd
#'
#' @md
#'
#' @author Johannes Rainer
.collapse_spectrum_df <- function(x) {
    ## Simple solution is to split by required column $spectrum_id that has to
    ## uniquely identify values belonging to one spectrum. More complicated
    ## solution would be to build a unique identified from all columns except
    ## mz and intensity.
    x_new <- base::unique(x[, !colnames(x) %in% c("mz", "intensity")])
    idf <- factor(x$spectrum_id, levels = base::unique(x$spectrum_id))
    x_new$mz <- unname(base::split(x$mz, idf))
    x_new$intensity <- unname(base::split(x$intensity, idf))
    rownames(x_new) <- NULL
    x_new[, colnames(x)]
}

#' @description
#'
#' Function to expand a collapsed spectrum `data.frame` (such as collapsed with
#' the `.collapse_spectrum_df` function) or returned by the [msms_spectra_hmdb]
#' function.
#'
#' @param x `data.frame` with columns `"mz"` and `"intensity"` being of type
#'     `list`.
#'
#' @noRd
#'
#' @md
#'
#' @author Johannes Rainer
.expand_spectrum_df <- function(x) {
    x_exp <- x[rep(seq_len(nrow(x)), lengths(x$mz)),
               !colnames(x) %in% c("mz", "intensity"), drop = FALSE]
    rownames(x_exp) <- NULL
    x_exp$mz <- unlist(x$mz)
    x_exp$intensity <- unlist(x$intensity)
    x_exp[, colnames(x)]
}

#' @title Expand m/z and intensity values in a data.frame
#'
#' @description
#'
#' `expandMzIntensity` *expands* a `data.frame` with m/z and/or intensity values
#' stored as a `list` in columns `"mz"` and `"intensity"`. The resulting
#' `data.frame` has the m/z and intensity values stored as `numeric` in columns
#' `"mz"` and `"intensity"`, one value per row, with the content of other
#' columns repeated as many times as there are m/z and intensity values.
#'
#' @param x `data.frame` with *collapsed* m/z and intensity values in columns
#'     `"mz"` and `"intensity"`, such as returned by [msms_spectra_hmdb()] with
#'     parameter `collapsed = TRUE`, or by [spectra()] or [compounds()] calls.
#'
#' @return `data.frame` with `"mz"` and `"intensity"` columns *expanded*. See
#'     description for details.
#'
#' @export
#'
#' @author Johannes Rainer
#'
#' @examples
#'
#' ## Read a data.frame with collapsed columns mz and intensity columns
#' dr <- system.file("xml/", package = "CompoundDb")
#' msms_spctra <- msms_spectra_hmdb(dr)
#'
#' msms_spctra
#'
#' ## Columns mz and intensity are "collased"
#' msms_spctra$mz
#'
#' ## Expand the data.frame to get one row per m/z and intensity value
#' spctra_exp <- expandMzIntensity(msms_spctra)
#' spctra_exp
expandMzIntensity <- function(x) {
    if (!is.data.frame(x))
        stop("'x' is expected to be a data.frame")
    .expand_spectrum_df(x)
}

#' Use a pattern search to extract the value for a field from a string
#'
#' @param x `character` string
#'
#' @param field `character` defining how the field can be identified
#'
#' @param delimiter `character` defining how the field's end is defined.
#'
#' @author Johannes Rainer
#'
#' @return `character` with the field or `NA` if the field is not present.
#'
#' @noRd
#'
#' @examples
#'
#' strng <- "some=bl df;other=some nice thing;last=the last entry"
#' .extract_field_from_string(strng, "some=", delimiter = ";")
#' .extract_field_from_string(strng, "other=", delimiter = ";")
#' .extract_field_from_string(strng, "last=", delimiter = ";")
.extract_field_from_string <- function(x, field, delimiter = " __ ") {
    res <- sub(paste0(".*?", field, "(.*?)(", delimiter, ".*|$)"), "\\1", x)
    res[res == x] <- NA_character_
    res
}

## #' Convert inchi keys to an artificial compound ID
## #'
## #' @noRd
## .inchikey2id <- function(x, prefix = "CMP") {
##     x <- factor(x, levels = unique(x))
##     sprintf(paste0(prefix, "%0", ceiling(log10(length(levels(x)) + 1)), "d"),
##             as.integer(x))
## }

## #' Make a unique mona compounds table
## #'
## #' @noRd
## .reduce_mona_compounds <- function() {
##     ## HECK! don't have an inchikey for each!
##     cmp_id <- .inchikey2id(x$inchi)
##     tbl <- split.data.frame(x, f = cmp_id)
##     tbl_red <- lapply(tbl, function(z) {
##         if (length(unique(z$formula)) > 1)
##             stop("Formula is not unique")
##         data_frame(compound_id = NA_character_,
##                    compound_name = z$compound_name[1],
##                    inchi = z$inchi[1],
##                    formula = z$formula[1],
##                    mass = .aggregate_nums(z$mass),
##                    synonyms = unique(c(z$compound_name,
##                                        unlist(z$synonyms,
## use.names = FALSE))))
##     })
## }

## .aggregate_nums <- function(x, tolerance = 0.000001) {
##     if (length(x) == 1)
##         return(x)
##     if (all(diff(x) < tolerance))
##         x[1]
##     else stop("Difference of values is larger than allowed")
## }
EuracBiomedicalResearch/CompoundDb documentation built on March 5, 2024, 5:26 a.m.