R/chem_FUN.R

Defines functions ppm_to_mda mda_to_ppm get_mass_range mzs_to_mass get_basepeak_mass format_formula_vec format_formula check_cas check_inchikey format_inchikey

Documented in check_cas check_inchikey format_formula format_inchikey get_basepeak_mass get_mass_range mzs_to_mass

#' @title Format InChIKeys
#'
#' @description
#' check validity of InChIKey, return NA for those which are not valid
#'
#' @param inchikeys vector of characters
#'
#' @return vector of characters (InChIKeys), filled with NA values where InChIKey were not valid
#'
#' @examples
#' \dontrun{format_inchikey("BQJCRHHNABKAKU-KBQPJGBKSA-N")}
format_inchikey <- function(inchikeys) {
    valid <- check_inchikey(inchikeys)
    inchikeys[which(!valid)] <- NA
    inchikeys
}

#' @title Test validity of InChIKey(s)
#'
#' @description
#' Test if InChIKey is valid or not. Inchikeys must respect some rules:
#' \itemize{
#'      \item only uppercase letters
#'      \item length of 27
#'      \item 3 parts sep by "-"
#'      \item 1st part of 14, 2nd of 10, 3rd of one
#'      \item 24th must be S or N
#'      \item 25th must be A.
#' }
#'
#' @param inchikeys vector of characters
#'
#' @return vector of boolean of same length than the inchikey parameter (TRUE if valid, FALSE if not)
#'
#' @examples
#' \dontrun{check_inchikey("BQJCRHHNABKAKU-KBQPJGBKSA-N")}
check_inchikey <- function(inchikeys) str_detect(inchikeys, 
    "^[:upper:]{14}-[:upper:]{8}[SN]A-[:upper:]$")

#' @title Test validity of CAS
#'
#' @description
#' Test the vaidity of CAS. CAS must respect some rules:
#' \itemize{
#'      \item only digit
#'      \item cas must be 3 parts sep by "-"
#'      \item 1st part of 2 up to 7 digits
#'      \item 2nd of 2 digits
#'      \item 3rd of 1 (check digit)
#'      \item the check digit must be equal to the last digit times 1, 
#'          the preceding digit times 2, the preceding digit times 3 etc, ..., 
#'          adding all these up and computing the sum modulo 10.
#' }
#'
#' @param cas vector of characters
#'
#' @return vector of boolean of same length than the cas parameter (TRUE if valid, FALSE if not)
#'
#' @examples
#' \dontrun{check_cas("64-17-5")}
check_cas <- function(cas) str_detect(cas, 
    "^[:digit:]{2,7}-[:digit:]{2}-[:digit:]$") & 
        sapply(stringr::str_split(gsub("-", "", cas), ""), function(x) {
            x <- as.numeric(rev(x))
            x[1] == sum(x[-1] * 1:(length(x) - 1)) %% 10
        })

#' @title Format chemical formulas
#'
#' @description
#' Format chemical formulas
#' \itemize{
#'      \item remove white spaces at begin & end
#'      \item check validity of formula:
#'     \enumerate{
#'          \item isotope designation: number between []
#'          \item element: an upper letter followed sometimes by a lower letter
#'          \item number(s)
#'      }
#' }
#'
#' @param formulas vector of characters of chemical formulas
#'
#' @return vector of characters (chemical formulas), filled with NA value where the formula were not valid
#'
#' @examples
#' \dontrun{format_formula(c("C12H18Br6", "C12[2]H18Br6"))}
format_formula <- function(formulas) {
    formulas[which(!str_detect(formulas, 
        "^((\\[[:digit:]+\\])?[:upper:][:lower:]?[:digit:]*)+$"))] <- NA
    formulas <- stringr::str_extract_all(formulas, 
        "(\\[[:digit:]+\\])?[:upper:][:lower:]?[:digit:]*")
    utils::data(isotopes_df, package = "metabSeek")
    sapply(formulas, format_formula_vec, isotopes_df)
}        
format_formula_vec <- function(formula_vec, isotopes_df) {
    if (any(is.na(formula_vec))) return(NA)
    form <- matrix(c(
        stringr::str_extract(formula_vec, "^(\\[[:digit:]+\\])?[:upper:][:lower:]?"), 
        stringr::str_extract(formula_vec, "[:digit:]*$")), ncol = 2)
    form <- form[which(form[, 1] %in% isotopes_df[, 1]), , drop = FALSE]
    if (nrow(form) == 0) return(NA)
    form[which(form[, 2] == ""), 2] <- 1
    paste(t(form), sep = "", collapse = "")
}

#' @title Get basepeak m/z
#' 
#' @description
#' Get basepeak m/z
#'
#' @param formulas vector of characters representing chemical formulas
#' @param charges vector of integers representing charges for each chemical formulas (same length as the parameter formulas)
#'
#' @return vector of floats (basepeak m/z for each couple of formula - charge)
#'
#' @examples
#' \dontrun{get_basepeak_mass("C12H18Br6", 0)}
get_basepeak_mass <- function(formulas, charges) {
    utils::data(isotopes_df, package = "metabSeek")
    masses <- rep(NA, length(formulas))
    # envipat cannot deal at the same time molecule with a charge of 0 and naturally charged compounds ??!!!
    for (charge in unique(charges)) {
        if (is.na(charge)) next
        ids <- which(charges == charge & !is.na(formulas))
        isotopic_patterns <- enviPat::isopattern(isotopes_df, formulas[ids], 
            threshold = 90, charge = charge, verbose = FALSE)
        masses[ids] <- sapply(isotopic_patterns, function(x) x[1, 1])
    }
    masses
}

#' @title convert charged m/z to neutral mass
#'
#' @description 
#'  convert charged m/z to neutral mass with a list of predefined adducts
#'      reject those with a mass under or equal to 0
#'
#' @param mzs vector of floats, represent m/z
#' @param adducts vector of strings, represent name of the adduct
#'
#' @return matrix with columns
#'  \enumerate{
#'      \item adduct
#'      \item ion_mz
#'      \item mass
#' }
#'
#' @examples
#' \dontrun{mzs_to_mass(443.1247895, c("M+H", "M+2H"))}
mzs_to_mass <- function(mzs, adducts) {
    utils::data(adducts_df, package = "metabSeek")
    adducts <- adducts_df[which(adducts_df[, 1] %in% adducts), ]
    result <- do.call(rbind, lapply(mzs, function(mz) 
        matrix(c(rep(mz, nrow(adducts)), adducts[, 1], 
            mz * adducts$Charge / adducts$Mult + adducts$Mass), 
            ncol = 3, dimnames = list(c(), c("mz", "adduct", "mass")))))
    result[which(as.numeric(result[, "mass"]) > 0), , drop = FALSE]
}

#' @title Get mass error range
#' 
#' @description.
#' Compute mass error range with a tolerance in ppm and / or in mda
#'
#' @param mass vector of floats representing masses
#' @param ppm float, optional, 0 by default
#' @param mda float, optional, 0 by default
#'
#' @return matrix with two column: one row for each mass provided in the masses param, columns represent borns min & max
#'
#' @examples
#' \dontrun{get_mass_range(c(443.1247985, 190.053578), ppm = 2)}
#' \dontrun{get_mass_range(c(443.1247985, 190.053578), mda = 1)}
get_mass_range <- function(mass, ppm = 0, mda = 0) {
    mass_range <- if (ppm > 0) matrix(c(mass - (mass * ppm * 10**-6), 
            mass + (mass * ppm * 10**-6)), ncol = 2)
        else matrix(rep(mass, 2), ncol = 2)
    mass_range[, 1] <- mass_range[, 1] - (mda * 10**-3)
    mass_range[, 2] <- mass_range[, 2] + (mda * 10**-3)
    mass_range
}

# just a remember
mda_to_ppm <- function(mass, mda) mda * 10**3 * mass
ppm_to_mda <- function(mass, ppm) ppm * 10**-3 / mass
    
shutinet/metabSeek documentation built on Sept. 5, 2020, 12:57 a.m.