R/query_db.R

Defines functions do_hit_mzs hit_mzs

Documented in hit_mzs

#' @title Search correspondance between m/z measured & the database
#'
#' @description
#' Search in the database m/z correspondance according a tolerance
#' 
#' @param mzs vector of floats, m/z
#' @param mda float, mDa tolerance to apply, optional
#' @param ppm float, ppm tolerance to apply, optional
#' @param databases vector of characters, which database(s) to use, available are ChEBI
#' @param adducts vector of characters, hypothetic adducts, availables are: 
#'      "M+H", "M+NH4", "M+Na", "M+K", "M+", "M-H", "M-2H", "M-3H", 
#'      "M+FA-H", "M+Hac-H", "M-", "M+3H", "M+2H+Na", "M+H+2Na", "M+3Na", "M+2H", 
#'      "M+H+NH4", "M+H+Na", "M+H+K", "M+ACN+2H", "M+2Na", "M+2ACN+2H", 
#'      "M+3ACN+2H", "M+CH3OH+H", "M+ACN+H", "M+2Na-H", "M+IsoProp+H", "M+ACN+Na", 
#'      "M+2K-H", "M+DMSO+H", "M+2ACN+H", "M+IsoProp+Na+H", "2M+H", "2M+NH4", "2M+Na", 
#'      "2M+3H2O+2H", "2M+K", "2M+ACN+H", "2M+ACN+Na", "M-H2O-H", "M+Na-2H", "M+Cl", 
#'      "M+K-2H", "M+Br", "M+TFA-H", "2M-H", "2M+FA-H", "2M+Hac-H", "3M-H"
#'
#' @return a list (each entry correspond to a m/z hit) of dataframes with columns:
#'  \itemize{
#'      \item database
#'      \item id
#'      \item link
#'      \item formula
#'      \item adduct
#'      \item name
#'      \item synonyms
#'      \item smiles
#'      \item inchikey
#'      \item m/z deviation mda
#'      \item m/z deviation ppm
#' }
#' 
#' @export
#' @examples
#' hit_mzs(c(443.1247985, 190.053578), ppm = 2, 
#'      adducts = c("M+H", "M+NH4", "M+Na", "M+K", "M+", "M+2H"))
hit_mzs <- function(mzs = c(), mda = 0, ppm = 0, databases = c("ChEBI"), 
        adducts = c("M+H", "M+NH4", "M+Na", "M+K", "M+", "M-H", "M-2H", "M-3H", 
            "M+FA-H", "M+Hac-H", "M-", "M+3H", "M+2H+Na", "M+H+2Na", "M+3Na", "M+2H", 
            "M+H+NH4", "M+H+Na", "M+H+K", "M+ACN+2H", "M+2Na", "M+2ACN+2H", 
            "M+3ACN+2H", "M+CH3OH+H", "M+ACN+H", "M+2Na-H", "M+IsoProp+H", "M+ACN+Na", 
            "M+2K-H", "M+DMSO+H", "M+2ACN+H", "M+IsoProp+Na+H", "2M+H", "2M+NH4", "2M+Na", 
            "2M+3H2O+2H", "2M+K", "2M+ACN+H", "2M+ACN+Na", "M-H2O-H", "M+Na-2H", "M+Cl", 
            "M+K-2H", "M+Br", "M+TFA-H", "2M-H", "2M+FA-H", "2M+Hac-H", "3M-H")) {
    try_it({
    utils::data(adducts_df, package = "metabSeek")
    databases <- as.character(databases)
    adducts <- as.character(adducts)
    db <- DBI::dbConnect(RSQLite::SQLite(), 
        system.file("extdata", "database.sqlite", package = "metabSeek"))
    current_databases <- DBI::dbListTables(db)
    DBI::dbDisconnect(db)
    
    check_args(c(length(mzs) > 0, sapply(mzs, is.numeric), 
            length(mda) == 1, is.numeric(mda[1]), 
            length(ppm) == 1, is.numeric(ppm[1]), 
            length(adducts) > 0, length(databases) > 0), 
        c("you must provide at least one m/z", paste("m/z", mzs, "is not numerical"), 
            "mda muste be ONE number", "mda is not numeric", 
            "ppm muste be ONE number", "ppm is not numeric", 
            "you must choose at least one database", 
            "you must provide at least one adduct"))
    check_args(c(mzs > 0, mda >= 0, ppm >= 0, 
        databases %in% current_databases, adducts %in% adducts_df[, 1]), 
        c(paste(mzs, "is not a valid m/z"), "mda must be a positive number", 
            "ppm must be a positive number", 
            paste(databases, "not available yet"), 
            paste(adducts, "not in the adduct choices")))
    
    do_hit_mzs(mzs, mda, ppm, databases, adducts)
    }, list(func = "hit_mzs", params = list(mzs = mzs, mda = mda, ppm = ppm, adducts = adducts, databases = databases)))
}
do_hit_mzs <- function(mzs, mda, ppm, databases, adducts) {
    data <- mzs_to_mass(mzs, adducts)
    if (nrow(data) == 0) custom_stop("invalid", "impossible to compute neutral mass with m/z & adducts provided")
    mass_ranges <- get_mass_range(as.numeric(data[, "mass"]), mda = mda, ppm = ppm)
    db <- DBI::dbConnect(RSQLite::SQLite(), 
        system.file("extdata", "database.sqlite", package = "metabSeek"))
    hits <- do.call(rbind, lapply(databases, function(database) 
        cbind_forced(DBI::dbGetQuery(db, sprintf(
            "select id, link, name, synonyms, smiles, inchikey, formula, 
                basepeak_mass from %s where %s;", 
            database, paste(sprintf("(basepeak_mass between %s and %s)", 
                mass_ranges[, 1], mass_ranges[, 2]), 
        collapse = " or "))), data.frame(database = database))))
    DBI::dbDisconnect(db)
    if (nrow(hits) == 0) custom_stop("invalid", "no hit with the database")
    result <- do.call(rbind, lapply(split(hits, hits[, "basepeak_mass"]), function(x) 
        cbind(x, data[which(
            mass_ranges[, 1] <= as.numeric(x[1, "basepeak_mass"]) & 
            mass_ranges[, 2] >= as.numeric(x[1, "basepeak_mass"])), 
                c("mz", "adduct", "mass"), drop = FALSE], stringsAsFactors = FALSE)))
    result[, c("mass", "mz")] <- lapply(result[, c("mass", "mz")], as.numeric)
    result[, "m/z deviation mda"] <- abs(result[, "mass"] - result[, "basepeak_mass"]) * 10**3
    result[, "m/z deviation ppm"] <- result[, "m/z deviation mda"] * 10**3 / result[, "mass"]
    result <- result[order(result[, "m/z deviation ppm"]), 
        c("mz", "database", "id", "link", "formula", "adduct", "name", 
        "synonyms", "smiles", "inchikey", "m/z deviation mda", "m/z deviation ppm")]
    lapply(split(result, result[, "mz"]), function(x) x[, -1])
}
shutinet/metabSeek documentation built on Sept. 5, 2020, 12:57 a.m.