R/createCompDbPackage.R

Defines functions import_mona_sdf make_metadata .is_sdf_filename createCompDbPackage .valid_compound .valid_metadata .db_file_from_metadata .valid_msms_spectrum .insert_msms_spectra .add_mz_range_column createCompDb .import_lipidblast .guess_sdf_source .simple_extract_compounds_sdf compound_tbl_lipidblast compound_tbl_sdf

Documented in compound_tbl_lipidblast compound_tbl_sdf createCompDb createCompDbPackage import_mona_sdf make_metadata

#' @title Extract compound data from a file in SDF format
#'
#' @description
#'
#' `compound_tbl_sdf` extracts basic compound annotations from a file in SDF
#' format (structure-data file). The function currently supports SDF files from:
#' + HMDB (Human Metabolome Database): http://www.hmdb.ca
#' + ChEBI (Chemical Entities of Biological Interest): http://ebi.ac.uk/chebi
#' + LMSD (LIPID MAPS Structure Database): http://www.lipidmaps.org
#' + PubChem: https://pubchem.ncbi.nlm.nih.gov/
#' + MoNa: http://mona.fiehnlab.ucdavis.edu/ (see notes below!)
#'
#' @details
#'
#' Column `"compound_name"` reports for HMDB files the `"GENERIC_NAME"`, for
#' ChEBI the `"ChEBI Name"`, for PubChem the `"PUBCHEM_IUPAC_TRADITIONAL_NAME"`,
#' and for Lipid Maps the `"COMMON_NAME"`, if that is
#' not available, the first of the compounds synonyms and, if that is also not
#' provided, the `"SYSTEMATIC_NAME"`.
#'
#' @note
#'
#' `compound_tbl_sdf` supports also to read/process gzipped files.
#'
#' MoNa SDF files organize the data by individual spectra (i.e. each element
#' is one spectrum) and individual compounds can not easily and consistently
#' defined (i.e. not all entries have an InChI ID or other means to uniquely
#' identify compounds). Thus, the function returns a highly redundant compount
#' table. Feedback on how to reduce this redundancy would be highly welcome!
#'
#' @param file `character(1)` with the name of the SDF file.
#'
#' @param collapse optional `character(1)` to be used to collapse multiple
#'     values in the columns `"synonyms"`. See examples for details.
#'
#' @return A [tibble::tibble] with general compound information (one row per
#' compound):
#' + `compound_id`: the ID of the compound.
#' + `compound_name`: the compound's name.
#' + `inchi`: the InChI of the compound.
#' + `inchikey`: the InChI key.
#' + `formula`: the chemical formula.
#' + `mass`: the compound's mass.
#' + `synonyms`: the compound's synonyms (aliases). This type of this column is
#'   by default a `list` to support multiple aliases per compound, unless
#'   argument `collapse` is provided, in which case multiple synonyms are pasted
#'   into a single element separated by the value of `collapse`.
#' + `smiles`: the compound's SMILES (if provided).
#'
#' @family compound table creation functions
#'
#' @author Johannes Rainer and Jan Stanstrup
#'
#' @export
#'
#' @seealso [createCompDb()] for a function to create a SQLite-based compound
#'     database.
#'
#' @importFrom ChemmineR read.SDFset datablock datablock2ma
#'
#' @examples
#'
#' ## Read compound information from a subset of HMDB
#' fl <- system.file("sdf/HMDB_sub.sdf.gz", package = "CompoundDb")
#' cmps <- compound_tbl_sdf(fl)
#' cmps
#'
#' ## Column synonyms contains a list
#' cmps$synonyms
#'
#' ## If we provide the optional argument collapse, multiple entries will be
#' ## collapsed.
#' cmps <- compound_tbl_sdf(fl, collapse = "|")
#' cmps
#' cmps$synonyms
compound_tbl_sdf <- function(file, collapse) {
    if (missing(file))
        stop("Please provide the file name using 'file'")
    if (!file.exists(file))
        stop("Can not fine file ", file)
    res <- .simple_extract_compounds_sdf(
        datablock2ma(datablock(read.SDFset(file))))
    if (!missing(collapse)) {
        ## collapse elements from lists.
        res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse,
                               FUN.VALUE = "character")
    }
    res
}

#' @title Extract compound data from LipidBlast
#'
#' @description
#'
#' `compound_tbl_lipidblast` extracts basic comopund annotations from a
#' LipidBlast file in (json format) downloaded from
#' http://mona.fiehnlab.ucdavis.edu/downloads
#'
#' @param file `character(1)` with the name of the file name.
#'
#' @param collapse optional `character(1)` to be used to collapse multiple
#'     values in the columns `"synonyms"`. See examples for details.
#'
#' @return A [tibble::tibble] with general compound information (one row per
#' compound):
#' + `compound_id`: the ID of the compound.
#' + `compound_name`: the compound's name.
#' + `inchi`: the InChI of the compound.
#' + `inchikey`: the InChI key.
#' + `formula`: the chemical formula.
#' + `mass`: the compound's mass.
#' + `synonyms`: the compound's synonyms (aliases). This type of this column is
#'   by default a `list` to support multiple aliases per compound, unless
#'   argument `collapse` is provided, in which case multiple synonyms are pasted
#'   into a single element separated by the value of `collapse`.
#'
#' @family compound table creation functions
#'
#' @author Johannes Rainer and Jan Stanstrup
#'
#' @export
#'
#' @examples
#'
#' ## Read compound information from a subset of HMDB
#' fl <- system.file("json/MoNa-LipidBlast_sub.json", package = "CompoundDb")
#' cmps <- compound_tbl_lipidblast(fl)
#' cmps
compound_tbl_lipidblast <- function(file, collapse) {
    if (missing(file))
        stop("Please provide the file name using 'file'")
    if (!file.exists(file))
        stop("Can not fine file ", file)
    res <- .import_lipidblast(file)
    if (!missing(collapse)) {
        ## collapse elements from lists.
        res$synonyms <- vapply(res$synonyms, paste0, collapse = collapse,
                               FUN.VALUE = "character")
    }
    res
}

#' @description
#'
#' Internal function to extract compound information from a file in SDF format.
#'
#' @param x what is returned by datablock2ma(datablock(read.SDFset)).
#'
#' @return A [tibble::tibble] with columns `"compound_id"`, `"compound_name"`,
#'     `"inchi"`, `"formula"`, `"mass"`.
#'
#' @importFrom tibble data_frame
#'
#' @md
#'
#' @author Johannes Rainer
#'
#' @noRd
.simple_extract_compounds_sdf <- function(x) {
    source_db <- .guess_sdf_source(colnames(x))
    if (is.null(source_db))
        stop("The SDF file is not supported. Supported are SDF files from ",
             "HMDB, ChEBI and LipidMaps.")
    colmap <- get(paste0(".", source_db, "_colmap"))
    sep <- get(paste0(".", source_db, "_separator"))
    ## Fix missing COMMON_NAME entries in LipidMaps (see issue #1)
    nms <- x[, colmap["name"]]
    syns <- strsplit(x[, colmap["synonyms"]], split = sep)
    if (source_db == "lipidmaps") {
        nas <- is.na(nms)
        if (any(nas))
            nms[nas] <- vapply(syns[nas], `[[`, 1, FUN.VALUE = "character")
        nas <- is.na(nms)
        if (any(nas))
            nms[nas] <- x[nas, "SYSTEMATIC_NAME"]
    }
    res <- data_frame(compound_id = x[, colmap["id"]],
                      compound_name = nms,
                      inchi = x[, colmap["inchi"]],
                      inchi_key = x[, colmap["inchi_key"]],
                      formula = x[, colmap["formula"]],
                      mass = as.numeric(x[, colmap["mass"]]),
                      synonyms = syns
                      )
    if (is.na(colmap["smiles"])) {
        res$smiles <- NA_character_
    } else
        res$smiles <- x[, colmap["smiles"]]
    if (source_db == "mona") {
        warning("MoNa data can currently not be normalized and the ",
                "compound table contains thus highly redundant data.",
                call. = FALSE)
        ## Eventually reduce and normalize.
        inchis <- .extract_field_from_string(x[, "COMMENT"], "InChI")
        nona <- !is.na(inchis)
        inchis[nona] <- paste0("InChI", inchis[nona])
        res$inchi <- inchis
        res$compound_id <- .compound_id_from_mona_sdf(x)
        res
    } else res
}

#' @description
#'
#' Based on the provided `colnames` guess whether the file is from HMDB,
#' ChEBI or LipidBlast.
#'
#' @param x `character` with the column names of the data table.
#'
#' @return `character(1)` with the name of the resource or `NULL`.
#'
#' @md
#'
#' @author Johannes Rainer
#'
#' @noRd
.guess_sdf_source <- function(x) {
    if (any(x == "HMDB_ID"))
        return("hmdb")
    if (any(x == "ChEBI ID"))
        return("chebi")
    if (any(x == "LM_ID"))
        return("lipidmaps")
    if (any(x == "PUBCHEM_COMPOUND_CID"))
        return("pubchem")
    if (any(x == "ID")) {
        message("Guessing we've got a SDF file from MoNa.")
        return("mona")
    }
    NULL
}


.hmdb_colmap <- c(id = "HMDB_ID",
                  name = "GENERIC_NAME",
                  inchi = "INCHI_IDENTIFIER",
                  inchi_key = "INCHI_KEY",
                  formula = "FORMULA",
                  mass = "EXACT_MASS",
                  synonyms = "SYNONYMS",
                  smiles = "SMILES"
                  )
.hmdb_separator <- "; "
.chebi_colmap <- c(id = "ChEBI ID",
                   name = "ChEBI Name",
                   inchi = "InChI",
                   inchi_key = "InChIKey",
                   formula = "Formulae",
                   mass = "Monoisotopic Mass",
                   synonyms = "Synonyms",
                   smiles = "SMILES"
                   )
.chebi_separator <- " __ "
.lipidmaps_colmap <- c(id = "LM_ID",
                       name = "COMMON_NAME",
                       inchi = "INCHI",
                       inchi_key = "INCHI_KEY",
                       formula = "FORMULA",
                       mass = "EXACT_MASS",
                       synonyms = "SYNONYMS",
                       smiles = NA
                       )
.lipidmaps_separator <- "; "
.pubchem_colmap <- c(id = "PUBCHEM_COMPOUND_CID",
                     name = "PUBCHEM_IUPAC_TRADITIONAL_NAME",
                     inchi = "PUBCHEM_IUPAC_INCHI",
                     inchi_key = "PUBCHEM_IUPAC_INCHIKEY",
                     formula = "PUBCHEM_MOLECULAR_FORMULA",
                     mass = "PUBCHEM_EXACT_MASS",
                     synonyms = "PUBCHEM_IUPAC_TRADITIONAL_NAME",
                     smiles = "PUBCHEM_OPENEYE_CAN_SMILES"
                       # Others:
                                        # PUBCHEM_IUPAC_SYSTEMATIC_NAME,
                                        # PUBCHEM_IUPAC_CAS_NAME,
                                        # PUBCHEM_IUPAC_OPENEYE_NAME,
                                        # PUBCHEM_IUPAC_NAME
                     )
.pubchem_separator <- "; " # there seems to be none
.mona_colmap <- c(id = "ID",
                  name = "NAME",
                  inchi = "INCHIKEY",
                  inchi_key = "INCHIKEY",
                  formula = "FORMULA",
                  mass = "EXACT MASS",
                  synonyms = "SYNONYMS",
                  smiles = NA
                  )
.mona_separator <- " __ "

#' @description Import compound information from a LipidBlast file in json
#'     format.
#'
#' @note This is a modified version from Jan's generate_lipidblast_tbl that
#'     extracts the mass also from the json and does not calculate it.
#'
#' @author Jan Stanstrup and Johannes Rainer
#'
#' @importFrom jsonlite read_json
#' @importFrom dplyr bind_rows
#'
#' @md
#'
#' @noRd
.import_lipidblast <- function(file) {
    lipidb <- read_json(file)

    parse_element <- function(x) {
        id <- x$id
        cmp <- x$compound[[1]]
        ## get the name(s) -> name + aliases
        nms <- vapply(cmp$names, `[[`, "name", FUN.VALUE = "character")
        mass <- unlist(lapply(cmp$metaData, function(z) {
            if (z$name == "total exact mass")
                z$value
        }))
        if (is.null(mass))
            mass <- NA_character_
        frml <- unlist(lapply(cmp$metaData, function(z) {
            if (z$name == "molecular formula")
                z$value
        }))
        if (is.null(frml))
            mass <- NA_character_
        list(
            compound_id = x$id,
            compound_name = nms[1],
            inchi = cmp$inchi,
            inchi_key = NA_character_,
            formula = frml,
            mass = mass,
            synonyms = nms[-1]
        )
    }

    res <- lapply(lipidb, parse_element)
    bind_rows(res)
}

#' @title Create a CompDb database
#'
#' @description
#'
#' `createCompDb` creates a `SQLite`-based [`CompDb`] object/database
#' from a compound resource provided as a `data.frame` or `tbl`. Alternatively,
#' the name(s) of the file(s) from which the annotation should be extracted can
#' be provided. Supported are SDF files (such as those from the
#' *Human Metabolome Database* HMDB) that can be read using the
#' [compound_tbl_sdf()] or LipidBlast files (see [compound_tbl_lipidblast()].
#'
#' An additional `data.frame` providing metadata information including the data
#' source, date, version and organism is mandatory.
#'
#' Optionally MS/MS (MS2) spectra for compounds can be also stored in the
#' database. Currently only MS/MS spectra from HMDB are supported. These can
#' be downloaded in XML format from HMDB (http://www.hmdb.ca), loaded with
#' the [msms_spectra_hmdb()] or [msms_spectra_mona()] function and passed to
#' the function with the `msms_spectra` argument. See [msms_spectra_hmdb()] or
#' [msms_spectra_mona()] for information on the expected columns and format.
#'
#' Required columns for the `data.frame` providing the compound information (
#' parameter `x`) are:
#' + `compound_id`: the ID of the compound.
#' + `compound_name`: the compound's name.
#' + `inchi`: the InChI of the compound.
#' + `inchikey`: the InChI key.
#' + `formula`: the chemical formula.
#' + `mass`: the compound's mass.
#' + `"synonyms"`: additional synonyms/aliases for the compound. Should be
#'   either a single character or a list of values for each compound.
#' + `"smiles"`: the compound's SMILES.
#'
#' See e.g. [compound_tbl_sdf()] or [compound_tbl_lipidblast()] for functions
#' creating such compound tables.
#'
#' The metadata `data.frame` is supposed to have two columns named `"name"` and
#' `"value"` providing the following minimal information as key-value pairs
#' (see `make_metadata` for a unitlity function to create such a `data.frame):
#' + `"source"`: the source from which the data was retrieved (e.g. `"HMDB"`).
#' + `"url"`: the url from which the original data was retrieved.
#' + `"source_version"`: the version from the original data source
#'   (e.g. `"v4"`).
#' + `"source_date"`: the date when the original data source was generated.
#' + `"organism"`: the organism. Should be in the form `"Hsapiens"` or
#'   `"Mmusculus"`.
#'
#' @details
#'
#' Metadata information is also used to create the file name for the database
#' file. The name starts with `"CompDb"`, followed by the organism, the
#' data source and its version. A compound database file for HMDB version 4
#' with human metabolites will thus be named: `"CompDb.Hsapiens.HMDB.v4"`.
#'
#' A single `CompDb` database is created from multiple SDF files (e.g. for
#' *PubChem*) if all the file names are provided with parameter `x`. Parallel
#' processing is currently not enabled because SQLite does not support it yet
#' natively.
#'
#' @param x For `createCompDb`: `data.frame` or `tbl` with the compound
#'     annotations or `character` with the file name(s) from which the compound
#'     annotations should be retrieved. See description for details.
#'
#'     For `createCompDbPackage`: `character(1)` with the file name of the
#'     `CompDb` SQLite file (created by `createCompDb`).
#'
#' @param metadata For `createCompDb`: `data.frame` with metadata
#'     information. See description for details.
#'
#' @param msms_spectra For `createCompDb`: `data.frame` with MS/MS spectrum
#'     data. See [msms_spectra_hmdb()] for the expected format and a function
#'     to import such data from spectrum xml files from HMDB.
#'
#' @param path `character(1)` with the path to the directory where the database
#'     file or package folder should be written. Defaults to the current
#'     directory.
#'
#' @return For `createCompDb`: a `character(1)` with the database name
#'     (invisibly).
#'
#' @importFrom DBI dbDriver dbWriteTable dbExecute dbDisconnect
#' @importFrom RSQLite dbConnect
#' @importFrom dplyr bind_cols bind_rows
#'
#' @export
#'
#' @author Johannes Rainer
#'
#' @seealso
#'
#' [compound_tbl_sdf()] and [compound_tbl_lipidblast()] for functions
#' to extract compound annotations from files in SDF format, or files from
#' LipidBlast.
#'
#' [import_mona_sdf()] to import both the compound and spectrum data from a
#' SDF file from MoNa (Massbank of North America) in one call.
#'
#' [msms_spectra_hmdb()] and [msms_spectra_mona()] for functions to import
#' MS/MS spectrum data from xml files from HMDB or an SDF file from MoNa.
#'
#' [CompDb()] for how to use a compound database.
#'
#' @examples
#'
#' ## Read compounds for a HMDB subset
#' fl <- system.file("sdf/HMDB_sub.sdf.gz", package = "CompoundDb")
#' cmps <- compound_tbl_sdf(fl)
#'
#' ## Create a metadata data.frame for the compounds.
#' metad <- data.frame(name = c("source", "url", "source_version",
#'     "source_date", "organism"), value = c("HMDB", "http://www.hmdb.ca",
#'     "v4", "2017-08-27", "Hsapiens"))
#'
#' ## Alternatively use the make_metadata helper function
#' metad <- make_metadata(source = "HMDB", source_version = "v4",
#'     source_date = "2017-08", organism = "Hsapiens",
#'     url = "http://www.hmdb.ca")
#' ## Create a SQLite database in the temporary folder
#' db_f <- createCompDb(cmps, metadata = metad, path = tempdir())
#'
#' ## The database can be loaded and accessed with a CompDb object
#' db <- CompDb(db_f)
#' db
#'
#' ## Create a database for HMDB that includes also MS/MS spectrum data
#' metad2 <- make_metadata(source = "HMDB_with_spectra", source_version = "v4",
#'     source_date = "2017-08", organism = "Hsapiens",
#'     url = "http://www.hmdb.ca")
#'
#' ## Import spectrum information from selected MS/MS xml files from HMDB
#' ## that are provided in the package
#' xml_path <- system.file("xml", package = "CompoundDb")
#' spctra <- msms_spectra_hmdb(xml_path)
#'
#' ## Create a SQLite database in the temporary folder
#' db_f2 <- createCompDb(cmps, metadata = metad2, msms_spectra = spctra,
#'     path = tempdir())
#'
#' ## The database can be loaded and accessed with a CompDb object
#' db2 <- CompDb(db_f2)
#' db2
#'
#' ## Does the database contain MS/MS spectrum data?
#' hasMsMsSpectra(db2)
#'
#' ## Create a database for a ChEBI subset providing the file name of the
#' ## corresponding SDF file
#' metad <- make_metadata(source = "ChEBI_sub", source_version = "2",
#'     source_date = NA, organism = "Hsapiens", url = "www.ebi.ac.uk/chebi")
#' db_f <- createCompDb(system.file("sdf/ChEBI_sub.sdf.gz",
#'     package = "CompoundDb"), metadata = metad, path = tempdir())
#' db <- CompDb(db_f)
#' db
#'
#' compounds(db)
#'
#' ## connect to the database and query it's tables using RSQlite
#' library(RSQLite)
#' con <- dbConnect(dbDriver("SQLite"), db_f)
#'
#' dbGetQuery(con, "select * from metadata")
#' dbGetQuery(con, "select * from compound")
#'
#' ## To create a CompDb R-package we could simply use the
#' ## createCompDbPackage function on the SQLite database file name.
createCompDb <- function(x, metadata, msms_spectra, path = ".") {
    .valid_metadata(metadata)
    db_file <- paste0(path, "/", .db_file_from_metadata(metadata), ".sqlite")
    con <- dbConnect(dbDriver("SQLite"), dbname = db_file)
    ## Add additional metadata info
    metadata$name <- as.character(metadata$name)
    metadata$value <- as.character(metadata$value)
    metadata <- bind_rows(metadata, c(name = "db_creation_date",
                                      value = date()))
    metadata <- bind_rows(metadata, c(name = "supporting_package",
                                      value = "CompoundDb"))
    metadata <- bind_rows(metadata, c(name = "supporting_object",
                                      value = "CompDb"))
    dbWriteTable(con, name = "metadata", metadata, row.names = FALSE)
    ## compound table
    if (is.character(x)) {
        lapply(x, function(z) {
            message("Import data from ", basename(z), " ...", appendLF = FALSE)
            if (.is_sdf_filename(z)) {
                tbl <- compound_tbl_sdf(z)
            } else {
                ## Assume this is a json file... might simply be wrong.
                tbl <- compound_tbl_lipidblast(z)
            }
            message("OK")
            .valid_compound(tbl, db = FALSE)
            tbl_syn <- bind_cols(compound_id = rep(tbl$compound_id,
                                                   lengths(tbl$synonyms)),
                                 synonym = unlist(tbl$synonyms))
            tbl <- tbl[, colnames(tbl) != "synonyms"]
            dbWriteTable(con, name = "synonym", tbl_syn, row.names = FALSE,
                         append = TRUE)
            dbWriteTable(con, name = "compound", row.names = FALSE,
                         tbl[, colnames(tbl) != "synonyms"], append = TRUE)
        })
    } else {
        .valid_compound(x, db = FALSE)
        x_synonym <- bind_cols(compound_id = rep(x$compound_id,
                                                 lengths(x$synonyms)),
                               synonym = unlist(x$synonyms))
        dbWriteTable(con, name = "synonym", x_synonym, row.names = FALSE)
        dbWriteTable(con, name = "compound", x[, colnames(x) != "synonyms"],
                     row.names = FALSE)
    }
    ## Creating indices
    dbExecute(con, "create index compound_id_idx on compound (compound_id)")
    dbExecute(con, "create index compound_name_idx on compound (compound_name)")
    ## Process spectra.
    if (!missing(msms_spectra) && is.data.frame(msms_spectra)) {
        comp_ids <- unique(x$compound_id)
        if (!all(msms_spectra$compound_id %in% comp_ids))
            stop("All compound identifiers in 'msms_spectra' have to be ",
                 "present also in 'x'")
        msms_spectra <- msms_spectra[msms_spectra$compound_id %in% comp_ids, ]
        .insert_msms_spectra(con, msms_spectra)
    }
    dbDisconnect(con)
    invisible(db_file)
}

#' Add mz_min and mz_max columns to data.frame x
#'
#' @param x `data.frame` with MS MS spectrum data such as returned by
#'     [msms_spectra_hmdb()] **and** eventually collapsed with
#'     [.collapse_spectrum_df].
#'
#' @noRd
#'
#' @author Johannes Rainer
.add_mz_range_column <- function(x) {
    if (!any(colnames(x) == "mz"))
        stop("Required column 'mz' not found")
    if (is.list(x$mz)) {
        mzr <- do.call(rbind, lapply(x$mz, range))
        x$msms_mz_range_min <- mzr[, 1]
        x$msms_mz_range_max <- mzr[, 2]
        x
    } else {
        fct <- factor(x$spectrum_id, levels = unique(x$spectrum_id))
        mzs <- split(x$mz, f = fct)
        mzr <- do.call(rbind, lapply(mzs, range, na.rm = TRUE))
        x$msms_mz_range_min <- unsplit(mzr[, 1], fct)
        x$msms_mz_range_max <- unsplit(mzr[, 2], fct)
        x
    }
}

#' Function to insert MS/MS spectrum data into two database tables:
#' msms_spectrum_peak with the individual m/z and intensity values and
#' msms_spectrum_metadata with the spectrum annotations. Both tables are linked
#' via the spectrum_id.
#'
#' @param con database connection
#'
#' @param x `data.frame` with the spectrum data, such as returned by
#' [msms_spectra_hmdb]
#'
#' @noRd
#'
#' @author Johannes Rainer
.insert_msms_spectra <- function(con, x) {
    x$collision_energy <- as.character(x$collision_energy)
    .valid_msms_spectrum(x, blob = is.list(x$mz))
    x <- .add_mz_range_column(x)
    msms_spectrum_peak <- x[, c("spectrum_id", "mz", "intensity")]
    if (is.list(msms_spectrum_peak$mz))
        msms_spectrum_peak <- .expand_spectrum_df(msms_spectrum_peak)
    if (!is.numeric(msms_spectrum_peak$mz) ||
        !is.numeric(msms_spectrum_peak$intensity))
        stop("Columns 'mz' and 'intensity' should only contain numeric values")
    msms_spectrum_peak <- msms_spectrum_peak[
        order(msms_spectrum_peak$spectrum_id, msms_spectrum_peak$mz), ]
    msms_spectrum_peak$peak_id <- seq_len(nrow(msms_spectrum_peak))
    x <- unique(x[, !(colnames(x) %in% c("mz", "intensity"))])
    dbWriteTable(con, name = "msms_spectrum_peak", msms_spectrum_peak,
                 row.names = FALSE)
    dbWriteTable(con, name = "msms_spectrum", x, row.names = FALSE)
    dbExecute(con, "create index msms_id_idx on msms_spectrum_peak (spectrum_id)")
    dbExecute(con, "create index msms_pid_idx on msms_spectrum_peak (peak_id)")
    dbExecute(con, "create index msms_mid_idx on msms_spectrum (spectrum_id)")
    dbExecute(con, "create index msms_cid_idx on msms_spectrum (compound_id)")
}


.required_metadata_keys <- c("source", "url", "source_version", "source_date",
                             "organism")
.required_compound_db_columns <- c("compound_id", "compound_name", "inchi",
                                   "inchi_key", "formula", "mass")
.required_compound_columns <- c(.required_compound_db_columns, "synonyms")

.required_msms_spectrum_columns <- c(spectrum_id = "integer",
                                     compound_id = "character",
                                     polarity = "integer",
                                     collision_energy = "character",
                                     predicted = "logical",
                                     splash = "character",
                                     instrument_type = "character",
                                     instrument = "character",
                                     precursor_mz = "numeric",
                                     mz = "numeric",
                                     intensity = "numeric"
                                     )

.required_msms_spectrum_columns_blob <- c(spectrum_id = "integer",
                                          compound_id = "character",
                                          polarity = "integer",
                                          collision_energy = "character",
                                          predicted = "logical",
                                          splash = "character",
                                          instrument_type = "character",
                                          instrument = "character",
                                          precursor_mz = "numeric",
                                          mz = "list",
                                          intensity = "list"
                                          )

.valid_msms_spectrum <- function(x, blob = FALSE, error = TRUE) {
    coltypes <- unlist(lapply(x, class))
    msg <- NULL
    if (blob)
        req_cols <- .required_msms_spectrum_columns_blob
    else req_cols <- .required_msms_spectrum_columns
    not_found <- which(!(names(req_cols) %in% names(coltypes)))
    if (length(not_found))
        msg <- c(msg, paste0("Required column(s) ",
                             paste0("'", names(req_cols)[not_found], "'"),
                             " not found in 'msms_spectra'"))
    common_cols <- union(names(coltypes), names(req_cols))
    wrong_type <- which(coltypes[common_cols] != req_cols[common_cols])
    if (length(wrong_type))
        msg <- c(msg,
                 paste0("One or more columns contain data of the wrong type: ",
                        paste(names(coltypes[names(wrong_type)]),
                              coltypes[names(wrong_type)], sep = ":",
                              collapse = ", "),
                        ". Please refer to the documentation of createCompDb ",
                        "for the correct data types."))
    if (length(msg))
        if (error) stop(msg)
        else msg
    else TRUE
}

#' @description Create the database file name from the metadata `data.frame`.
#'     The function checks also for the presence of all required metadata fields
#'     ensuring that these are also not `NA` or `NULL`.
#'
#' @noRd
.db_file_from_metadata <- function(x) {
    paste0("CompDb.", x$value[x$name == "organism"], ".",
           x$value[x$name == "source"], ".",
           x$value[x$name == "source_version"])
}

#' @description Check the metadata `data.frame` for required columns.
#'
#' @noRd
.valid_metadata <- function(metadata, error = TRUE) {
    txt <- character()
    if (!is.data.frame(metadata))
        txt <- c(txt, "'metadata' is expected to be a data.frame")
    if (all(c("name", "value") %in% colnames(metadata))) {
        keys <- metadata$name
        vals <- metadata$value
        got_it <- .required_metadata_keys %in% keys
        if (!all(got_it))
            txt <- c(txt, paste0("required fields ",
                                 paste0(.required_metadata_keys[!got_it],
                                        collapse = ", "),
                                 " not found in metadata data.frame"))
        vals <- vals[keys %in% .required_metadata_keys]
    } else {
        txt <- c(txt, paste0("metadata data.frame needs to have columns ",
                             "named 'name' and 'value'"))
    }
    if (length(txt))
        if (error)
            stop(paste(txt, collapse = "\n"))
        else txt
    else TRUE
}

#' @description Check that the compounds table contains all required data.
#'
#' @param db `logical(1)` whether validity should be checked on the internal
#'     database table instead of the input file.
#' @md
#'
#' @noRd
.valid_compound <- function(x, error = TRUE, db = TRUE) {
    txt <- character()
    if (!is.data.frame(x))
        txt <- c(txt, "'x' is supposed to be a data.frame")
    .req_cols <- .required_compound_db_columns
    if (!db)
        .req_cols <- .required_compound_columns
    got_it <- .req_cols %in% colnames(x)
    if (!all(got_it)) {
        txt <- c(txt, paste0("Miss required columns: ",
                             paste0(.required_compound_columns[!got_it],
                                    collapse = ", ")))
    } else {
        if (!is.numeric(x$mass))
            txt <- c(txt, "Column 'mass' should be numeric")
    }
    ## if (db) {
    ##     ## Do not allow more columns than expected!
    ##     is_ok <- colnames(x) %in% .req_cols
    ##     if (any(!is_ok)) {
    ##         txt <- c(txt, paste0("Column(s) ", paste(colnames(x)[!is_ok]),
    ##                              " are not expected"))
    ##     }
    ## }
    if (length(txt))
        if (error)
            stop(paste(txt, collapse = "\n"))
        else txt
    else TRUE
}

#' @description
#'
#' `createCompDbPackage` creates an R data package with the data from a
#' [`CompDb`] object.
#'
#' @importFrom Biobase createPackage
#'
#' @param version For `createCompDbPackage`: `character(1)` with the version
#'     of the package (ideally in the format `"x.y.z"`).
#'
#' @param maintainer For `createCompDbPackage`: `character(1)` with the
#'     name and email address of the package maintainer (in the form
#'     `"First Last <first.last@provider.com>"`.
#'
#' @param author For `createCompDbPackage`: `character(1)` with the name
#'     of the package author.
#'
#' @param license For `createCompDbPackage`: `character(1)` with the
#'     license of the package respectively the originating provider.
#'
#' @export
#'
#' @rdname createCompDb
createCompDbPackage <- function(x, version, maintainer, author,
                                    path = ".", license = "Artistic-2.0") {
    if (missing(x) | missing(version) | missing(maintainer) | missing(author))
        stop("'x', 'version', 'maintainer' and 'author' are required")
    if (!is.character(x))
        stop("'x' is supposed to be the file name of the CompDb")
    cdb <- CompDb(x)
    metad <- .metadata(cdb)
    pkg_name <- .db_file_from_metadata(metad)
    m_source <- .metadata_value(cdb, "source")
    m_source_version <- .metadata_value(cdb, "source_version")
    m_source_date <- .metadata_value(cdb, "source_date")
    m_organism <- .metadata_value(cdb, "organism")
    m_source_url <- .metadata_value(cdb, "url")
    template_path <- system.file("pkg-template", package = "CompoundDb")
    symvals <- list(
        PKGTITLE = paste0(m_source, " compound annotation package"),
        PKGDESCRIPTION = paste0("Exposes a CompDb compound annotation ",
                                "databases with annotations retrieved from ",
                                m_source, "."),
        PKGVERSION  = version,
        AUTHOR = author,
        MAINTAINER = maintainer,
        LIC = license,
        ORGANISM = m_organism,
        SPECIES = m_organism,
        SOURCE = m_source,
        SOURCEVERSION = as.character(m_source_version),
        RELEASEDATE = m_source_date,
        SOURCEURL =  m_source_url,
        DBOBJNAME = pkg_name
    )
    createPackage(pkgname = pkg_name, destinationDir = path,
                  originDir = template_path, symbolValues = symvals)
    sqlite_path <- file.path(path, pkg_name, "inst", "extdata")
    dir.create(sqlite_path, showWarnings = FALSE, recursive = TRUE)
    file.copy(x, to = file.path(sqlite_path, paste0(pkg_name, ".sqlite")))
    invisible(file.path(path, pkg_name))
}

#' @description Simple test function to evaluate whether a file is an SDF file
#'     based on its file name.
#'
#' @param x `character(1)` the file name to be tested.
#'
#' @noRd
#'
#' @md
#'
#' @author Johannes Rainer
.is_sdf_filename <- function(x) {
    grepl(".sdf($|.gz$)", x, ignore.case = TRUE)
}

#' @description `make_metadata` helps generating a metadata `data.frame` in the
#'     correct format expected by the `createCompDb` function. The function
#'     returns a `data.frame`.
#'
#' @param source For `make_metadata`: `character(1)` with the name of the
#'     resource that provided the compound annotation.
#'
#' @param url For `make_metadata`: `character(1)` with the url to the original
#'     resource.
#'
#' @param source_version For `make_metadata`: `character(1)` with the version
#'     of the original resource providing the annotation.
#'
#' @param source_date For `make_metadata`: `character(1)` with the date of the
#'     resource's release.
#'
#' @param organism For `make_metadata`: `character(1)` with the name of the
#'     organism. This should be in the format `"Hsapiens"` for human,
#'     `"Mmusculus"` for mouse etc.
#'
#' @export
#'
#' @rdname createCompDb
make_metadata <- function(source, url, source_version, source_date, organism) {
    if (any(c(missing(source), missing(url), missing(source_version),
              missing(source_date), missing(organism))))
        stop("Arguments 'source', 'url', 'source_version', 'source_date', ",
             "'organism' are required")
    if (any(c(is.null(source), is.null(url), is.null(source_version),
              is.null(source_date), is.null(organism))))
        stop("'NULL' is not allowed for 'source', 'url', 'source_version', ",
             "'organism'")
    data.frame(name = c("source", "url", "source_version", "source_date",
                        "organism"),
               value = as.character(c(source, url, source_version,
                                      source_date, organism)),
               stringsAsFactors = FALSE)
}

#' @title Import compound and spectrum information from MoNa
#'
#' `import_mona_sdf` allows to import compound and spectrum information from an
#' SDF file from MoNa (Massbank of North America
#' http://mona.fiehnlab.ucdavis.edu/). This function is a convenience function
#' using the [compound_tbl_sdf()] and [msms_spectra_mona()] functions for data
#' import but avoiding to read the SDF files twice.
#'
#' @note
#'
#' MoNa SDF files organize the data by individual spectra (i.e. each element
#' is one spectrum) and individual compounds can not easily and consistently
#' defined (i.e. not all entries have an InChI ID or other means to uniquely
#' identify compounds). Thus, the function returns a highly redundant compount
#' table. Feedback on how to reduce this redundancy would be highly welcome!
#'
#' @param x `character(1)` being the SDF file name.
#'
#' @param nonStop `logical(1)` wheter file content specific errors should
#'     only reported as warnings and not break the full import process.
#'
#' @author Johannes Rainer
#'
#' @return A `list` with elements `"compound"` and `"msms_spectrum"` containing
#'     data.frames with compound and MS/MS spectrum data, respectively.
#'
#' @export
#'
#' @seealso
#'
#' [compound_tbl_sdf()] to read only the compound information.
#'
#' [msms_spectra_mona()] to read only the spectrum data.
#'
#' @examples
#'
#' ## Define the test file containing a small subset from MoNa
#' fl <- system.file("sdf/MoNa_export-All_Spectra_sub.sdf.gz",
#'     package = "CompoundDb")
#'
#' ## Import the data
#' res <- import_mona_sdf(fl)
import_mona_sdf <- function(x, nonStop = TRUE) {
    message("Reading SDF file ... ", appendLF = FALSE)
    sdfs <- datablock2ma(datablock(read.SDFset(x, skipErrors = nonStop)))
    if (!any(colnames(sdfs) == "MASS SPECTRAL PEAKS"))
        stop("The provided file does not contain \"MASS SPECTRAL PEAKS\" ",
             "elements. Is this an SDF file from MoNa?")
    message("OK")
    message("Extracting compound information ... ", appendLF = FALSE)
    suppressMessages(cmps <- .simple_extract_compounds_sdf(sdfs))
    message("OK")
    message("Extracting spectrum information ... ", appendLF = FALSE)
    spctra <- .extract_spectra_mona_sdf(sdfs)
    message("OK")
    list(compound = cmps, msms_spectrum = spctra)
}
michaelwitting/CompoundDb documentation built on April 29, 2020, 8:42 p.m.