R/generate_db.R

Defines functions get_PUBMED_db format_db get_CHEBI_db build_db

Documented in build_db format_db get_CHEBI_db

#' @title Build database
#'
#' @description
#' Build the sqlite database stored in \code{inst/extdata/database.sqlite}.
#' Any existing file at that path is removed first, then one table is written
#' per integrated source:
#' \itemize{
#'       \item ChEBI (table \code{ChEBI}, filled by \code{get_CHEBI_db()})
#'       \item PubMed (table \code{PubMed}, filled by \code{get_PUBMED_db()})
#' }
#'
#' @return \code{NULL}, invisibly (the value of the final \code{message()})
build_db <- function() {
    db_path <- "inst/extdata/database.sqlite"
    if (file.exists(db_path)) file.remove(db_path)
    db <- DBI::dbConnect(RSQLite::SQLite(), db_path)
    # Guarantee the connection is released even when one of the builders or
    # dbWriteTable() fails halfway through.
    on.exit(DBI::dbDisconnect(db), add = TRUE)
    message("--------------------- BUILD CHeBI database ----------------------")
    DBI::dbWriteTable(db, "ChEBI", get_CHEBI_db())
    message("--------------------- BUILD PubMed database ----------------------")
    DBI::dbWriteTable(db, "PubMed", get_PUBMED_db())
    message("-----------------------------------------------------------------")
}

#' @title Get the ChEBI database
#'
#' @description
#' Get the ChEBI database by parsing the sdf file \code{data-raw/chebi.sdf}
#' with ChemmineR, reordering its data block columns and formatting them with
#' \code{\link{format_db}}.
#'
#' The download step is currently disabled: the sdf file must already be
#' present at \code{data-raw/chebi.sdf}.
#'
#' @return dataframe with columns
#' \itemize{
#'      \item id
#'      \item name
#'      \item inchikey
#'      \item formula
#'      \item synonyms
#'      \item smiles
#'      \item charge
#'      \item ...
#' }
get_CHEBI_db <- function() {
    outdir <- "data-raw"
    file_url <- "ftp://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete.sdf.gz"
    sdf_file <- file.path(outdir, "chebi.sdf")
    # utils::download.file(file_url, sdf_file)

    message("open sdf file")
    data <- ChemmineR::datablock2ma(
        ChemmineR::datablock(
            ChemmineR::read.SDFset(sdf_file, skipErrors = TRUE)))

    # The reordering below addresses columns by position up to index 14, so a
    # narrower matrix cannot be mapped onto the expected layout.
    n_col <- ncol(data)
    if (n_col < 14L) {
        stop("expected at least 14 data block columns in ", sdf_file,
            ", found ", n_col, call. = FALSE)
    }
    # Keep every column after the 14th, however many the sdf file provides,
    # instead of assuming exactly 118 columns.
    trailing <- if (n_col > 14L) 15:n_col else integer(0)
    # The first seven selected columns are the ones renamed below; column 8 is
    # listed a second time among the remaining columns, as before.
    # drop = FALSE keeps a matrix even when the file holds a single record.
    data <- data[, c(1, 2, 14, 8, 7, 9, 10, 3:6, 8, 11:13, trailing),
        drop = FALSE]
    colnames(data)[1:7] <- c("id", "name", "synonyms", "smiles",
        "inchikey", "formula", "charge")
    format_db(data)
}

#' @title Format dataframe of compounds from a database
#'
#' @description
#' Format each entry of a compound database
#' \enumerate{
#'      \item replace each space in column names by an underscore
#'      \item remove columns whose name is missing or duplicated
#'          (case-insensitive comparison)
#'      \item format each text entry with \code{format_text}
#'      \item remove rows whose primary key (column \code{id}) is missing or
#'          duplicated (case-insensitive comparison)
#'      \item format charge entries with \code{format_int}
#'      \item format chemical formulas with \code{format_formula}
#'      \item format InChIKeys with \code{format_inchikey}
#'      \item compute basepeak masses with \code{get_basepeak_mass}
#'      \item add a link to the ChEBI web page of each entry
#' }
#'
#' @param data matrix, must contains columns:
#'  \itemize{
#'      \item id
#'      \item name
#'      \item synonyms
#'      \item smiles
#'      \item inchikey
#'      \item formula
#'      \item charge
#'      \item ...
#' }
#'
#' @return dataframe with formatted columns
#'  \itemize{
#'      \item id
#'      \item name
#'      \item synonyms
#'      \item smiles
#'      \item inchikey
#'      \item formula
#'      \item charge
#'      \item ...
#'      \item basepeak_mass
#'      \item link
#' }
format_db <- function(data) {
    column_names <- gsub(" ", "_", format_text(colnames(data)))
    unique_columns <- which(!duplicated(stringr::str_to_lower(column_names)) &
        !is.na(column_names))
    data <- data[, unique_columns, drop = FALSE]
    column_names <- column_names[unique_columns]

    # Fail early, with an explicit message, when a column used below is absent.
    missing_columns <- setdiff(c("id", "charge", "formula", "inchikey"),
        column_names)
    if (length(missing_columns) > 0) {
        stop("missing column(s) in data: ",
            paste(missing_columns, collapse = ", "), call. = FALSE)
    }

    message("format text")
    # apply() returns a plain vector when `data` has a single row; rebuilding
    # the matrix keeps one formatted column per input column in that case too.
    formatted <- matrix(apply(data, 2, format_text),
        nrow = nrow(data), ncol = ncol(data))
    data <- data.frame(formatted, stringsAsFactors = FALSE)
    colnames(data) <- column_names

    unique_primary_key <- which(!duplicated(stringr::str_to_lower(data[, "id"])) &
        !is.na(data[, "id"]))
    data <- data[unique_primary_key, , drop = FALSE]
    data[, "charge"] <- format_int(data[, "charge"])
    message("format chemical formulas")
    data[, "formula"] <- format_formula(data[, "formula"])
    data[, "inchikey"] <- format_inchikey(data[, "inchikey"])
    message("compute basepeak masses")
    data[, "basepeak_mass"] <- get_basepeak_mass(data[, "formula"], data[, "charge"])
    data[, "link"] <- paste0("https://www.ebi.ac.uk/chebi/searchId.do?chebiId=",
        data[, "id"])
    message("save in sqlite database")
    data
}

#' @title Get the PubMed database
#'
#' @description
#' Download the whole PubMed database with easyPubMed by requesting all
#' articles published between year 0 and year 3000. The downloaded batches are
#' stored in \code{data-raw/PubMed} (created if needed), then each article is
#' converted to a dataframe and all of them are bound together by row.
#'
#' @return dataframe with one row per article and the columns produced by
#'      \code{easyPubMed::article_to_df} (called with \code{autofill = TRUE},
#'      \code{max_chars = -1}, \code{getKeywords = TRUE} and
#'      \code{getAuthors = FALSE}), or \code{NULL} if no article was parsed
get_PUBMED_db <- function() {
    outdir <- "data-raw/PubMed"
    # recursive = TRUE so the call also succeeds when "data-raw" is missing
    if (!dir.exists(outdir)) dir.create(outdir, recursive = TRUE)
    pubmed_files <- easyPubMed::batch_pubmed_download(
        '"0000/01/01"[PDAT] : "3000/12/31"[PDAT]', dest_dir = outdir,
        batch_size = 5000)
    # NOTE(review): pubmed_files may name several batch files, possibly
    # relative to `outdir`; confirm that articles_to_list() reads all of them
    # from the current working directory.
    # easyPubMed functions are namespace-qualified, like every other external
    # call in this file, so they resolve without attaching the package.
    do.call(rbind, lapply(easyPubMed::articles_to_list(pubmed_files),
        easyPubMed::article_to_df, autofill = TRUE, max_chars = -1,
        getKeywords = TRUE, getAuthors = FALSE))
}
shutinet/metabSeek documentation built on Sept. 5, 2020, 12:57 a.m.