#' @title Build database
#'
#' @description
#' Build the sqlite database \code{inst/extdata/database.sqlite} from scratch
#' (an already existing file is removed first) and integrate:
#' \itemize{
#'      \item ChEBI (table \code{ChEBI}, filled with \code{get_CHEBI_db()})
#'      \item PubMed (table \code{PubMed}, filled with \code{get_PUBMED_db()})
#' }
#'
#' The connection is closed when the function exits, including when one of
#' the two table builders fails.
build_db <- function() {
  db_path <- "inst/extdata/database.sqlite"
  # always start from an empty database file
  if (file.exists(db_path)) file.remove(db_path)
  db <- DBI::dbConnect(RSQLite::SQLite(), db_path)
  # registered right after connecting so that an error in one of the
  # builders below cannot leave the connection open
  on.exit(DBI::dbDisconnect(db), add = TRUE)
  message("--------------------- BUILD CHeBI database ----------------------")
  DBI::dbWriteTable(db, "ChEBI", get_CHEBI_db())
  message("--------------------- BUILD PubMed database ----------------------")
  DBI::dbWriteTable(db, "PubMed", get_PUBMED_db())
  message("-----------------------------------------------------------------")
}
#' @title Get the ChEBI database
#'
#' @description
#' Read the ChEBI sdf file \code{data-raw/chebi.sdf} with ChemmineR, turn its
#' data blocks into a matrix, reorder the columns so that the seven mandatory
#' ones come first, rename them and pass the result to \code{format_db}
#' @seealso format_db
#'
#' The download of the sdf archive is currently disabled: the file has to be
#' present on disk before calling this function
#'
#' @return dataframe returned by \code{format_db}, with columns
#' \itemize{
#'      \item id
#'      \item name
#'      \item synonyms
#'      \item smiles
#'      \item inchikey
#'      \item formula
#'      \item charge
#'      \item ...
#' }
get_CHEBI_db <- function() {
  # location of the archive, only used by the disabled download below
  file_url <- "ftp://ftp.ebi.ac.uk/pub/databases/chebi/SDF/ChEBI_complete.sdf.gz"
  sdf_file <- file.path("data-raw", "chebi.sdf")
  # utils::download.file(file_url, sdf_file)

  message("open sdf file")
  sdf_set <- ChemmineR::read.SDFset(sdf_file, skipErrors = TRUE)
  chebi <- ChemmineR::datablock2ma(ChemmineR::datablock(sdf_set))

  # the first seven positions become the mandatory columns renamed below;
  # source column 8 is selected twice: once as "smiles" and once more
  # under its original name
  column_order <- c(1, 2, 14, 8, 7, 9, 10, 3:6, 8, 11:13, 15:118)
  mandatory_names <- c("id", "name", "synonyms", "smiles",
                       "inchikey", "formula", "charge")
  chebi <- chebi[, column_order]
  colnames(chebi)[1:7] <- mandatory_names

  format_db(chebi)
}
#' @title Format dataframe of compounds from a database
#'
#' @description
#' Format each entry of a compound database
#' \enumerate{
#'      \item replace each space in column names by an underscore
#'      \item remove duplicated (case-insensitive) or missing column names
#'      \item format each text entry @seealso format_text
#'      \item remove duplicated (case-insensitive) or missing primary key
#'          (column \code{id})
#'      \item format charge entries @seealso format_int
#'      \item format chemical formulas @seealso format_formula
#'      \item format InChIKeys @seealso format_inchikey
#'      \item compute basepeak masses @seealso get_basepeak_mass
#'      \item add the ChEBI web link of each entry
#' }
#'
#' @param data matrix, must contains columns:
#' \itemize{
#'      \item id
#'      \item name
#'      \item synonyms
#'      \item smiles
#'      \item inchikey
#'      \item formula
#'      \item charge
#'      \item ...
#' }
#'
#' @return dataframe with formatted columns
#' \itemize{
#'      \item id
#'      \item name
#'      \item synonyms
#'      \item smiles
#'      \item inchikey
#'      \item formula
#'      \item charge
#'      \item ...
#'      \item basepeak_mass
#'      \item link
#' }
format_db <- function(data) {
  column_names <- gsub(" ", "_", format_text(colnames(data)))
  unique_columns <- which(
    !duplicated(stringr::str_to_lower(column_names)) & !is.na(column_names)
  )
  kept_names <- column_names[unique_columns]
  # as.matrix() leaves a matrix untouched; it only makes the coercion that
  # apply() used to perform explicit
  cells <- as.matrix(data[, unique_columns, drop = FALSE])

  message("format text")
  # Format column by column and build the dataframe from the list: apply()
  # simplifies its result to a plain vector when there is a single row, which
  # made the column renaming below fail for one-entry inputs
  columns <- lapply(
    seq_len(ncol(cells)),
    function(j) format_text(cells[, j])
  )
  # temporary syntactic names, replaced by the formatted ones right after
  names(columns) <- paste0("V", seq_along(columns))
  data <- data.frame(columns, stringsAsFactors = FALSE)
  colnames(data) <- kept_names

  unique_primary_key <- which(
    !duplicated(stringr::str_to_lower(data[, "id"])) & !is.na(data[, "id"])
  )
  data <- data[unique_primary_key, , drop = FALSE]
  data[, "charge"] <- format_int(data[, "charge"])
  message("format chemical formulas")
  data[, "formula"] <- format_formula(data[, "formula"])
  data[, "inchikey"] <- format_inchikey(data[, "inchikey"])
  message("compute basepeak masses")
  data[, "basepeak_mass"] <- get_basepeak_mass(
    data[, "formula"],
    data[, "charge"]
  )
  data[, "link"] <- paste0(
    "https://www.ebi.ac.uk/chebi/searchId.do?chebiId=",
    data[, "id"]
  )
  # NOTE(review): nothing is written here, the dataframe is only returned;
  # the message presumably announces what the caller does next
  message("save in sqlite database")
  data
}
#' @title Get the PubMed database
#'
#' @description
#' Download the whole PubMed database by requesting all articles published
#' between year 0 and year 3000. The records are saved by batches of 5000 in
#' \code{data-raw/PubMed} (created if needed), then each article is converted
#' to a dataframe with easyPubMed and all of them are bound by rows
#'
#' @return dataframe with the columns produced by
#' \code{easyPubMed::article_to_df} (keywords requested, authors not
#' requested), or \code{NULL} if no article was parsed
get_PUBMED_db <- function() {
  outdir <- "data-raw/PubMed"
  # recursive: the parent "data-raw" directory may not exist yet
  if (!dir.exists(outdir)) dir.create(outdir, recursive = TRUE)
  pubmed_files <- easyPubMed::batch_pubmed_download(
    '"0000/01/01"[PDAT] : "3000/12/31"[PDAT]',
    dest_dir = outdir,
    batch_size = 5000
  )
  # NOTE(review): articles_to_list() receives all the downloaded file names
  # at once - confirm it handles more than one file
  articles <- easyPubMed::articles_to_list(pubmed_files)
  do.call(rbind, lapply(
    articles,
    easyPubMed::article_to_df,
    autofill = TRUE, max_chars = -1, getKeywords = TRUE, getAuthors = FALSE
  ))
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.