R/drugage_annot.R
In customCMPdb: Customize and Query Compound Annotation Database

Documented in buildDrugAgeDB processDrugage

#' Build DrugAge Annotation Database
#' 
#' This function builds the DrugAge annotation SQLite database from the 
#' 'drugage_id_mapping' table stored in the 'inst/extdata' directory of this 
#' package. The 'drugage_id_mapping.tsv' table contains the DrugAge compounds 
#' annotation information (such as species, avg_lifespan_change etc) as well 
#' as the compound name to ChEMBL id and PubChem id mappings. 
#' 
#' Part of the id mappings in the 'drugage_id_mapping.tsv' table is generated 
#' by the \code{processDrugage} function for compound names that have ChEMBL 
#' ids from the ChEMBL database (version 24). The missing IDs were added 
#' manually. A semi-manual approach was to use this web service: 
#' \url{https://cts.fiehnlab.ucdavis.edu/batch}. After the semi-manual process,
#' the left ones were manually mapped to ChEMBL, PubChem and DrugBank ids. 
#' The mixed items were commented. 
#' @param da_path character(1), file path to the tabular file generated from
#' \code{processDrugage} function and manually edited the missing IDs.
#' @param dest_path character(1), destination path of the result DrugAge 
#' annotation SQLite database
#' @return DrugAge annotation SQLite database 
#' @examples 
#' buildDrugAgeDB(dest_path=tempfile(fileext="_drugage.db"))
#' @export
#'  
buildDrugAgeDB <- function(da_path=system.file("extdata/drugage_id_mapping.tsv", 
                                    package="customCMPdb"), dest_path){
    ## Read DrugAge Mapping from inst/extdata
    
    # require(gsheet)
    # url<-"https://bit.ly/3dCWKWo"
    # drugAge_mapping <- suppressWarnings(gsheet2tbl(url))
    # ## Add columns names
    # names(drugAge_mapping) <- as.character(unlist(drugAge_mapping[1,]))
    # drugAge_mapping <- drugAge_mapping[-1,]
    
    drugAge_mapping <- read.delim(da_path)
    ## Create internal DrugAge_id named ida000xxx
    drugAge_ids <- paste0("ida",sprintf("%05d",1:nrow(drugAge_mapping)))
    drugAge_mapping2 <- data.frame(drugage_id=drugAge_ids, drugAge_mapping)
    
    ## Create DrugAge Database
    da <- dbConnect(SQLite(), dest_path)
    ## write id_mapping table.
    id_maptb <- na.omit(data.frame(drugage_id=drugAge_mapping2$drugage_id,
                                   chembl_id=drugAge_mapping2$chembl_id, 
                                   stringsAsFactors=FALSE)) 
    dbWriteTable(da,'id_mapping', id_maptb) 
    
    ## write DrugAge Annotation table
    drugAgeAnnot <- data.frame(drugAge_mapping2[1:13], drugAge_mapping2[15:16])
    dbWriteTable(da, 'drugAgeAnnot', drugAgeAnnot)
    dbDisconnect(da)
}

#' Process Source DrugAge Dataset
#' 
#' This function processes the source DrugAge datasets by adding the ChEMBL, 
#' PubChem and DrugBank id mapping information to the source DrugAge table 
#' which only has compound names without id mapping information. Source file of 
#' DrugAge is linked here: \url{http://genomics.senescence.info/drugs/dataset.zip}
#' 
#' This function only annotates compound names that have ChEMBL 
#' ids from the ChEMBL database (version 24). The missing IDs were added 
#' manually. A semi-manual approach was to use this web service: 
#' \url{https://cts.fiehnlab.ucdavis.edu/batch}. After the semi-manual process,
#' the left ones were manually mapped to ChEMBL, PubChem and DrugBank ids. 
#' The mixed items were commented. 
#' @param dest_file character(1), file path to the generated DrugAge annotation 
#' tabular file with id mappings. The default will write the file named as
#' "drugage_id_mapping.tsv" to user's current working directory.
#' @param verbose logical(1), If descriptive message and list of issues should be 
#' included as output
#' @return write the default 'drugage_id_mapping.tsv' file to user's current working
#' directory or the file path defined by users to the \code{dest_dafile} argument.
#' @importFrom rappdirs user_cache_dir
#' @importFrom utils untar
#' @import BiocFileCache
#' @examples 
#' library(ChemmineR)
#' \dontrun{
#' processDrugage(dest_file="drugage_id_mapping.tsv")
#' # Now the missing IDs need to be added manually. A semi-manual approach is to 
#' # use this web service: https://cts.fiehnlab.ucdavis.edu/batch
#' }
#' @export
#' 
processDrugage <- function(dest_file="drugage_id_mapping.tsv", verbose=TRUE) {
    # download DrugAge dataset to Cache directory
    dazip <- download_data("http://genomics.senescence.info/drugs/dataset.zip", 
                            "DrugAge_dataset_zip", verbose=verbose)
    ## read tsv file in the zip by temporally extracting zip to tempfile
    tmpdir <- tempdir()
    unzip(dazip, exdir=tmpdir)
    drugage <- read.csv(paste0(tmpdir, "/drugage.csv"))
    drugage <- drugage[!duplicated(drugage$compound_name),]
    
    ## Get mol_dict table from chembl_db
    chembldb <- download_data("ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_24/archived/chembl_24_sqlite.tar.gz", 
                           "ChEMBL24", verbose=verbose, ftpUpdate=FALSE)
    bfc <- BiocFileCache(cache=rappdirs::user_cache_dir(appname="customCMPdb"), ask=TRUE)
    cache_dir <- bfccache(bfc)
    if(!dir.exists(paste0(cache_dir, "/chembl_24")))
        untar(chembldb, exdir = cache_dir)
    mydb <- dbConnect(SQLite(), paste0(cache_dir, "/chembl_24/chembl_24_sqlite/chembl_24.db"))
    mol_dict <- dbGetQuery(mydb, 'SELECT * FROM molecule_dictionary')
    mol_dict_sub <- mol_dict[!is.na(mol_dict$pref_name),] # mol_dict from below
    prefname <- as.character(mol_dict_sub$pref_name)
    chemblid <- as.character(mol_dict_sub$chembl_id)
    fact <- tapply(chemblid, factor(prefname), paste, collapse=", ")
    
    # Load ChEMBL to PubChem CID and DrugBank ID mappings
    ## UniChem CMP ID mappings from here: 
    ##      https://www.ebi.ac.uk/unichem/ucquery/listSources
    ## download ChEMBL to DrugBank mapping from UniChem in src1src2.txt.gz: 
    chembl2db <- download_data("ftp://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/src_id1/src1src2.txt.gz", 
                               "chembl2drugbank", verbose = verbose)
    ## ChEMBL to PubChem CID mapping in src1src22.txt.gz
    chembl2pc <- download_data("ftp://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/src_id1/src1src22.txt.gz", 
                                "chembl2pubchem", verbose = verbose)
    
    chembl2pubchem <- read.delim(gzfile(chembl2pc))
    chembl2pubchem_vec <- tapply(as.character(chembl2pubchem[,2]), 
                                 factor(chembl2pubchem[,1]), paste, collapse=", ")
    chembl2drugbank <- read.delim(gzfile(chembl2db))
    chembl2drugbank_vec <- tapply(as.character(chembl2drugbank[,2]), 
                                  factor(chembl2drugbank[,1]), paste, collapse=", ")
    ## Assemble results
    drugage <- cbind(drugage, pref_name=names(fact[toupper(drugage$compound_name)]), 
                     chembl_id=fact[toupper(drugage$compound_name)])
    drugage <- cbind(drugage, pubchem_cid=chembl2pubchem_vec[as.character(drugage$chembl_id)], 
                     DrugBank_id=chembl2drugbank_vec[as.character(drugage$chembl_id)])
    write.table(drugage, dest_file, row.names=FALSE, quote=FALSE, sep="\t")
}

download_data <- function(url, rname, verbose=TRUE, ftpUpdate=TRUE){
    bfc <- BiocFileCache(cache=rappdirs::user_cache_dir(appname="customCMPdb"), ask=TRUE)
    rid <- bfcquery(bfc, url)$rid
    if(!length(rid)){
        if(verbose) message(paste("Downloading", rname, "from", url))
        rid <- names(bfcadd(bfc, rname, url))
    }
    # check to see if the resource needs to be updated
    if (isTRUE(bfcneedsupdate(bfc, rid)) | ftpUpdate)
        bfcdownload(bfc, rid, ask=TRUE)
    bfcrpath(bfc, rids = rid)
}