R/manageCMAP.R
In customCMPdb: Customize and Query Compound Annotation Database

Documented in buildCMAPdb

###################################################
## SQLite Structure Database for Drugs from CMAP ##
###################################################
## Author: Thomas Girke
## Last update: 13-May-16

## Obtaining the structures and PubChem IDs for CMAP was much harder than expected since 
## CMAP only provides inconsistently formatted compound names and order numbers. The following
## documents the workflow that was used.

#' Build CMAP Database
#' 
#' This function builds a SQLite database named 'cmap.db' that contains id 
#' mappings of CMAP names to PubChem/DrugBank IDs as well as compound structure
#' information.
#' 
#' For about 2/3 of the CMAP drugs, one can obtain their PubChem/DrugBank IDs from 
#' the DMAP site here: http://bio.informatics.iupui.edu/cmaps. Since this website is no
#' longer supported, the processed CMAP name to PubChem and DrugBank ID mapping table
#' is stored under the "inst/extdata" folder of this package named as "dmap_unique.txt".  
#' The SMILES strings for CMAP entries were obtained from ChemBank. Compounds 
#' were matched by names using the 'stringdist' library where cmap_name from 
#' CMAP were mapped to the closest name in ChemBank.
#' 
#' The function fails early with an informative error if \code{dest_dir} is not
#' a single character string naming an existing directory, or if one of the
#' required "extdata" files cannot be located in the installed package. The
#' database connection is always closed on exit, even if loading the
#' structures fails part way through.
#' @param dest_dir character(1), existing destination directory under which the
#' resulting SQLite database named 'cmap.db' is stored. The default is the
#' user's current working directory.
#' @return Called for its side effect of writing the "cmap.db" SQLite database
#' to the destination directory defined by the user. Invisibly returns
#' \code{TRUE}.
#' @import ChemmineR
#' @importFrom utils download.file
#' @importFrom utils read.delim
#' @importFrom utils read.csv
#' @importFrom utils write.table
#' @importFrom methods as
#' @importFrom stats na.omit
#' @examples 
#' library(ChemmineR)
#' ## Query database
#' # buildCMAPdb(dest_dir="./inst/scripts")
#' # conn <- initDb("./inst/scripts/cmap.db")
#' # results <- getAllCompoundIds(conn)
#' # sdfset <- getCompounds(conn, results, keepOrder=TRUE)
#' # sdfset
#' # as.data.frame(datablock2ma(datablock(sdfset)))[1:4,]
#' # myfeat <- listFeatures(conn)
#' # feat <- getCompoundFeatures(conn, results, myfeat)
#' # feat[1:4,]
buildCMAPdb <- function(dest_dir=".") {
        ## Validate the destination first: building the SDFset below is
        ## expensive, so a bad 'dest_dir' should fail before any work is done.
        if (!is.character(dest_dir) || length(dest_dir) != 1L || is.na(dest_dir)) {
                stop("'dest_dir' must be a single non-NA character string",
                     call. = FALSE)
        }
        if (!dir.exists(dest_dir)) {
                stop("'dest_dir' does not exist: ", dest_dir, call. = FALSE)
        }

        ## Locate a file shipped under inst/extdata; system.file() returns ""
        ## when the file is missing, which would otherwise surface as a
        ## cryptic read error.
        extdata_path <- function(fname) {
                path <- system.file("extdata", fname, package="customCMPdb")
                if (!nzchar(path)) {
                        stop("cannot find '", fname,
                             "' in the extdata folder of package 'customCMPdb'",
                             call. = FALSE)
                }
                path
        }

        ## Join DMAP and CMAP tables (keyed by lower-cased compound name)
        dmap <- read.delim(extdata_path("dmap_unique.txt"))
        row.names(dmap) <- tolower(dmap$SOURCE_DRUG)
        
        cmap <- read.delim(extdata_path("cmap_instances_02.txt"))
        ## Keep one row per compound; names are compared case-insensitively
        cmap <- cmap[!duplicated(tolower(cmap$cmap_name)),]
        row.names(cmap) <- tolower(cmap$cmap_name)
        ## Row-name lookup yields all-NA rows for CMAP drugs absent from DMAP
        df <- data.frame(cmap, dmap[row.names(cmap),])
        # sum(!is.na(df$SOURCE_DRUG))
        # length(unique(na.omit(df$SOURCE_DRUG)))
        # 867 cmap drugs have PubChem IDs; 442 do not have; 
        # instance_id 1345 and 2952 with two cmap_name "betulinic acid" and "betulin"
        # but the same SOURCE_DRUG "Betulinic Acid"

        ## Obtain SMILES strings for CMAP entries from ChemBank
        ## Tyler did this with help from P. Clemens from ChemBank.
        ## Compounds were matched by names using the stringdist library
        ## (here cmap_name from CMAP and the closest name in ChemBank).
        ## This is the location of the input files:
        ## /rhome/tbackman/Projects/cmap_drugs/src/mapIDs.R
        ## /rhome/tbackman/Projects/cmap_drugs/working/cmap_instances_02.xls
        ## /rhome/tbackman/Projects/cmap_drugs/working/allcompounds.smiles
        ## /rhome/tbackman/Projects/cmap_drugs/working/chembank-synonyms.txt
        ## /rhome/tbackman/Projects/cmap_drugs/working/chembank-structures.txt
        ## /rhome/tbackman/Projects/cmap_drugs/working/smilesMatches.csv

        ## Note: file smilesMatches.csv was generated by Tyler, see above
        smiMA <- utils::read.csv(extdata_path("smilesMatches.csv"))
        row.names(smiMA) <- tolower(smiMA$cmap_name)
        bothDF <- cbind(df, smiMA[rownames(df),
                c("chembank_id", "chembank_name", "match_distance", "smiles")])
        #dim(bothDF[as.integer(bothDF[,"match_distance"]) == 0,]) # 1223 x 28
        # Note: compounds with match_distance > 0 need to be checked

        ## Create SDFset 
        #library(ChemmineR); library(ChemmineOB) # requires openbabel module
        smi <- as.character(bothDF$smiles)
        names(smi) <- as.character(bothDF$cmap_name)
        smi <- as(smi, "SMIset")
        sdfset <- smiles2sdf(smi)
        datablock(sdfset) <- bothDF # Stores annotation info in datablock slots

        ## Create SQLite database
        ## Feature table computed by loadSdf() for the compounds it stores
        standardFeatures <- function(sdfInput) {
            data.frame(propOB(sdfInput),
                ## vapply() keeps the column an integer vector for any input
                Ncharges=vapply(bonds(sdfInput, type="charge"), length,
                                integer(1)),
                as.data.frame(groups(sdfInput, type="countMA")),
                as.data.frame(rings(sdfInput, upper=8, type="count", arom=TRUE)))
        }
        conn <- initDb(file.path(dest_dir, "cmap.db"))
        ## Close the connection even when loadSdf() fails
        on.exit(dbDisconnect(conn), add = TRUE)
        loadSdf(conn, sdfset, fct=standardFeatures)
        invisible(TRUE)
}