R/manageCMP.R

###################################################
## SQLite Structure Database for Drugs from CMAP ##
###################################################
## Author: Thomas Gire
## Last update: 13-May-16

## Obtaining the structures and PubChem IDs for CMAP was much harder than expected since 
## cmap only provides inconsistently formatted compound names and order numbers. The following
## documents the workflow you used.

buildCMAPdb <- function(rerun) {
    if(rerun==TRUE) {
        ## (A) For about 2/3 of the CMAP drungs one can obtain their PubChem/DrugBank IDs from 
        ## the DMAP site here: http://bio.informatics.iupui.edu/cmaps).
        ## The following function downloads and preprocesses the DMAP data
        getDmap <- function(url) {
            download.file(url, "dmap.zip")
            unzip("dmap.zip", exdir="."); file.rename("DMAP\ V2.csv", "dmap.csv"); unlink("dmap.zip")
            tmp <- readLines("dmap.csv")
            tmp2 <- gsub(">,<", "\t", tmp); tmp2 <- gsub(">,", "\t", tmp2); tmp2 <- gsub(",<", "\t", tmp2); tmp2 <- gsub("^<", "", tmp2); tmp2 <- gsub(">$", "", tmp2)
            writeLines(tmp2, "dmap2.txt")
            dmap <- read.delim("dmap2.txt", quote="")
            dmap <- dmap[!duplicated(dmap$SOURCE_DRUG), ]
            write.table(dmap, "dmap_unique.txt", row.names=FALSE, quote=FALSE, sep="\t")
            unlink(c("dmap.csv", "dmap2.txt"))
        }
        ## Usage:
        getDmap(url="http://discern.uits.iu.edu:8340/PAGER/application/download/DMAP%20V2.zip")

        ## (B) Join DMAP and CMAP tables
        dmap <- read.delim("dmap_unique.txt", quote="")
        dmap <- dmap[!duplicated(tolower(dmap$SOURCE_DRUG)), ]
        row.names(dmap) <- tolower(dmap$SOURCE_DRUG)
        download.file("http://www.broadinstitute.org/cmap/cmap_instances_02.xls", "cmap_instances_02.xls")
        library(gdata) # To read excel file 
        cmap <- read.xls("cmap_instances_02.xls")[1:6100,] # Note: last lines contain cell line info
        cmap <- cmap[!duplicated(tolower(cmap$cmap_name)),]
        row.names(cmap) <- tolower(cmap$cmap_name)
        df <- data.frame(cmap, dmap[row.names(cmap),])
        ## This gives PubChem IDs for most of the compounds:
        sum(!is.na(df$SOURCE_DRUG))
        # [1] 866 # These have PubChem IDs
        sum(is.na(df$SOURCE_DRUG))
        # [1] 443 # These are missing
        unlink("cmap_instances_02.xls")
        unlink("dmap_unique.txt")

        ## (C) Obtain SMILES strings for CMAP entries from ChemBank
        ## Tyler did this with help from P. Clemens from ChemBank.
        ## Compounds were matched by names using the stringdist library
        ## (here cmap_name from CMAP and the closest name in ChemBank).
        ## This is the location of the input files:
        ## /rhome/tbackman/Projects/cmap_drugs/src/mapIDs.R
        ## /rhome/tbackman/Projects/cmap_drugs/working/cmap_instances_02.xls
        ## /rhome/tbackman/Projects/cmap_drugs/working/allcompounds.smiles
        ## /rhome/tbackman/Projects/cmap_drugs/working/chembank-synonyms.txt
        ## /rhome/tbackman/Projects/cmap_drugs/working/chembank-structures.txt
        ## /rhome/tbackman/Projects/cmap_drugs/working/smilesMatches.csv

        ## Note: file smilesMatches.csv was generated by Tyler, see above
        smipath <- system.file("extdata/cmap_support", "smilesMatches.csv", package="longevityDrugs")
        smipath <- "../inst/extdata/cmap_support/smilesMatches.csv"
        smiMA <- read.csv(smipath) 
        row.names(smiMA) <- tolower(smiMA$cmap_name)
        bothDF <- cbind(df, smiMA[rownames(df),])
        dim(bothDF[as.integer(bothDF[,"match_distance"]) == 0,]) # Note: compounds with match_distance > 0 need to be checked

        ## (D) Create SDFset 
        library(ChemmineR); library(ChemmineOB) # requires openbabel module
        smi <- as.character(bothDF$smiles)
        names(smi) <- as.character(bothDF$cmap_name)
        smi <- as(smi, "SMIset")
        sdfset <- smiles2sdf(smi)
        datablock(sdfset) <- bothDF # Stores annotation info in datablock slots

        ## (E) Create SQLite database
        standardFeatures <- function(sdfInput) {
            data.frame(propOB(sdfInput),
                Ncharges=sapply(bonds(sdfInput, type="charge"), length),
                as.data.frame(groups(sdfInput, type="countMA")),
                as.data.frame(rings(sdfInput, upper=8, type="count", arom=TRUE)))
        }
        conn <- initDb("cmap.db")
        ids <- loadSdf(conn, sdfset, fct=standardFeatures)
        print("Move cmap.db to 'inst/extdata/ of package and rebuild it.'")
    } else {
        print("To rebuild database, set 'rerun=TRUE'.")
    }
}

## Query database
# conn <- initDb("../inst/extdata/cmap.db")
# results <- getAllCompoundIds(conn)
# sdfset <- getCompounds(conn, results, keepOrder=TRUE)
# sdfset
# as.data.frame(datablock2ma(datablock(sdfset)))[1:4,]
# myfeat <- listFeatures(conn)
# feat <- getCompoundFeatures(conn, results, myfeat)
# feat[1:4,]
tgirke/longevityDrugs documentation built on May 31, 2019, 9:07 a.m.