###################################################
## SQLite Structure Database for Drugs from CMAP ##
###################################################
## Author: Thomas Gire
## Last update: 13-May-16
## Obtaining the structures and PubChem IDs for CMAP was much harder than expected since
## cmap only provides inconsistently formatted compound names and order numbers. The following
## documents the workflow you used.
buildCMAPdb <- function(rerun) {
if(rerun==TRUE) {
## (A) For about 2/3 of the CMAP drungs one can obtain their PubChem/DrugBank IDs from
## the DMAP site here: http://bio.informatics.iupui.edu/cmaps).
## The following function downloads and preprocesses the DMAP data
getDmap <- function(url) {
download.file(url, "dmap.zip")
unzip("dmap.zip", exdir="."); file.rename("DMAP\ V2.csv", "dmap.csv"); unlink("dmap.zip")
tmp <- readLines("dmap.csv")
tmp2 <- gsub(">,<", "\t", tmp); tmp2 <- gsub(">,", "\t", tmp2); tmp2 <- gsub(",<", "\t", tmp2); tmp2 <- gsub("^<", "", tmp2); tmp2 <- gsub(">$", "", tmp2)
writeLines(tmp2, "dmap2.txt")
dmap <- read.delim("dmap2.txt", quote="")
dmap <- dmap[!duplicated(dmap$SOURCE_DRUG), ]
write.table(dmap, "dmap_unique.txt", row.names=FALSE, quote=FALSE, sep="\t")
unlink(c("dmap.csv", "dmap2.txt"))
}
## Usage:
getDmap(url="http://discern.uits.iu.edu:8340/PAGER/application/download/DMAP%20V2.zip")
## (B) Join DMAP and CMAP tables
dmap <- read.delim("dmap_unique.txt", quote="")
dmap <- dmap[!duplicated(tolower(dmap$SOURCE_DRUG)), ]
row.names(dmap) <- tolower(dmap$SOURCE_DRUG)
download.file("http://www.broadinstitute.org/cmap/cmap_instances_02.xls", "cmap_instances_02.xls")
library(gdata) # To read excel file
cmap <- read.xls("cmap_instances_02.xls")[1:6100,] # Note: last lines contain cell line info
cmap <- cmap[!duplicated(tolower(cmap$cmap_name)),]
row.names(cmap) <- tolower(cmap$cmap_name)
df <- data.frame(cmap, dmap[row.names(cmap),])
## This gives PubChem IDs for most of the compounds:
sum(!is.na(df$SOURCE_DRUG))
# [1] 866 # These have PubChem IDs
sum(is.na(df$SOURCE_DRUG))
# [1] 443 # These are missing
unlink("cmap_instances_02.xls")
unlink("dmap_unique.txt")
## (C) Obtain SMILES strings for CMAP entries from ChemBank
## Tyler did this with help from P. Clemens from ChemBank.
## Compounds were matched by names using the stringdist library
## (here cmap_name from CMAP and the closest name in ChemBank).
## This is the location of the input files:
## /rhome/tbackman/Projects/cmap_drugs/src/mapIDs.R
## /rhome/tbackman/Projects/cmap_drugs/working/cmap_instances_02.xls
## /rhome/tbackman/Projects/cmap_drugs/working/allcompounds.smiles
## /rhome/tbackman/Projects/cmap_drugs/working/chembank-synonyms.txt
## /rhome/tbackman/Projects/cmap_drugs/working/chembank-structures.txt
## /rhome/tbackman/Projects/cmap_drugs/working/smilesMatches.csv
## Note: file smilesMatches.csv was generated by Tyler, see above
smipath <- system.file("extdata/cmap_support", "smilesMatches.csv", package="longevityDrugs")
smipath <- "../inst/extdata/cmap_support/smilesMatches.csv"
smiMA <- read.csv(smipath)
row.names(smiMA) <- tolower(smiMA$cmap_name)
bothDF <- cbind(df, smiMA[rownames(df),])
dim(bothDF[as.integer(bothDF[,"match_distance"]) == 0,]) # Note: compounds with match_distance > 0 need to be checked
## (D) Create SDFset
library(ChemmineR); library(ChemmineOB) # requires openbabel module
smi <- as.character(bothDF$smiles)
names(smi) <- as.character(bothDF$cmap_name)
smi <- as(smi, "SMIset")
sdfset <- smiles2sdf(smi)
datablock(sdfset) <- bothDF # Stores annotation info in datablock slots
## (E) Create SQLite database
standardFeatures <- function(sdfInput) {
data.frame(propOB(sdfInput),
Ncharges=sapply(bonds(sdfInput, type="charge"), length),
as.data.frame(groups(sdfInput, type="countMA")),
as.data.frame(rings(sdfInput, upper=8, type="count", arom=TRUE)))
}
conn <- initDb("cmap.db")
ids <- loadSdf(conn, sdfset, fct=standardFeatures)
print("Move cmap.db to 'inst/extdata/ of package and rebuild it.'")
} else {
print("To rebuild database, set 'rerun=TRUE'.")
}
}
## Query database
# conn <- initDb("../inst/extdata/cmap.db")
# results <- getAllCompoundIds(conn)
# sdfset <- getCompounds(conn, results, keepOrder=TRUE)
# sdfset
# as.data.frame(datablock2ma(datablock(sdfset)))[1:4,]
# myfeat <- listFeatures(conn)
# feat <- getCompoundFeatures(conn, results, myfeat)
# feat[1:4,]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.