#########################################
## Helper Functions for ChEMBL Queries ##
#########################################
## Author: Thomas Girke
## Last update: 05-Apr-20
genConfig <- function(
chemblDbPath = "chembldb.db",
downloadPath = "downloads",
resultsPath = "results") {
# if(!dir.exists(downloadPath))
# dir.create(downloadPath)
if (!dir.exists(resultsPath)) {
dir.create(resultsPath)
}
return(list(
chemblDbPath = chemblDbPath, downloadPath = downloadPath,
resultsPath = resultsPath
))
}
#############
## UniChem ##
#############
## UniChem CMP ID mappings from here:
## https://www.ebi.ac.uk/unichem/ucquery/listSources
## Note: above html table gives numbering to select proper src_id for ftp
## downloads, e.g. DrugBank is src2
## ftp://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/
## Examples:
.getCache <- function() {
return(BiocFileCache(
cache =
rappdirs::user_cache_dir(appname = "drugTargetInteractions"),
ask = FALSE
))
}
.getCacheFile <- function(name) {
# create_time is a column in the DF returned by bfcquery. It is resolved
# in the context of the DF inside dplyr::arrange.
return(dplyr::arrange(
bfcquery(.getCache(), name, field = c("rname")),
dplyr::desc(create_time)
)[1, "rpath"][[1]])
}
.downloadFile <- function(url, name, verbose = FALSE) {
bfc <- .getCache()
rid <- bfcquery(bfc, name, "rname")$rid
if (length(rid) == 0) {
if (verbose) {
message("Downloading ", name)
}
rid <- names(bfcadd(bfc, name, url))
}
if (!isFALSE(bfcneedsupdate(bfc, rid))) {
# bfcdownload(bfc, rid,ask=FALSE)
.downloadWithRetries(bfc, rid, name)
}
bfcrpath(bfc, rids = rid)
}
.downloadWithRetries <- function(bfc, rid, name, tries = 2) {
tries <- as.integer(tries)
stopifnot(length(tries) == 1L, !is.na(tries))
while (tries > 0) {
result <- tryCatch(bfcdownload(bfc, rid, ask = FALSE), error = identity)
if (!inherits(result, "error")) {
break
}
tries <- tries - 1
}
if (tries == 0) {
stop(
"'bfcdownload()' failed:",
"\n filename: ", name,
"\n error: ", conditionMessage(result)
)
}
result
}
.cacheFileFromZip <- function(zipFileUrl, name) {
tempDir <- tempdir()
zipFile <- file.path(tempDir, "temp.zip")
download.file(zipFileUrl, zipFile)
unzip(zipFile, exdir = tempDir)
# message("zip file: ",zipFile)
# message("name: ",name)
# message("file to extract: ",file.path(tempDir,name))
bfc <- .getCache()
rid <- names(bfcadd(bfc, name, file.path(tempDir, name), action = "copy"))
bfcrpath(bfc, rids = rid)
}
downloadUniChem <- function(rerun = TRUE, config = genConfig()) {
# urlBase = "ftp://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/src_id1/"
urlBase <- "https://drug-target-interactions-data.s3-us-west-1.amazonaws.com/"
if (rerun == TRUE) {
bfc <- .getCache()
## ChEMBL to DrugBank mapping in src1src2.txt.gz:
.downloadFile(
paste(urlBase, "src1src2.txt.gz", sep = ""),
"src1src2.txt.gz"
)
## ChEMBL to PubChem CID mapping in src1src22.txt.gz
.downloadFile(
paste(urlBase, "src1src22.txt.gz", sep = ""),
"src1src22.txt.gz"
)
## ChEMBL to ChEBI mapping in src1src7.txt.gz
.downloadFile(
paste(urlBase, "src1src7.txt.gz", sep = ""),
"src1src7.txt.gz"
)
}
invisible(NULL)
}
downloadChemblDb <- function(version, rerun = TRUE, config = genConfig()) {
url <- paste("https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/",
"chembl_", version, "_sqlite.tar.gz",
sep = ""
)
if (rerun == TRUE) {
tempDir <- tempdir()
tarFile <- file.path(tempDir, "chembl_sqlite.tar.gz")
dbPath <- paste("chembl_", version, "/chembl_", version, "_sqlite/chembl_",
version, ".db",
sep = ""
)
download.file(url, tarFile)
untar(tarFile, c(dbPath), exdir = tempDir)
sourceDbFile <- file.path(
tempDir,
paste("chembl_", version, sep = ""),
paste("chembl_", version, "_sqlite", sep = ""),
paste("chembl_", version, ".db", sep = "")
)
bfc <- .getCache()
rid <- names(bfcadd(bfc, paste("chembl_", version, ".db", sep = ""),
sourceDbFile,
action = "copy"
))
file.remove(sourceDbFile)
return(bfcrpath(bfc, rids = rid))
# file.copy(from=sourceDbFile, to = config$chemblDbPath)
# file.remove(sourceDbFile)
}
}
## Usage:
# downloadUniChem(rerun=FALSE)
###############################################################
## Retrieve with UniProt.ws UniProt IDs using IDMs and SSNNs ##
###############################################################
## The following returns for a set of query IDs (here Ensembl gene or
## UniProt IDs) the corresponding UniProt IDs based on two independent
## approaches: ID mappings (IDMs) and sequence similarity nearest neighbors
## (SSNNs) using UNIREF clusters. Note: the query IDs (e.g. ENSEMBL genes) can
## only be reliably maintained in the SSNN results when 'chunksize=1' since
## batch queries for protein clusters with UnitProt.ws will often drop the query
## IDs such as ENSEMBL gene IDs. To address this, the query result contains
## an extra 'QueryID' column when 'chunksize=1', but not when it is set to a
## different value than 1. The getParalogs function is similar but it uses
## biomaRt's paralogs instead of UNIREF clusters.
## Basic usage of UniProt.ws package
# library(UniProt.ws)
# up <- UniProt.ws(taxId=9606) # Attach organism here human
# ?UniProt.ws # Help on base class of package
# columns(up) # Lists available data
# keytypes(up) # Lists available keys
# species(up) # Returns attached species
# taxId(up) # Returns attached species id
# availableUniprotSpecies()[1:4,] # Lists available species
# availableUniprotSpecies("musculus") # Pattern matching on species
# lookupUniprotSpeciesFromTaxId("3702") # Returns species name for species id
# Attaches mouse as species to UniProt.ws (here 'up') object
# # taxId(up) <- 10090
getUniprotIDs <- function(taxId = 9606, kt = "ENSEMBL", keys,
seq_cluster = "UNIREF90",
chunksize = 20) {
requireNamespace(UniProt.ws)
up <- UniProt.ws(taxId) # Attach organism, here human
## Validity checks
if (!kt %in% c("ENSEMBL", "UNIPROTKB")) {
stop("Argument kt needs to be one of: 'ENSEMBL', 'UNIPROTKB'")
}
if (!seq_cluster %in% c("UNIREF100", "UNIREF90", "UNIREF50")) {
stop(
"Argument seq_cluster needs to be one of:",
" 'UNIREF100', 'UNIREF90', 'UNIREF50'"
)
}
## Columns to return
df <- data.frame(
"ENSEMBL" = character(),
"ID" = character(),
"GENES" = character(),
"UNIREF100" = character(),
"UNIREF90" = character(),
"UNIREF50" = character(),
"ORGANISM" = character(),
"PROTEIN-NAMES" = character(),
check.names = FALSE
)
columns <- colnames(df)
## Arrange keys in chunk list, with list components of length<='chunksize'
keys <- unique(keys)
keylist <- split(keys, ceiling(seq_along(keys) / chunksize))
## Empty list container
listcontainer <- vector("list", length(keylist))
names(listcontainer) <- names(keylist)
res_list <- list(IDM = listcontainer, SSNN = listcontainer)
## Run queries in chunks
for (i in seq_along(keylist)) {
## ID mappings (IDMs)
res_ID <- try(UniProt.ws::select(up, keys = keylist[[i]], columns, kt),
silent = TRUE
)
if (!inherits(res_ID, "try-error")) {
if (chunksize == 1) {
res_ID <- data.frame(
QueryID = keylist[[i]][1], res_ID,
check.names = FALSE
)
res_list[[1]][[i]] <- res_ID[, c("QueryID", columns)]
} else {
res_list[[1]][[i]] <- res_ID[, columns]
}
} else {
if (chunksize == 1) {
res_list[[1]][[i]] <- data.frame(
QueryID = character(), df,
check.names = FALSE
)
# Required since current iteration will not
# enter next step (SSNN)
res_list[[2]][[i]] <- data.frame(
QueryID = character(), df,
check.names = FALSE
)
} else {
res_list[[1]][[i]] <- df
# Required since current iteration will not
# enter next step (SSNN)
res_list[[2]][[i]] <- df
}
}
## Sequence Similarity (SSNNs) using UNIREF*
if (!inherits(res_ID, "try-error")) {
unirefkeys <- as.character(stats::na.omit(unique(res_ID[, seq_cluster])))
unirefkeylist <- split(
unirefkeys,
ceiling(seq_along(unirefkeys) / chunksize)
)
unirefcontainer <- vector("list", length(unirefkeylist))
names(unirefcontainer) <- names(unirefkeylist)
## SSNN queries are run in subloop since number of uniref keys is
# larger than in IDM query
for (j in seq_along(unirefkeylist)) {
tmp <- try(UniProt.ws::select(up,
keys = unirefkeylist[[j]],
columns, seq_cluster
),
silent = TRUE
)
if (!inherits(tmp, "try-error")) {
if (chunksize == 1) {
unirefcontainer[[j]] <- data.frame(
QueryID =
keylist[[i]][1],
tmp[, columns],
check.names = FALSE
)
} else {
unirefcontainer[[j]] <- tmp[, columns]
}
} else {
if (chunksize == 1) {
unirefcontainer <- data.frame(
QueryID = character(), df,
check.names = FALSE
)
} else {
unirefcontainer[[j]] <- df
}
}
}
res_CL <- do.call("rbind", unirefcontainer)
if (chunksize == 1) {
res_list[[2]][[i]] <- res_CL[, c("QueryID", columns)]
} else {
res_list[[2]][[i]] <- res_CL[, columns]
}
}
## Status message
print(paste(
"Processed", chunksize * i, "keys of total of", length(keys),
"keys."
))
}
## Assemble and return results
res_list[[1]] <- do.call("rbind", res_list[[1]])
rownames(res_list[[1]]) <- NULL
res_list[[2]] <- do.call("rbind", res_list[[2]])
rownames(res_list[[2]]) <- NULL
return(res_list)
}
## Usage:
# keys <- c("ENSG00000145700", "ENSG00000135441", "ENSG00000120071",
# "ENSG00000120088", "ENSG00000185829", "ENSG00000185829", "ENSG00000185829",
# "ENSG00000238083", "ENSG00000012061", "ENSG00000104856", "ENSG00000104936",
# "ENSG00000117877", "ENSG00000130202", "ENSG00000130202", "ENSG00000142252",
# "ENSG00000189114", "ENSG00000234906")
# res_list100 <- getUniprotIDs(taxId=9606, kt="ENSEMBL", keys=keys,
# seq_cluster="UNIREF100")
# sapply(res_list100, dim, simplify=FALSE)
# sapply(names(res_list100), function(x) unique(na.omit(res_list100[[x]]$ID)))
# res_list90 <- getUniprotIDs(taxId=9606, kt="ENSEMBL", keys=keys,
# seq_cluster="UNIREF90")
# sapply(res_list90, dim, simplify=FALSE)
# sapply(names(res_list90), function(x) unique(na.omit(res_list90[[x]]$ID)))
##########################################################
## Retrieve with biomaRt UniProt IDs and their Paralogs ##
##########################################################
## Using biomaRt, obtain for query genes the corresponding UniProt IDs as well
## as paralogs. Query genes can be Gene Names or ENSEMBL Gene IDs from
## H sapiens. The result is similar to IDMs and SSNNs from getUniprotIDs
## function, but instead of UNIREF clusters, biomaRt's paralogs are used to
## obtain SSNNs.
getParalogs <- function(queryBy) {
mart <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")
## ID Matching (IDM) result table
## To list available uniprot annotation fields, run:
# listAttributes(mart)[grep("uniprot", listAttributes(mart)[,"name"]),]
IDMresult <- biomaRt::getBM(
attributes = c(
"ensembl_gene_id",
"uniprot_gn_symbol",
"uniprotswissprot",
"uniprotsptrembl",
"description"
),
filters = queryBy$idType,
values = queryBy$ids,
mart = mart
)
IDMresult <- data.frame(
QueryID = as.character(IDMresult$ensembl_gene_id),
ENSEMBL = as.character(IDMresult$ensembl_gene_id),
GENES = as.character(IDMresult$uniprot_gn_symbol),
ID_up_sp = as.character(IDMresult$uniprotswissprot),
ID_up_sp_tr = as.character(IDMresult$uniprotsptrembl)
)
## Added by ThG on 25-Aug-21 maintain queries with not matches
missing <- queryBy$ids[!queryBy$ids %in% unique(as.character(IDMresult$QueryID))]
if(length(missing)>0) {
missDF <- data.frame(QueryID=missing, ENSEMBL=missing, GENES="", ID_up_sp="", ID_up_sp_tr="")
IDMresult <- rbind(IDMresult, missDF)
}
## Paralog (Sequence Similarity Nearest Neighbors - SSNN) result table
## To list available paralog annotation fields, run:
# listAttributes(mart)[grep("paralog", listAttributes(mart)[,"name"]),]
result <- biomaRt::getBM(
attributes = c(
"external_gene_name",
"ensembl_gene_id",
"hsapiens_paralog_associated_gene_name",
"hsapiens_paralog_ensembl_gene",
"hsapiens_paralog_perc_id",
"hsapiens_paralog_perc_id_r1"
),
filters = queryBy$idType,
values = queryBy$ids,
mart = mart
)
## Reshape result table
query <- result[!duplicated(result$ensembl_gene_id), ]
query <- data.frame(
QueryID = as.character(query$ensembl_gene_id),
ENSEMBL = as.character(query$ensembl_gene_id),
GENES = as.character(query$external_gene_name),
stringsAsFactors = FALSE
)
query <- cbind(query, paralog_perc_id = 100, paralog_perc_id_r1 = 100)
# removes rows with no paralog matches
result <- result[nchar(result$hsapiens_paralog_ensembl_gene) > 0, ]
paralog <- data.frame(
QueryID = result$ensembl_gene_id,
ENSEMBL = result$hsapiens_paralog_ensembl_gene,
GENES = result$hsapiens_paralog_associated_gene_name,
paralog_perc_id = result$hsapiens_paralog_perc_id,
paralog_perc_id_r1 = result$hsapiens_paralog_perc_id_r1,
stringsAsFactors = FALSE
)
resultDF <- rbind(query, paralog)
resultDF <- resultDF[order(resultDF$QueryID), ]
## Retrieve for ENSEMBL gene IDs in results the corresponding UniProt IDs.
## Note, this needs to be done in separate query since attributes from
## multiple attribue pages are not allowed.
uniprot <- biomaRt::getBM(
attributes = c(
"ensembl_gene_id",
"uniprot_gn_symbol",
"uniprotswissprot",
"uniprotsptrembl",
"description"
),
filters = "ensembl_gene_id",
values = resultDF$ENSEMBL,
mart = mart
)
## For UniProtKB/Swiss-Prot IDs (curated entries)
up_sp <- tapply(uniprot$uniprotswissprot, uniprot$ensembl_gene_id,
function(x) unique(as.character(x)[nchar(x) > 0]),
simplify = FALSE
)
index <- vapply(up_sp, length, integer(1))
up_sp[index == 0] <- ""
index[index == 0] <- 1
## Added by ThG on 25-Aug-21 to account for missing entries index
missing <- unique(as.character(resultDF$ENSEMBL))[!unique(as.character(resultDF$ENSEMBL)) %in% names(index)]
if(length(missing)>0) {
index <- c(index, setNames(rep(1, length(missing)), missing))
up_sp <- c(up_sp, setNames(rep("", length(missing)), missing))
}
index <- index[as.character(resultDF$ENSEMBL)]
up_sp <- up_sp[names(index)]
resultDF <- cbind(resultDF[rep(seq_along(resultDF$ENSEMBL), index), ],
ID_up_sp = as.character(unlist(up_sp))
)
## UniProtKB/TrEMBL IDs (less curated entries)
up_sp_tr <- tapply(uniprot$uniprotsptrembl, uniprot$ensembl_gene_id,
function(x) unique(as.character(x)[nchar(x) > 0]),
simplify = FALSE
)
index <- vapply(up_sp_tr, length, integer(1))
up_sp_tr[index == 0] <- ""
index[index == 0] <- 1
## Added by ThG on 25-Aug-21 to account for missing entries index
missing <- unique(as.character(resultDF$ENSEMBL))[!unique(as.character(resultDF$ENSEMBL)) %in% names(index)]
if(length(missing)>0) {
index <- c(index, setNames(rep(1, length(missing)), missing))
up_sp_tr <- c(up_sp_tr, setNames(rep("", length(missing)), missing))
}
index <- index[as.character(resultDF$ENSEMBL)]
up_sp_tr <- up_sp_tr[names(index)]
resultDF <- cbind(resultDF[rep(seq_along(resultDF$ENSEMBL), index), ],
ID_up_sp_tr = as.character(unlist(up_sp_tr))
)
row.names(resultDF) <- NULL
## Assemble result list
# This makes behavior more similar to result from getUniprotIDs
IDMresult[IDMresult == ""] <- NA
for (i in colnames(IDMresult)) {
if (is.factor(IDMresult[, i])) {
IDMresult[, i] <- as.character(IDMresult[, i])
}
}
resultDF[resultDF == ""] <- NA
for (i in colnames(resultDF)) {
if (is.factor(resultDF[, i])) {
resultDF[, i] <- as.character(resultDF[, i])
}
}
resultList <- list(IDM = IDMresult, SSNN = resultDF)
return(resultList)
}
## Usage:
# queryBy <- list(molType="gene", idType="external_gene_name",
# ids=c("ZPBP", "MAPK1", "EGFR"))
# queryBy <- list(molType="gene", idType="ensembl_gene_id",
# ids=c("ENSG00000042813", "ENSG00000100030", "ENSG00000146648"))
# result <- getParalogs(queryBy)
# result$IDM[1:4,]
# result$SSNN[1:4,]
# sapply(result, dim, simplify=FALSE)
# sapply(names(result), function(x) unique(na.omit(result[[x]]$ID_up_sp)))
#####################################################
## Function to query known drug-target annotations ##
#####################################################
## Function to query known drug-target annotations
drugTargetAnnot <- function(queryBy = list(molType = NULL, idType = NULL, ids = NULL),
cmpid_file = file.path(config$resultsPath, "cmp_ids.rds"),
config = genConfig()) {
if (any(names(queryBy) != c("molType", "idType", "ids"))) {
stop(
"All three list components in 'queryBy' (named: 'molType',",
" 'idType' and 'ids') need to be present."
)
}
if (any(vapply(queryBy, length, integer(1)) == 0)) {
stop(
"All components in 'queryBy' list need to be populated with ",
"corresponding character vectors."
)
}
## Load ChEMBL SQLite
## ChEMBL SQLite downloaded from here: ftp://ftp.ebi.ac.uk/pub/databases/
## chembl/ChEMBLdb/latest/chembl_24_1_sqlite.tar.gz
## after unpacking you get chembl_xx.db
dbpath <- config$chemblDbPath
mydb <- dbConnect(SQLite(), dbpath)
## Input file for following step was generated by cmpIdMapping()
cmp_ids <- readRDS(cmpid_file)
rownames(cmp_ids) <- as.character(cmp_ids$molregno)
## Query by targets
if (queryBy$molType == "protein" & grepl("Uniprot", queryBy$idType[[1]],
ignore.case = TRUE
)) {
idvec <- paste0("(\"", paste(queryBy$ids, collapse = "\", \""), "\")")
query <- paste(
"SELECT d.molregno,e.chembl_id, e.pref_name AS Drug_Name,
d.mechanism_of_action AS MOA,
d.action_type AS Action_Type,
e.first_approval AS First_Approval,
a.chembl_id AS ChEMBL_TID, d.tid AS TID,
c.accession AS UniProt_ID,
c.description AS Desc, c.organism AS Organism,
GROUP_CONCAT(DISTINCT (f.mesh_id || ': ' ||
f.mesh_heading))
AS Mesh_Indication
FROM target_dictionary AS a
LEFT JOIN target_components AS b ON a.tid = b.tid
LEFT JOIN component_sequences AS c
ON b.component_id = c.component_id
LEFT JOIN drug_mechanism AS d ON a.tid = d.tid
LEFT JOIN molecule_dictionary AS e
ON d.molregno = e.molregno
LEFT JOIN drug_indication AS f
ON e.molregno = f.molregno
WHERE c.accession IN", idvec,
"GROUP BY d.molregno, c.accession
ORDER BY c.accession, c.description"
)
myquery <- dbSendQuery(mydb, query)
activityassays <- dbFetch(myquery)
dbClearResult(myquery)
}
## Query by compounds
if (queryBy$molType == "cmp") {
## ID conversions
if (queryBy$idType == "molregno") {
cmpvec <- queryBy$ids
}
if (queryBy$idType == "chembl_id") {
cmp_ids <- cmp_ids[!duplicated(cmp_ids$chembl_id), ]
cmpvec <- as.character(cmp_ids$molregno)
names(cmpvec) <- as.character(cmp_ids$chembl_id)
cmpvec <- cmpvec[queryBy$ids]
}
if (queryBy$idType == "PubChem_ID") {
cmp_ids <- cmp_ids[!duplicated(cmp_ids$PubChem_ID), ]
cmpvec <- as.character(cmp_ids$molregno)
names(cmpvec) <- as.character(cmp_ids$PubChem_ID)
cmpvec <- cmpvec[queryBy$ids]
}
if (queryBy$idType == "DrugBank_ID") {
cmp_ids <- cmp_ids[!duplicated(cmp_ids$DrugBank_ID), ]
cmpvec <- as.character(cmp_ids$molregno)
names(cmpvec) <- as.character(cmp_ids$DrugBank_ID)
cmpvec <- cmpvec[queryBy$ids]
}
idvec <- paste0("(\"", paste(cmpvec, collapse = "\", \""), "\")")
query <- paste(
"SELECT a.molregno, a.chembl_id, a.pref_name AS Drug_Name,
b.mechanism_of_action AS MOA,
b.action_type AS Action_Type,
a.first_approval AS First_Approval,
c.chembl_id AS ChEMBL_TID, b.tid AS TID,
e.accession AS UniProt_ID,
e.description AS Desc, e.organism AS Organism,
GROUP_CONCAT(DISTINCT (f.mesh_id || ': '
|| f.mesh_heading))
AS Mesh_Indication
FROM molecule_dictionary AS a
LEFT JOIN drug_mechanism AS b ON a.molregno = b.molregno
LEFT JOIN target_dictionary AS c ON b.tid = c.tid
LEFT JOIN target_components AS d ON b.tid = d.tid
LEFT JOIN component_sequences AS e
ON d.component_id = e.component_id
LEFT JOIN drug_indication AS f ON a.molregno = f.molregno
WHERE a.molregno IN", idvec,
"GROUP BY a.molregno, e.accession
ORDER BY a.molregno, a.pref_name"
)
myquery <- dbSendQuery(mydb, query)
activityassays <- dbFetch(myquery)
dbClearResult(myquery)
}
resultDF <- data.frame(cmp_ids[
as.character(activityassays$molregno),
-c(2, 4)
], activityassays[, -c(1, 2)])
dbDisconnect(mydb)
## Remove rows with identical values in all fields
resultDF <- resultDF[!duplicated(apply(resultDF, 1, paste, collapse = "_")), ]
## Add query column and sort rows in result table according to query
index_list <- lapply(
queryBy$ids,
function(x) {
which(toupper(resultDF[, queryBy$idType]) %in%
toupper(x))
}
)
names(index_list) <- queryBy$ids
index_list[vapply(index_list, length, integer(1)) == 0] <- Inf
index_df <- data.frame(
ids = rep(
names(index_list),
vapply(index_list, length, integer(1))
),
rowids = unlist(index_list)
)
resultDF <- data.frame(
QueryIDs = index_df[, 1],
resultDF[as.numeric(index_df$rowids), ]
)
rownames(resultDF) <- NULL
resultDF <- resultDF[, colnames(resultDF)!="last_active"] # Added by ThG on 25-Aug-21 to remove undesirable column in >= ChEMBL29
return(resultDF)
}
#########################################
## Query Known Drug-Target Annotations ##
#########################################
## Function to generate drugTargetAnnot.xls table
drugTargetAnnotTable <- function(outfile, rerun = TRUE, config = genConfig()) {
if (rerun == TRUE) {
## Load ChEMBL SQLite
## ChEMBL SQLite downloaded from here: ftp://ftp.ebi.ac.uk/pub/
## databases/chembl/ChEMBLdb/latest/chembl_24_1_sqlite.tar.gz
## after unpacking you get chembl_24.db
chembldb <- config$chemblDbPath
mydb <- dbConnect(SQLite(), chembldb)
## Joins for drug mech
allmech <- dbGetQuery(mydb, "SELECT *
FROM drug_mechanism,
molecule_dictionary
WHERE drug_mechanism.molregno =
molecule_dictionary.molregno")
## Set values in "mechanism_comment" column to NA since it contains
## weird characters resulting in row duplications when writing to files
allmech[, colnames(allmech) %in% "mechanism_comment"] <-
rep(NA, nrow(allmech))
## Get UniChem Info
## ChEMBL to DrubBank mapping in src1src2.txt.gz:
chembl2drugbank <- read.delim(gzfile(.getCacheFile("src1src2.txt.gz")))
chembl2drugbank_vec <- tapply(as.character(chembl2drugbank[, 2]),
factor(chembl2drugbank[, 1]), paste,
collapse = ", "
)
## ChEMBL to PubChem CID mapping in src1src22.txt.gz
chembl2pubchem <- read.delim(gzfile(.getCacheFile("src1src22.txt.gz")))
chembl2pubchem_vec <- tapply(as.character(chembl2pubchem[, 2]),
factor(chembl2pubchem[, 1]), paste,
collapse = ", "
)
## ChEMBL to ChEBI mapping in src1src7.txt.gz
chembl2chebi <- read.delim(gzfile(.getCacheFile("src1src7.txt.gz")))
chembl2chebi_vec <- tapply(as.character(chembl2chebi[, 2]),
factor(chembl2chebi[, 1]), paste,
collapse = ", "
)
allmech <- cbind(allmech,
PubChem_ID = chembl2pubchem_vec[allmech$chembl_id],
DrugBank_ID = chembl2drugbank_vec[allmech$chembl_id],
ChEBI_ID = chembl2chebi_vec[allmech$chembl_id]
)
## Joins for target ids
targetids <- dbGetQuery(mydb, "SELECT *
FROM target_components,
component_sequences
WHERE target_components.component_id =
component_sequences.component_id")
tid2accession <- tapply(targetids$accession, as.factor(targetids$tid),
paste,
collapse = ", "
)
tid2description <- tapply(targetids$description,
as.factor(targetids$tid), paste,
collapse = ", "
)
tid2organism <- tapply(targetids$organism, as.factor(targetids$tid),
paste,
collapse = ", "
)
target_dict <- dbGetQuery(mydb, "SELECT * FROM target_dictionary")
chembl_tid <- as.character(target_dict$chembl_id)
names(chembl_tid) <- as.character(target_dict$tid)
tidDF <- data.frame(
row.names = names(tid2accession),
ChEMBL_TID = chembl_tid[names(tid2accession)],
UniProt_ID = tid2accession,
Desc = tid2description[names(tid2accession)],
Organism = tid2organism[names(tid2accession)]
)
## Join mesh and efo indication
indication <- dbGetQuery(mydb, "SELECT * FROM drug_indication")
mesh <- paste(indication$mesh_id, indication$mesh_heading, sep = ": ")
mesh2molregno <- tapply(mesh, as.factor(indication$molregno), paste,
collapse = ", "
)
efo <- paste(indication$efo_id, indication$efo_term, sep = ": ")
efo2molregno <- tapply(efo, as.factor(indication$molregno), paste,
collapse = ", "
)
## Assemble everything
allmech <- data.frame(
row.names = seq_along(allmech[, 1]),
allmech,
tidDF[as.character(allmech$tid), ],
MeshIndication =
mesh2molregno[as.character(allmech$molregno)],
EFOIndication =
efo2molregno[as.character(allmech$molregno)]
)
write.table(allmech, outfile, row.names = FALSE, quote = FALSE, sep = "\t")
}
}
## Usage:
# drugTargetAnnotTable(outfile="results/drugTargetAnnot.xls", rerun=FALSE)
## Compare with online drug-target table from ChEMBL obtained from manual
## download here: https://www.ebi.ac.uk/chembl/old/drug/targets
## Query functions for drugTargetAnnot.xls
getDrugTarget <- function(dt_file = file.path(
config$resultsPath,
"drugTargetAnnot.xls"
),
queryBy = list(molType = NULL, idType = NULL, ids = NULL),
id_mapping = c(
chembl = "chembl_id", pubchem = "PubChem_ID",
uniprot = "UniProt_ID"
),
columns, config = genConfig()) {
# stop("dt_file: ",dt_file)
dt_file <- read.delim(dt_file)
myid <- id_mapping[queryBy$idType]
.queryFct <- function(dt_file, myid) {
idlist <- strsplit(as.character(dt_file[, myid]), ", ")
index_list <- lapply(
queryBy$ids,
function(x) {
which(vapply(
seq_along(idlist),
function(y) {
any(tolower(idlist[[y]]) %in% tolower(x))
},
logical(1)
))
}
)
names(index_list) <- queryBy$ids
## To include no matches in result, inject Inf in corresponding slots
index_list[vapply(index_list, length, integer(1)) == 0] <- Inf
index_df <- data.frame(
ids = rep(
names(index_list),
vapply(index_list, length, integer(1))
),
rowids = unlist(index_list)
)
df <- data.frame(
QueryIDs = index_df[, 1],
dt_file[as.numeric(index_df$rowids), ]
)
return(df)
}
df <- .queryFct(dt_file, myid)
dfsub <- df[, columns]
rownames(dfsub) <- NULL
return(dfsub)
}
## Usage:
# id_mapping <- c(chembl="chembl_id", pubchem="PubChem_ID",
# uniprot="UniProt_ID", drugbank="DrugBank_ID")
# queryBy <- list(molType="cmp", idType="chembl",
# ids=c("CHEMBL25", "CHEMBL1742471"))
# getDrugTarget(dt_file="results/drugTargetAnnot.xls", queryBy=queryBy,
# id_mapping, columns=c(1,5,8,16,17,39,46:52))
# queryBy <- list(molType="cmp", idType="pubchem",
# ids=c("2244", "65869", "2244"))
# getDrugTarget(dt_file="results/drugTargetAnnot.xls", queryBy=queryBy,
# id_mapping, columns=c(1,5,8,16,17,39,46:52))
# queryBy <- list(molType="protein", idType="uniprot",
# ids=c("P43166", "P00915", "P43166"))
# getDrugTarget(dt_file="results/drugTargetAnnot.xls", queryBy=queryBy,
# id_mapping, columns=c(1,5,8,16,17,39,46:52))
#############
## DrugAge ##
#############
## Source file of DrugAge is linked here: http://genomics.senescence.info/drugs
processDrugage <- function(drugagefile = file.path(
config$resultsPath,
"drugage_id_mapping.xls"
),
redownloaddrugage = TRUE, config = genConfig()) {
if (redownloaddrugage == TRUE) {
.cacheFileFromZip(
"http://genomics.senescence.info/drugs/dataset.zip",
"drugage.csv"
)
}
drugage <- read.csv(.getCacheFile("drugage.csv"))
# drugage <- read.csv(file.path(config$downloadPath,"drugage.csv"))
drugage <- drugage[!duplicated(drugage$compound_name), ]
## Get mol_dict table from chembl_db
chembldb <- config$chemblDbPath
mydb <- dbConnect(SQLite(), chembldb)
mol_dict <- dbGetQuery(
mydb,
"SELECT pref_name,chembl_id
FROM molecule_dictionary"
)
mol_dict_sub <- mol_dict[!is.na(mol_dict$pref_name), ] # mol_dict from below
prefname <- as.character(mol_dict_sub$pref_name)
chemblid <- as.character(mol_dict_sub$chembl_id)
fact <- tapply(chemblid, factor(prefname), paste, collapse = ", ")
## Load ChEMBL to PubChem CID and DrugBank ID mappings
## (generated above with downloadUniChem Fct)
chembl2pubchem <- read.delim(gzfile(.getCacheFile("src1src22.txt.gz")))
chembl2pubchem_vec <- tapply(as.character(chembl2pubchem[, 2]),
factor(chembl2pubchem[, 1]), paste,
collapse = ", "
)
chembl2drugbank <- read.delim(gzfile(.getCacheFile("src1src2.txt.gz")))
chembl2drugbank_vec <- tapply(as.character(chembl2drugbank[, 2]),
factor(chembl2drugbank[, 1]), paste,
collapse = ", "
)
## Assemble results
drugage <- cbind(drugage,
pref_name = names(fact[toupper(drugage$compound_name)]),
chembl_id = fact[toupper(drugage$compound_name)]
)
drugage <- cbind(drugage,
pubchem_cid = chembl2pubchem_vec[
as.character(drugage$chembl_id)
],
drugbank_id = chembl2drugbank_vec[
as.character(drugage$chembl_id)
]
)
write.table(drugage, drugagefile, row.names = FALSE, quote = FALSE, sep = "\t")
}
## Usage:
# processDrugage(drugagefile="results/drugage_id_mapping.xls",
# redownloaddrugage=FALSE)
## Now the missing IDs need to be added manually. A semi-manual approach
## is to use this web service: https://cts.fiehnlab.ucdavis.edu/batch
########################################################
## Integration with Therapeutic Target Database (TTD) ##
########################################################
## ID crossmatching table
transformTTD <- function(ttdfile = file.path(config$downloadPath, "TTD_IDs.txt"),
redownloadTTD = TRUE, config = genConfig()) {
url <- "https://db.idrblab.org/ttd/sites/default/files/ttd_database/P1-02-TTD_crossmatching.txt"
if (redownloadTTD == TRUE) {
.downloadFile(url, ttdfile)
}
df <- read.delim(.getCacheFile(ttdfile),
skip = 12, header = FALSE,
col.names = c("TTD_ID", "No", "Source", "Name")
)
df_sub <- df[grep("DrugName|PubChem CID", df$Source), ]
list_sub <- split(df_sub, df_sub$TTD_ID)
list_sub <- list_sub[lapply(list_sub, nrow) == 2]
list_sub <- lapply(list_sub, t)
list_sub <- lapply(list_sub, tail, 1)
df_sub <- do.call("rbind", list_sub)
row.names(df_sub) <- names(list_sub)
df_sub <- data.frame(
TTD_ID = rownames(df_sub), CMP_Name = df_sub[, 1],
PubChem_ID = gsub("^CID ", "", as.character(df_sub[, 2]))
)
return(df_sub)
}
## Usage:
# df_sub <- transformTTD(ttdfile="downloads/TTD_IDs.txt", redownloadTTD=FALSE)
# df_sub[1:4,]
############################
## Query Bioactivity Data ##
############################
## Function to generate CMP ID mappings UniChem (current version very slow)
cmpIdMapping <- function(outfile = file.path(config$resultsPath, "cmp_ids.rds"),
rerun = TRUE, config = genConfig()) {
if (rerun == TRUE) {
chembldb <- config$chemblDbPath
mydb <- dbConnect(SQLite(), chembldb)
## Drug ID mappings
chemblid_lookup <- dbGetQuery(mydb, 'SELECT *
FROM chembl_id_lookup
WHERE "entity_type" = "COMPOUND"')
colnames(chemblid_lookup)[colnames(chemblid_lookup) == "entity_id"] <-
"molregno"
## CMP ID mappings from UniChem (very slow!)
## ChEMBL to DrubBank mapping in src1src2.txt.gz:
chembl2drugbank <- read.delim(gzfile(.getCacheFile("src1src2.txt.gz")))
chembl2drugbank_vec <- tapply(as.character(chembl2drugbank[, 2]),
factor(chembl2drugbank[, 1]), paste,
collapse = ", "
)
## ChEMBL to PubChem CID mapping in src1src22.txt.gz
chembl2pubchem <- read.delim(gzfile(.getCacheFile("src1src22.txt.gz")))
chembl2pubchem_vec <- tapply(as.character(chembl2pubchem[, 2]),
factor(chembl2pubchem[, 1]), paste,
collapse = ", "
)
## ChEMBL to ChEBI mapping in src1src7.txt.gz
chembl2chebi <- read.delim(gzfile(.getCacheFile("src1src7.txt.gz")))
chembl2chebi_vec <- tapply(as.character(chembl2chebi[, 2]),
factor(chembl2chebi[, 1]), paste,
collapse = ", "
)
## Create final CMP ID mapping table (very slow!!)
cmp_ids <- cbind(chemblid_lookup,
PubChem_ID = chembl2pubchem_vec[
as.character(chemblid_lookup$chembl_id)
],
DrugBank_ID = chembl2drugbank_vec[
as.character(chemblid_lookup$chembl_id)
]
)
saveRDS(cmp_ids, outfile)
}
}
## Usage:
# cmpIdMapping(outfile="results/cmp_ids.rds", rerun=FALSE)
# cmp_ids <- readRDS("results/cmp_ids.rds")
## Function to query bioactivity data by target or compound ids
drugTargetBioactivity <- function(queryBy = list(
molType = NULL,
idType = NULL, ids = NULL
),
cmpid_file = file.path(
config$resultsPath,
"cmp_ids.rds"
),
config = genConfig()) {
if (any(vapply(queryBy, is.null, logical(1)))) {
stop(
"All components in 'queryBy' list need to be populated ",
"with corresponding character vectors."
)
}
dbpath <- config$chemblDbPath
mydb <- dbConnect(SQLite(), dbpath)
## Input file for following step was generated by cmpIdMapping()
cmp_ids <- readRDS(cmpid_file)
rownames(cmp_ids) <- as.character(cmp_ids$molregno)
## Query bioactivity
## Query by targets
if (queryBy$molType == "protein" & queryBy$idType == "uniprot") {
idvec <- paste0("(\"", paste(queryBy$ids, collapse = "\", \""), "\")")
myquery <- dbSendQuery(mydb, paste(
"SELECT activities.molregno,
pref_name,
activity_id,
assays.chembl_id AS chembl_assay_id,
accession AS UniProt_ID,
component_sequences.description,
organism,
activities.standard_value,
activities.standard_units,
activities.standard_flag,
activities.standard_type
FROM activities, assays, target_components,
component_sequences, molecule_dictionary
ON activities.assay_id = assays.assay_id
AND assays.tid = target_components.tid
AND target_components.component_id =
component_sequences.component_id
AND activities.molregno =
molecule_dictionary.molregno
WHERE component_sequences.accession IN",
idvec
))
activityassays <- dbFetch(myquery)
dbClearResult(myquery)
}
## Query bioactivity by compounds
if (queryBy$molType == "cmp") {
## ID conversions
if (queryBy$idType == "molregno") {
cmpvec <- queryBy$ids
}
if (queryBy$idType == "chembl_id") {
cmp_ids <- cmp_ids[!duplicated(cmp_ids$chembl_id), ]
cmpvec <- as.character(cmp_ids$molregno)
names(cmpvec) <- as.character(cmp_ids$chembl_id)
cmpvec <- cmpvec[queryBy$ids]
}
if (queryBy$idType == "PubChem_ID") {
cmp_ids <- cmp_ids[!duplicated(cmp_ids$PubChem_ID), ]
cmpvec <- as.character(cmp_ids$molregno)
names(cmpvec) <- as.character(cmp_ids$PubChem_ID)
cmpvec <- cmpvec[queryBy$ids]
}
if (queryBy$idType == "DrugBank_ID") {
cmp_ids <- cmp_ids[!duplicated(cmp_ids$DrugBank_ID), ]
cmpvec <- as.character(cmp_ids$molregno)
names(cmpvec) <- as.character(cmp_ids$DrugBank_ID)
cmpvec <- cmpvec[queryBy$ids]
}
idvec <- paste0("(\"", paste(cmpvec, collapse = "\", \""), "\")")
myquery <- dbSendQuery(mydb, paste("SELECT activities.molregno,
pref_name,
activity_id,
assays.chembl_id AS chembl_assay_id,
accession AS UniProt_ID,
component_sequences.description,
organism,
activities.standard_value,
activities.standard_units,
activities.standard_flag,
activities.standard_type
FROM activities, assays, target_components,
component_sequences, molecule_dictionary
ON activities.assay_id = assays.assay_id
AND assays.tid = target_components.tid
AND target_components.component_id =
component_sequences.component_id
AND activities.molregno =
molecule_dictionary.molregno
WHERE activities.molregno IN", idvec))
activityassays <- dbFetch(myquery)
dbClearResult(myquery)
}
resultDF <- data.frame(cmp_ids[
as.character(activityassays$molregno),
-c(2, 4)
], activityassays[, -1])
resultDF <- resultDF[, colnames(resultDF)!="last_active"] # Added by ThG on 25-Aug-21 to remove undesirable column in >= ChEMBL29
dbDisconnect(mydb)
rownames(resultDF) <- NULL
return(resultDF)
}
## Usage:
# chembldb <- "/bigdata/girkelab/shared/lcshared/chemoinformatics/compoundDBs/
# chembl_24/chembl_24_sqlite/chembl_24.db"
# queryBy <- list(molType="protein", idType="uniprot",
# ids=c("P05979", "P35354", "P33033", "Q8VCT3", "P29475", "P51511"))
# qresult <- drugTargetBioactivity( queryBy, cmpid_file="results/cmp_ids.rds")
# qresult[1:4,]
# queryBy <- list(molType="cmp", idType="molregno",
# ids=c("101036", "101137", "1384464"))
# qresult <- drugTargetBioactivity( queryBy, cmpid_file="results/cmp_ids.rds")
# qresult[1:4,]
# queryBy <- list(molType="cmp", idType="DrugBank_ID",
# ids=c("DB00945", "DB00316", "DB01050"))
# qresult <- drugTargetBioactivity(queryBy, cmpid_file="results/cmp_ids.rds")
# qresult[1:4,]
# queryBy <- list(molType="cmp", idType="PubChem_ID",
# ids=c("2244", "3672", "1983"))
# qresult <- drugTargetBioactivity( queryBy, cmpid_file="results/cmp_ids.rds")
# qresult[1:4,]
#################################
## Gene to Protein ID Mappings ##
#################################
## Return for gene or protein IDs a mapping table containing: ENSEMBL Gene IDs,
## Gene Names/Symbols, UniProt IDs and ENSEMBL Protein IDs. The results are
## returned in a list where the first slot contains the ID mapping table and the
## subsequent slots the corresponding named character vectors for: ens_gene_id,
## up_ens_id, and up_gene_id. Currently, the query IDs can be Gene Names,
## Ensembl Gene IDs and UniProt IDs.
getSymEnsUp <- function(EnsDb = "EnsDb.Hsapiens.v86", ids, idtype) {
require(EnsDb, character.only = TRUE)
## ID columns to return
idcolumns <- c("gene_id", "gene_name", "uniprot_id", "protein_id")
## With gene names
if (idtype == "GENE_NAME") {
idDF <- ensembldb::genes(get(EnsDb),
filter = AnnotationFilter::GeneNameFilter(ids),
columns = idcolumns
)
idDF <- idDF[!is.na(S4Vectors::values(idDF)$uniprot_id), ]
idDF <- S4Vectors::values(idDF)
## With ENSEBML Gene IDs
} else if (idtype == "ENSEMBL_GENE_ID") {
idDF <- ensembldb::genes(get(EnsDb),
filter = AnnotationFilter::GeneIdFilter(ids),
columns = idcolumns
)
idDF <- idDF[!is.na(S4Vectors::values(idDF)$uniprot_id), ]
idDF <- S4Vectors::values(idDF)
## With UniProt IDs
} else if (idtype == "UNIPROT_ID") {
idDF <- ensembldb::proteins(get(EnsDb),
filter = AnnotationFilter::UniprotFilter(ids),
columns = idcolumns
)
idDF <- idDF[!is.na(idDF$uniprot_id), ]
} else {
stop("Argument 'idtype' can only be assigned one of:
'GENE_NAME', 'ENSEMBL_GENE_ID', 'UNIPROT_ID'.")
}
## Assemble ID mapping table
idDF <- idDF[, idcolumns]
row.names(idDF) <- NULL
## Assemble named char vector: ens_gene_id
idDF <- as.data.frame(idDF)
ens_gene_idDF <- idDF[!duplicated(idDF$gene_id), ]
ens_gene_id <- as.character(ens_gene_idDF$gene_name)
names(ens_gene_id) <- as.character(ens_gene_idDF$gene_id)
## Assemble named char vector: up_ens_id
up_ens_idDF <- idDF[!duplicated(idDF$uniprot_id), ]
up_ens_id <- as.character(up_ens_idDF$gene_id)
names(up_ens_id) <- as.character(up_ens_idDF$uniprot_id)
## Assemble named char vector: up_gene_id
up_gene_idDF <- idDF[!duplicated(idDF$uniprot_id), ]
up_gene_id <- as.character(up_gene_idDF$gene_name)
names(up_gene_id) <- as.character(up_gene_idDF$uniprot_id)
## Return result as list
returnList <- list(
idDF = idDF, ens_gene_id = ens_gene_id,
up_ens_id = up_ens_id, up_gene_id = up_gene_id
)
return(returnList)
}
## Usage:
# gene_name <- c("CA7", "CFTR")
# getSymEnsUp(EnsDb="EnsDb.Hsapiens.v86", ids=gene_name, idtype="GENE_NAME")
# ensembl_gene_id <- c("ENSG00000001626", "ENSG00000168748")
# getSymEnsUp(EnsDb="EnsDb.Hsapiens.v86",
# ids=ensembl_gene_id, idtype="ENSEMBL_GENE_ID")
# uniprot_id <- c("P43166", "P13569")
# getSymEnsUp(EnsDb="EnsDb.Hsapiens.v86", ids=uniprot_id, idtype="UNIPROT_ID")
##############################################
## Convenience Drug-Target Wrapper Function ##
##############################################
## Meta-function to obtain both drug-target annotation and bioassay data
runDrugTarget_Annot_Bioassay <- function(res_list, up_col_id = "ID",
ens_gene_id,
cmpid_file = file.path(
config$resultsPath,
"cmp_ids.rds"
),
config = genConfig(), ...) {
## (1.1) Check input validity
## To be added, once class system has been introduced...
## (1.2) Input ID mappings, note: Ensembl query IDs with similar sequences
## will often map to the same related/third Ensemble gene resulting in
## paralog genes/proteins mapping to several query genes. Thus, this
## one-to-many relationship needs to be resolved for both geneids and
## ensids.
id_list <- lapply(
names(res_list),
function(x) {
unique(stats::na.omit(as.character(res_list[[x]][, up_col_id])))
}
)
names(id_list) <- names(res_list)
ensids <- tapply(res_list[[2]]$QueryID, res_list[[2]][, up_col_id],
function(x) as.character(unique(x)),
simplify = FALSE
)
# geneids <- vapply(names(ensids), function(x) ens_gene_id[ensids[[x]]], character(1)) # Replaced by ThG on 25-Aug-21 with lapply
geneids <- lapply(names(ensids), function(x) ens_gene_id[ensids[[x]]]) # Added by ThG on 25-Aug-21
names(geneids) <- names(ensids) # Added by ThG on 25-Aug-21
geneids <- vapply(names(geneids), function(x) paste(geneids[[x]], collapse = ", "), character(1)) # Added by ThG on 25-Aug-21
geneids <- unlist(geneids[nchar(names(geneids)) > 0])
ensids <- vapply(names(ensids), function(x) paste(ensids[[x]], collapse = ", "), character(1))
ensids <- unlist(ensids[nchar(names(ensids)) > 0])
## (1.3) Obtain drug-target annotations for IDM
idm_ids <- id_list$IDM
if (length(idm_ids) > 0) {
queryBy <- list(molType = "protein", idType = "UniProt_ID", ids = idm_ids)
qresultIDM <- drugTargetAnnot(queryBy,
cmpid_file = cmpid_file, config = config
)
qresultIDM <- data.frame(
GeneName = as.character(
geneids[as.character(qresultIDM$QueryIDs)]
),
Ensembl_IDs = as.character(
ensids[as.character(qresultIDM$QueryIDs)]
),
qresultIDM
)
} else {
qresultIDM <- data.frame()
}
## (1.4) Obtain drug-target annotations for SSNN excluding IDM overlaps
## (SSNN_noIDM)
# UniProt IDs only in SSNN
ssnn_ids <- id_list$SSNN[!id_list$SSNN %in% idm_ids]
if (length(ssnn_ids) > 0) {
queryBy <- list(molType = "protein", idType = "UniProt_ID", ids = ssnn_ids)
qresultSSNN <- drugTargetAnnot(queryBy,
cmpid_file = cmpid_file, config = config
)
qresultSSNN <- data.frame(
GeneName = as.character(
geneids[as.character(qresultSSNN$QueryIDs)]
),
Ensembl_IDs = as.character(
ensids[as.character(qresultSSNN$QueryIDs)]
),
qresultSSNN
)
## Prepend "Query_" to indicate that returned genes IDs are often not
## the ones encoding SSNN UniProt IDs
if (length(qresultSSNN[, "Ensembl_IDs"]) > 0) {
qresultSSNN[, "Ensembl_IDs"] <- paste0(
"Query_",
qresultSSNN[, "Ensembl_IDs"]
)
}
# Fixes Query_NA strings in NA cases
qresultSSNN[qresultSSNN[, "Ensembl_IDs"] == "Query_NA", "Ensembl_IDs"] <- NA
if (length(qresultSSNN[, "GeneName"]) > 0) {
qresultSSNN[, "GeneName"] <- paste0(
"Query_",
qresultSSNN[, "GeneName"]
)
}
# Fixes Query_NA strings in NA cases
qresultSSNN[qresultSSNN[, "GeneName"] == "Query_NA", "GeneName"] <- NA
} else {
qresultSSNN <- data.frame()
}
## (1.5) Combine IDM and SSNN_noIDM results
qresult <- rbind(
cbind(
ID_Mapping_Type = rep(
"IDM",
nrow(qresultIDM)
),
qresultIDM
),
cbind(
ID_Mapping_Type = rep(
"SSNN_noIDM",
nrow(qresultSSNN)
),
qresultSSNN
)
)
qresult <- qresult[order(
gsub("Query_", "", qresult$GeneName),
qresult$ID_Mapping_Type
), ]
colnames(qresult) <- c(
"ID_Mapping_Type", "GeneName", "Ensembl_IDs",
"UniProt_QueryIDs", "CHEMBL_CMP_ID", "Molregno",
"PubChem_ID", "DrugBank_ID", "Drug_Name", "MOA",
"Action_Type", "First_Approval", "ChEMBL_TID",
"TID", "UniProt_ID", "Target_Desc", "Organism",
"Mesh_Indication"
)
rownames(qresult) <- NULL
## (2.1) Obtain bioassay data for IDM
queryBy <- list(molType = "protein", idType = "uniprot", ids = idm_ids)
qresultBAIDM <- drugTargetBioactivity(queryBy,
cmpid_file = cmpid_file,
config = config
)
qresultBAIDM <- data.frame(
GeneName = as.character(
geneids[as.character(qresultBAIDM$UniProt_ID)]
),
Ensembl_IDs = as.character(
ensids[as.character(qresultBAIDM$UniProt_ID)]
),
qresultBAIDM
)
## (2.2) Obtain bioassay data for SSNN excluding IDM overlaps (SSNN_noIDM)
queryBy <- list(molType = "protein", idType = "uniprot", ids = ssnn_ids)
qresultBASSNN <- suppressWarnings(drugTargetBioactivity(
queryBy,
cmpid_file = cmpid_file, config = config
))
qresultBASSNN <- data.frame(
GeneName = geneids[
as.character(qresultBASSNN$UniProt_ID)
],
Ensembl_IDs = ensids[
as.character(qresultBASSNN$UniProt_ID)
],
qresultBASSNN
)
## Prepend "Query_" to indicate that returned genes IDs are often not the
## ones encoding SSNN UniProt IDs
if (length(qresultBASSNN[, "Ensembl_IDs"]) > 0) {
qresultBASSNN[, "Ensembl_IDs"] <- paste0(
"Query_",
qresultBASSNN[, "Ensembl_IDs"]
)
}
# Fixes Query_NA strings in NA cases
qresultBASSNN[qresultBASSNN[, "Ensembl_IDs"] == "Query_NA", "Ensembl_IDs"] <- NA
if (length(qresultBASSNN[, "GeneName"]) > 0) {
qresultBASSNN[, "GeneName"] <- paste0(
"Query_",
qresultBASSNN[, "GeneName"]
)
}
# Fixes Query_NA strings in NA cases
qresultBASSNN[qresultBASSNN[, "GeneName"] == "Query_NA", "GeneName"] <- NA
## (2.3) Combine IDM and SSNN_noIDM results
qresultBA <- rbind(
cbind(
ID_Mapping_Type = rep("IDM", nrow(qresultBAIDM)),
qresultBAIDM
),
cbind(
ID_Mapping_Type = rep(
"SSNN_noIDM",
nrow(qresultBASSNN)
),
qresultBASSNN
)
)
qresultBA <- qresultBA[order(
gsub("Query_", "", qresultBA$GeneName),
qresultBA$ID_Mapping_Type
), ]
colnames(qresultBA) <- c(
"ID_Mapping_Type", "GeneName", "Ensembl_IDs",
"CHEMBL_CMP_ID", "Molregno", "PubChem_ID",
"DrugBank_ID", "Drug_Name", "Activity_ID",
"CHEMBL_Assay_ID", "UniProt_ID", "Target_Desc",
"Organism", "Standard_Value", "Standard_Units",
"Standard_Flag", "Standard_Type"
)
rownames(qresultBA) <- NULL
## (3.1) Export annotation and bioassay results in list
drug_target_list <- list(Annotation = qresult, Bioassay = qresultBA)
return(drug_target_list)
}
## Usage:
# ensembl_gene_id <- c("ENSG00000001626", "ENSG00000168748")
# idMap <- getSymEnsUp(EnsDb="EnsDb.Hsapiens.v86",
# ids=ensembl_gene_id, idtype="ENSEMBL_GENE_ID")
# ens_gene_id <- idMap$ens_gene_id
# res_list90 <- getUniprotIDs(taxId=9606, kt="ENSEMBL",
# keys=names(ens_gene_id), seq_cluster="UNIREF90", chunksize=1)
# queryBy <- list(molType="gene", idType="ensembl_gene_id",
# ids=names(ens_gene_id))
# res_list <- getParalogs(queryBy)
# drug_target_list <- runDrugTarget_Annot_Bioassay(res_list=res_list,
# up_col_id="ID_up_sp", ens_gene_id,
# chembldb, cmp_ids.rds)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.