drugTargetInteractions: Drug-Target Interactions

Documented in cmpIdMapping downloadChemblDb downloadUniChem drugTargetAnnot drugTargetAnnotTable drugTargetBioactivity genConfig getDrugTarget getParalogs getSymEnsUp getUniprotIDs processDrugage runDrugTarget_Annot_Bioassay transformTTD

#########################################
## Helper Functions for ChEMBL Queries ##
#########################################
## Author: Thomas Girke
## Last update: 05-Apr-20

genConfig <- function(
                      chemblDbPath = "chembldb.db",
                      downloadPath = "downloads",
                      resultsPath = "results") {

    # if(!dir.exists(downloadPath))
    #    dir.create(downloadPath)
    if (!dir.exists(resultsPath)) {
        dir.create(resultsPath)
    }
    return(list(
        chemblDbPath = chemblDbPath, downloadPath = downloadPath,
        resultsPath = resultsPath
    ))
}

#############
## UniChem ##
#############
## UniChem CMP ID mappings from here:
##      https://www.ebi.ac.uk/unichem/ucquery/listSources
## Note: above html table gives numbering to select proper src_id for ftp
##   downloads, e.g. DrugBank is src2
##    ftp://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/
## Examples:

.getCache <- function() {
    return(BiocFileCache(
        cache =
            rappdirs::user_cache_dir(appname = "drugTargetInteractions"),
        ask = FALSE
    ))
}
.getCacheFile <- function(name) {
    # create_time is a column in the DF returned by bfcquery. It is resolved
    # in the context of the DF inside dplyr::arrange.
    return(dplyr::arrange(
        bfcquery(.getCache(), name, field = c("rname")),
        dplyr::desc(create_time)
    )[1, "rpath"][[1]])
}

.downloadFile <- function(url, name, verbose = FALSE) {
    bfc <- .getCache()
    rid <- bfcquery(bfc, name, "rname")$rid
    if (length(rid) == 0) {
        if (verbose) {
            message("Downloading ", name)
        }
        rid <- names(bfcadd(bfc, name, url))
    }
    if (!isFALSE(bfcneedsupdate(bfc, rid))) {
        # bfcdownload(bfc, rid,ask=FALSE)
        .downloadWithRetries(bfc, rid, name)
    }

    bfcrpath(bfc, rids = rid)
}

.downloadWithRetries <- function(bfc, rid, name, tries = 2) {
    tries <- as.integer(tries)
    stopifnot(length(tries) == 1L, !is.na(tries))

    while (tries > 0) {
        result <- tryCatch(bfcdownload(bfc, rid, ask = FALSE), error = identity)
        if (!inherits(result, "error")) {
            break
        }
        tries <- tries - 1
    }

    if (tries == 0) {
        stop(
            "'bfcdownload()' failed:",
            "\n  filename: ", name,
            "\n  error: ", conditionMessage(result)
        )
    }

    result
}
.cacheFileFromZip <- function(zipFileUrl, name) {
    tempDir <- tempdir()
    zipFile <- file.path(tempDir, "temp.zip")

    download.file(zipFileUrl, zipFile)
    unzip(zipFile, exdir = tempDir)

    # message("zip file: ",zipFile)
    # message("name: ",name)
    # message("file to extract: ",file.path(tempDir,name))
    bfc <- .getCache()
    rid <- names(bfcadd(bfc, name, file.path(tempDir, name), action = "copy"))

    bfcrpath(bfc, rids = rid)
}
downloadUniChem <- function(rerun = TRUE, config = genConfig()) {
    # urlBase = "ftp://ftp.ebi.ac.uk/pub/databases/chembl/UniChem/data/wholeSourceMapping/src_id1/"
    urlBase <- "https://drug-target-interactions-data.s3-us-west-1.amazonaws.com/"
    if (rerun == TRUE) {
        bfc <- .getCache()
        ## ChEMBL to DrugBank mapping in src1src2.txt.gz:
        .downloadFile(
            paste(urlBase, "src1src2.txt.gz", sep = ""),
            "src1src2.txt.gz"
        )
        ## ChEMBL to PubChem CID mapping in src1src22.txt.gz
        .downloadFile(
            paste(urlBase, "src1src22.txt.gz", sep = ""),
            "src1src22.txt.gz"
        )
        ## ChEMBL to ChEBI mapping in src1src7.txt.gz
        .downloadFile(
            paste(urlBase, "src1src7.txt.gz", sep = ""),
            "src1src7.txt.gz"
        )
    }
    invisible(NULL)
}
downloadChemblDb <- function(version, rerun = TRUE, config = genConfig()) {
    url <- paste("https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/latest/",
        "chembl_", version, "_sqlite.tar.gz",
        sep = ""
    )
    if (rerun == TRUE) {
        tempDir <- tempdir()
        tarFile <- file.path(tempDir, "chembl_sqlite.tar.gz")

        dbPath <- paste("chembl_", version, "/chembl_", version, "_sqlite/chembl_",
            version, ".db",
            sep = ""
        )
        download.file(url, tarFile)
        untar(tarFile, c(dbPath), exdir = tempDir)
        sourceDbFile <- file.path(
            tempDir,
            paste("chembl_", version, sep = ""),
            paste("chembl_", version, "_sqlite", sep = ""),
            paste("chembl_", version, ".db", sep = "")
        )

        bfc <- .getCache()
        rid <- names(bfcadd(bfc, paste("chembl_", version, ".db", sep = ""),
            sourceDbFile,
            action = "copy"
        ))
        file.remove(sourceDbFile)
        return(bfcrpath(bfc, rids = rid))
        # file.copy(from=sourceDbFile, to = config$chemblDbPath)
        # file.remove(sourceDbFile)
    }
}
## Usage:
# downloadUniChem(rerun=FALSE)

###############################################################
## Retrieve with UniProt.ws UniProt IDs using IDMs and SSNNs ##
###############################################################
## The following returns for a set of query IDs (here Ensembl gene or
## UniProt IDs) the corresponding UniProt IDs based on two independent
## approaches: ID mappings (IDMs) and sequence similarity nearest neighbors
## (SSNNs) using UNIREF clusters. Note: the query IDs (e.g. ENSEMBL genes) can
## only be reliably maintained in the SSNN results when 'chunksize=1' since
## batch queries for protein clusters with UnitProt.ws will often drop the query
## IDs such as ENSEMBL gene IDs. To address this, the query result contains
## an extra 'QueryID' column when 'chunksize=1', but not when it is set to a
## different value than 1.  The getParalogs function is similar but it uses
## biomaRt's paralogs instead of UNIREF clusters.

## Basic usage of UniProt.ws package
# library(UniProt.ws)
# up <- UniProt.ws(taxId=9606) # Attach organism here human
# ?UniProt.ws # Help on base class of package
# columns(up) # Lists available data
# keytypes(up) # Lists available keys
# species(up) # Returns attached species
# taxId(up) # Returns attached species id
# availableUniprotSpecies()[1:4,] # Lists available species
# availableUniprotSpecies("musculus") # Pattern matching on species
# lookupUniprotSpeciesFromTaxId("3702") # Returns species name for species id

# Attaches mouse as species to UniProt.ws (here 'up') object
# # taxId(up) <- 10090

getUniprotIDs <- function(taxId = 9606, kt = "ENSEMBL", keys,
                          seq_cluster = "UNIREF90",
                          chunksize = 20) {
    requireNamespace(UniProt.ws)
    up <- UniProt.ws(taxId) # Attach organism, here human

    ## Validity checks
    if (!kt %in% c("ENSEMBL", "UNIPROTKB")) {
        stop("Argument kt needs to be one of: 'ENSEMBL', 'UNIPROTKB'")
    }
    if (!seq_cluster %in% c("UNIREF100", "UNIREF90", "UNIREF50")) {
        stop(
            "Argument seq_cluster needs to be one of:",
            " 'UNIREF100', 'UNIREF90', 'UNIREF50'"
        )
    }

    ## Columns to return
    df <- data.frame(
        "ENSEMBL" = character(),
        "ID" = character(),
        "GENES" = character(),
        "UNIREF100" = character(),
        "UNIREF90" = character(),
        "UNIREF50" = character(),
        "ORGANISM" = character(),
        "PROTEIN-NAMES" = character(),
        check.names = FALSE
    )
    columns <- colnames(df)

    ## Arrange keys in chunk list, with list components of length<='chunksize'
    keys <- unique(keys)
    keylist <- split(keys, ceiling(seq_along(keys) / chunksize))

    ## Empty list container
    listcontainer <- vector("list", length(keylist))
    names(listcontainer) <- names(keylist)
    res_list <- list(IDM = listcontainer, SSNN = listcontainer)

    ## Run queries in chunks
    for (i in seq_along(keylist)) {
        ## ID mappings (IDMs)
        res_ID <- try(UniProt.ws::select(up, keys = keylist[[i]], columns, kt),
            silent = TRUE
        )
        if (!inherits(res_ID, "try-error")) {
            if (chunksize == 1) {
                res_ID <- data.frame(
                    QueryID = keylist[[i]][1], res_ID,
                    check.names = FALSE
                )
                res_list[[1]][[i]] <- res_ID[, c("QueryID", columns)]
            } else {
                res_list[[1]][[i]] <- res_ID[, columns]
            }
        } else {
            if (chunksize == 1) {
                res_list[[1]][[i]] <- data.frame(
                    QueryID = character(), df,
                    check.names = FALSE
                )
                # Required since current iteration will not
                # enter next step (SSNN)
                res_list[[2]][[i]] <- data.frame(
                    QueryID = character(), df,
                    check.names = FALSE
                )
            } else {
                res_list[[1]][[i]] <- df
                # Required since current iteration will not
                # enter next step (SSNN)
                res_list[[2]][[i]] <- df
            }
        }

        ## Sequence Similarity (SSNNs) using UNIREF*
        if (!inherits(res_ID, "try-error")) {
            unirefkeys <- as.character(stats::na.omit(unique(res_ID[, seq_cluster])))
            unirefkeylist <- split(
                unirefkeys,
                ceiling(seq_along(unirefkeys) / chunksize)
            )
            unirefcontainer <- vector("list", length(unirefkeylist))
            names(unirefcontainer) <- names(unirefkeylist)
            ## SSNN queries are run in subloop since number of uniref keys is
            # larger than in IDM query
            for (j in seq_along(unirefkeylist)) {
                tmp <- try(UniProt.ws::select(up,
                    keys = unirefkeylist[[j]],
                    columns, seq_cluster
                ),
                silent = TRUE
                )
                if (!inherits(tmp, "try-error")) {
                    if (chunksize == 1) {
                        unirefcontainer[[j]] <- data.frame(
                            QueryID =
                                keylist[[i]][1],
                            tmp[, columns],
                            check.names = FALSE
                        )
                    } else {
                        unirefcontainer[[j]] <- tmp[, columns]
                    }
                } else {
                    if (chunksize == 1) {
                        unirefcontainer <- data.frame(
                            QueryID = character(), df,
                            check.names = FALSE
                        )
                    } else {
                        unirefcontainer[[j]] <- df
                    }
                }
            }
            res_CL <- do.call("rbind", unirefcontainer)
            if (chunksize == 1) {
                res_list[[2]][[i]] <- res_CL[, c("QueryID", columns)]
            } else {
                res_list[[2]][[i]] <- res_CL[, columns]
            }
        }
        ## Status message
        print(paste(
            "Processed", chunksize * i, "keys of total of", length(keys),
            "keys."
        ))
    }
    ## Assemble and return results
    res_list[[1]] <- do.call("rbind", res_list[[1]])
    rownames(res_list[[1]]) <- NULL
    res_list[[2]] <- do.call("rbind", res_list[[2]])
    rownames(res_list[[2]]) <- NULL
    return(res_list)
}

## Usage:
# keys <- c("ENSG00000145700", "ENSG00000135441", "ENSG00000120071",
# "ENSG00000120088", "ENSG00000185829", "ENSG00000185829", "ENSG00000185829",
# "ENSG00000238083", "ENSG00000012061", "ENSG00000104856", "ENSG00000104936",
# "ENSG00000117877", "ENSG00000130202", "ENSG00000130202", "ENSG00000142252",
# "ENSG00000189114", "ENSG00000234906")
# res_list100 <- getUniprotIDs(taxId=9606, kt="ENSEMBL", keys=keys,
#                        seq_cluster="UNIREF100")
# sapply(res_list100, dim, simplify=FALSE)
# sapply(names(res_list100), function(x) unique(na.omit(res_list100[[x]]$ID)))
# res_list90 <- getUniprotIDs(taxId=9606, kt="ENSEMBL", keys=keys,
#                    seq_cluster="UNIREF90")
# sapply(res_list90, dim, simplify=FALSE)
# sapply(names(res_list90), function(x) unique(na.omit(res_list90[[x]]$ID)))

##########################################################
## Retrieve with biomaRt UniProt IDs and their Paralogs ##
##########################################################
## Using biomaRt, obtain for query genes the corresponding UniProt IDs as well
## as paralogs.  Query genes can be Gene Names or ENSEMBL Gene IDs from
## H sapiens. The result is similar to IDMs and SSNNs from getUniprotIDs
## function, but instead of UNIREF clusters, biomaRt's paralogs are used to
## obtain SSNNs.

getParalogs <- function(queryBy) {
    mart <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")

    ## ID Matching (IDM) result table
    ## To list available uniprot annotation fields, run:
    # listAttributes(mart)[grep("uniprot", listAttributes(mart)[,"name"]),]
    IDMresult <- biomaRt::getBM(
        attributes = c(
            "ensembl_gene_id",
            "uniprot_gn_symbol",
            "uniprotswissprot",
            "uniprotsptrembl",
            "description"
        ),
        filters = queryBy$idType,
        values = queryBy$ids,
        mart = mart
    )
    IDMresult <- data.frame(
        QueryID = as.character(IDMresult$ensembl_gene_id),
        ENSEMBL = as.character(IDMresult$ensembl_gene_id),
        GENES = as.character(IDMresult$uniprot_gn_symbol),
        ID_up_sp = as.character(IDMresult$uniprotswissprot),
        ID_up_sp_tr = as.character(IDMresult$uniprotsptrembl)
    )
    ## Added by ThG on 25-Aug-21 maintain queries with not matches 
    missing <- queryBy$ids[!queryBy$ids %in% unique(as.character(IDMresult$QueryID))]
    if(length(missing)>0) {
        missDF <- data.frame(QueryID=missing, ENSEMBL=missing, GENES="", ID_up_sp="", ID_up_sp_tr="")
        IDMresult <- rbind(IDMresult, missDF)
    }

    ## Paralog (Sequence Similarity Nearest Neighbors - SSNN) result table
    ## To list available paralog annotation fields, run:
    # listAttributes(mart)[grep("paralog", listAttributes(mart)[,"name"]),]
    result <- biomaRt::getBM(
        attributes = c(
            "external_gene_name",
            "ensembl_gene_id",
            "hsapiens_paralog_associated_gene_name",
            "hsapiens_paralog_ensembl_gene",
            "hsapiens_paralog_perc_id",
            "hsapiens_paralog_perc_id_r1"
        ),
        filters = queryBy$idType,
        values = queryBy$ids,
        mart = mart
    )
    ## Reshape result table
    query <- result[!duplicated(result$ensembl_gene_id), ]
    query <- data.frame(
        QueryID = as.character(query$ensembl_gene_id),
        ENSEMBL = as.character(query$ensembl_gene_id),
        GENES = as.character(query$external_gene_name),
        stringsAsFactors = FALSE
    )
    query <- cbind(query, paralog_perc_id = 100, paralog_perc_id_r1 = 100)

    # removes rows with no paralog matches
    result <- result[nchar(result$hsapiens_paralog_ensembl_gene) > 0, ]
    paralog <- data.frame(
        QueryID = result$ensembl_gene_id,
        ENSEMBL = result$hsapiens_paralog_ensembl_gene,
        GENES = result$hsapiens_paralog_associated_gene_name,
        paralog_perc_id = result$hsapiens_paralog_perc_id,
        paralog_perc_id_r1 = result$hsapiens_paralog_perc_id_r1,
        stringsAsFactors = FALSE
    )
    resultDF <- rbind(query, paralog)
    resultDF <- resultDF[order(resultDF$QueryID), ]

    ## Retrieve for ENSEMBL gene IDs in results the corresponding UniProt IDs.
    ## Note, this needs to be done in separate query since attributes from
    ## multiple attribue pages are not allowed.
    uniprot <- biomaRt::getBM(
        attributes = c(
            "ensembl_gene_id",
            "uniprot_gn_symbol",
            "uniprotswissprot",
            "uniprotsptrembl",
            "description"
        ),
        filters = "ensembl_gene_id",
        values = resultDF$ENSEMBL,
        mart = mart
    )

    ## For UniProtKB/Swiss-Prot IDs (curated entries)
    up_sp <- tapply(uniprot$uniprotswissprot, uniprot$ensembl_gene_id,
        function(x) unique(as.character(x)[nchar(x) > 0]),
        simplify = FALSE
    )
    index <- vapply(up_sp, length, integer(1))
    up_sp[index == 0] <- ""
    index[index == 0] <- 1
    ## Added by ThG on 25-Aug-21 to account for missing entries index 
    missing <- unique(as.character(resultDF$ENSEMBL))[!unique(as.character(resultDF$ENSEMBL)) %in% names(index)]
    if(length(missing)>0) {
        index <- c(index, setNames(rep(1, length(missing)), missing))
        up_sp <- c(up_sp, setNames(rep("", length(missing)), missing))
    }
    index <- index[as.character(resultDF$ENSEMBL)]
    up_sp <- up_sp[names(index)]
    resultDF <- cbind(resultDF[rep(seq_along(resultDF$ENSEMBL), index), ],
        ID_up_sp = as.character(unlist(up_sp))
    )

    ## UniProtKB/TrEMBL IDs (less curated entries)
    up_sp_tr <- tapply(uniprot$uniprotsptrembl, uniprot$ensembl_gene_id,
        function(x) unique(as.character(x)[nchar(x) > 0]),
        simplify = FALSE
    )
    index <- vapply(up_sp_tr, length, integer(1))
    up_sp_tr[index == 0] <- ""
    index[index == 0] <- 1
    ## Added by ThG on 25-Aug-21 to account for missing entries index 
    missing <- unique(as.character(resultDF$ENSEMBL))[!unique(as.character(resultDF$ENSEMBL)) %in% names(index)]
    if(length(missing)>0) {
        index <- c(index, setNames(rep(1, length(missing)), missing))
        up_sp_tr <- c(up_sp_tr, setNames(rep("", length(missing)), missing))
    }
    index <- index[as.character(resultDF$ENSEMBL)]
    up_sp_tr <- up_sp_tr[names(index)]
    resultDF <- cbind(resultDF[rep(seq_along(resultDF$ENSEMBL), index), ],
        ID_up_sp_tr = as.character(unlist(up_sp_tr))
    )
    row.names(resultDF) <- NULL

    ## Assemble result list

    # This makes behavior more similar to result from getUniprotIDs
    IDMresult[IDMresult == ""] <- NA
    for (i in colnames(IDMresult)) {
        if (is.factor(IDMresult[, i])) {
            IDMresult[, i] <- as.character(IDMresult[, i])
        }
    }
    resultDF[resultDF == ""] <- NA
    for (i in colnames(resultDF)) {
        if (is.factor(resultDF[, i])) {
            resultDF[, i] <- as.character(resultDF[, i])
        }
    }
    resultList <- list(IDM = IDMresult, SSNN = resultDF)
    return(resultList)
}
## Usage:
# queryBy <- list(molType="gene", idType="external_gene_name",
#               ids=c("ZPBP", "MAPK1", "EGFR"))
# queryBy <- list(molType="gene", idType="ensembl_gene_id",
#               ids=c("ENSG00000042813", "ENSG00000100030", "ENSG00000146648"))
# result <- getParalogs(queryBy)
# result$IDM[1:4,]
# result$SSNN[1:4,]
# sapply(result, dim, simplify=FALSE)
# sapply(names(result), function(x) unique(na.omit(result[[x]]$ID_up_sp)))

#####################################################
## Function to query known drug-target annotations ##
#####################################################

## Function to query known drug-target annotations
drugTargetAnnot <- function(queryBy = list(molType = NULL, idType = NULL, ids = NULL),
                            cmpid_file = file.path(config$resultsPath, "cmp_ids.rds"),
                            config = genConfig()) {
    if (any(names(queryBy) != c("molType", "idType", "ids"))) {
        stop(
            "All three list components in 'queryBy' (named: 'molType',",
            " 'idType' and 'ids') need to be present."
        )
    }
    if (any(vapply(queryBy, length, integer(1)) == 0)) {
        stop(
            "All components in 'queryBy' list need to be populated with ",
            "corresponding character vectors."
        )
    }


    ## Load ChEMBL SQLite
    ## ChEMBL SQLite downloaded from here: ftp://ftp.ebi.ac.uk/pub/databases/
    ##    chembl/ChEMBLdb/latest/chembl_24_1_sqlite.tar.gz
    ## after unpacking you get chembl_xx.db
    dbpath <- config$chemblDbPath
    mydb <- dbConnect(SQLite(), dbpath)

    ## Input file for following step was generated by cmpIdMapping()
    cmp_ids <- readRDS(cmpid_file)
    rownames(cmp_ids) <- as.character(cmp_ids$molregno)

    ## Query by targets
    if (queryBy$molType == "protein" & grepl("Uniprot", queryBy$idType[[1]],
        ignore.case = TRUE
    )) {
        idvec <- paste0("(\"", paste(queryBy$ids, collapse = "\", \""), "\")")

        query <- paste(
            "SELECT d.molregno,e.chembl_id, e.pref_name AS Drug_Name,
                              d.mechanism_of_action AS MOA,
                              d.action_type AS Action_Type,
                              e.first_approval AS First_Approval,
                              a.chembl_id AS ChEMBL_TID, d.tid AS TID,
                              c.accession AS UniProt_ID,
                              c.description AS Desc, c.organism AS Organism,
                              GROUP_CONCAT(DISTINCT (f.mesh_id || ': ' ||
                                                     f.mesh_heading))
                                AS Mesh_Indication
                          FROM target_dictionary AS a
                          LEFT JOIN target_components AS b ON a.tid = b.tid
                          LEFT JOIN component_sequences AS c
                            ON b.component_id = c.component_id
                          LEFT JOIN drug_mechanism AS d ON a.tid = d.tid
                          LEFT JOIN molecule_dictionary AS e
                            ON d.molregno = e.molregno
                          LEFT JOIN drug_indication AS f
                            ON e.molregno = f.molregno
                          WHERE c.accession IN", idvec,
            "GROUP BY d.molregno, c.accession
                           ORDER BY c.accession, c.description"
        )
        myquery <- dbSendQuery(mydb, query)
        activityassays <- dbFetch(myquery)
        dbClearResult(myquery)
    }

    ## Query by compounds
    if (queryBy$molType == "cmp") {
        ## ID conversions
        if (queryBy$idType == "molregno") {
            cmpvec <- queryBy$ids
        }
        if (queryBy$idType == "chembl_id") {
            cmp_ids <- cmp_ids[!duplicated(cmp_ids$chembl_id), ]
            cmpvec <- as.character(cmp_ids$molregno)
            names(cmpvec) <- as.character(cmp_ids$chembl_id)
            cmpvec <- cmpvec[queryBy$ids]
        }
        if (queryBy$idType == "PubChem_ID") {
            cmp_ids <- cmp_ids[!duplicated(cmp_ids$PubChem_ID), ]
            cmpvec <- as.character(cmp_ids$molregno)
            names(cmpvec) <- as.character(cmp_ids$PubChem_ID)
            cmpvec <- cmpvec[queryBy$ids]
        }
        if (queryBy$idType == "DrugBank_ID") {
            cmp_ids <- cmp_ids[!duplicated(cmp_ids$DrugBank_ID), ]
            cmpvec <- as.character(cmp_ids$molregno)
            names(cmpvec) <- as.character(cmp_ids$DrugBank_ID)
            cmpvec <- cmpvec[queryBy$ids]
        }
        idvec <- paste0("(\"", paste(cmpvec, collapse = "\", \""), "\")")

        query <- paste(
            "SELECT a.molregno, a.chembl_id, a.pref_name AS Drug_Name,
                            b.mechanism_of_action AS MOA,
                            b.action_type AS Action_Type,
                            a.first_approval AS First_Approval,
                            c.chembl_id AS ChEMBL_TID, b.tid AS TID,
                            e.accession AS UniProt_ID,
                            e.description AS Desc, e.organism AS Organism,
                            GROUP_CONCAT(DISTINCT (f.mesh_id || ': '
                                                   || f.mesh_heading))
                                AS Mesh_Indication
                      FROM molecule_dictionary AS a
                      LEFT JOIN drug_mechanism AS b ON a.molregno = b.molregno
                      LEFT JOIN target_dictionary AS c ON b.tid = c.tid
                      LEFT JOIN target_components AS d ON b.tid = d.tid
                      LEFT JOIN component_sequences AS e
                        ON d.component_id = e.component_id
                      LEFT JOIN drug_indication AS f ON a.molregno = f.molregno
                      WHERE a.molregno IN", idvec,
            "GROUP BY a.molregno, e.accession
                       ORDER BY a.molregno, a.pref_name"
        )
        myquery <- dbSendQuery(mydb, query)
        activityassays <- dbFetch(myquery)
        dbClearResult(myquery)
    }
    resultDF <- data.frame(cmp_ids[
        as.character(activityassays$molregno),
        -c(2, 4)
    ], activityassays[, -c(1, 2)])
    dbDisconnect(mydb)

    ## Remove rows with identical values in all fields
    resultDF <- resultDF[!duplicated(apply(resultDF, 1, paste, collapse = "_")), ]
    ## Add query column and sort rows in result table according to query
    index_list <- lapply(
        queryBy$ids,
        function(x) {
            which(toupper(resultDF[, queryBy$idType]) %in%
                toupper(x))
        }
    )
    names(index_list) <- queryBy$ids
    index_list[vapply(index_list, length, integer(1)) == 0] <- Inf
    index_df <- data.frame(
        ids = rep(
            names(index_list),
            vapply(index_list, length, integer(1))
        ),
        rowids = unlist(index_list)
    )
    resultDF <- data.frame(
        QueryIDs = index_df[, 1],
        resultDF[as.numeric(index_df$rowids), ]
    )
    rownames(resultDF) <- NULL
    resultDF <- resultDF[, colnames(resultDF)!="last_active"] # Added by ThG on 25-Aug-21 to remove undesirable column in >= ChEMBL29
    return(resultDF)
}

#########################################
## Query Known Drug-Target Annotations ##
#########################################
## Function to generate drugTargetAnnot.xls table
drugTargetAnnotTable <- function(outfile, rerun = TRUE, config = genConfig()) {
    if (rerun == TRUE) {
        ## Load ChEMBL SQLite
        ## ChEMBL SQLite downloaded from here: ftp://ftp.ebi.ac.uk/pub/
        ##          databases/chembl/ChEMBLdb/latest/chembl_24_1_sqlite.tar.gz
        ## after unpacking you get chembl_24.db
        chembldb <- config$chemblDbPath
        mydb <- dbConnect(SQLite(), chembldb)

        ## Joins for drug mech
        allmech <- dbGetQuery(mydb, "SELECT *
                                    FROM drug_mechanism,
                                         molecule_dictionary
                                    WHERE drug_mechanism.molregno =
                                        molecule_dictionary.molregno")

        ## Set values in "mechanism_comment" column to NA since it contains
        ## weird characters resulting in row duplications when writing to files
        allmech[, colnames(allmech) %in% "mechanism_comment"] <-
            rep(NA, nrow(allmech))

        ## Get UniChem Info
        ## ChEMBL to DrubBank mapping in src1src2.txt.gz:

        chembl2drugbank <- read.delim(gzfile(.getCacheFile("src1src2.txt.gz")))
        chembl2drugbank_vec <- tapply(as.character(chembl2drugbank[, 2]),
            factor(chembl2drugbank[, 1]), paste,
            collapse = ", "
        )
        ## ChEMBL to PubChem CID mapping in src1src22.txt.gz
        chembl2pubchem <- read.delim(gzfile(.getCacheFile("src1src22.txt.gz")))
        chembl2pubchem_vec <- tapply(as.character(chembl2pubchem[, 2]),
            factor(chembl2pubchem[, 1]), paste,
            collapse = ", "
        )
        ## ChEMBL to ChEBI mapping in src1src7.txt.gz
        chembl2chebi <- read.delim(gzfile(.getCacheFile("src1src7.txt.gz")))
        chembl2chebi_vec <- tapply(as.character(chembl2chebi[, 2]),
            factor(chembl2chebi[, 1]), paste,
            collapse = ", "
        )
        allmech <- cbind(allmech,
            PubChem_ID = chembl2pubchem_vec[allmech$chembl_id],
            DrugBank_ID = chembl2drugbank_vec[allmech$chembl_id],
            ChEBI_ID = chembl2chebi_vec[allmech$chembl_id]
        )

        ## Joins for target ids
        targetids <- dbGetQuery(mydb, "SELECT *
                                       FROM target_components,
                                            component_sequences
                                       WHERE target_components.component_id =
                                           component_sequences.component_id")
        tid2accession <- tapply(targetids$accession, as.factor(targetids$tid),
            paste,
            collapse = ", "
        )
        tid2description <- tapply(targetids$description,
            as.factor(targetids$tid), paste,
            collapse = ", "
        )
        tid2organism <- tapply(targetids$organism, as.factor(targetids$tid),
            paste,
            collapse = ", "
        )
        target_dict <- dbGetQuery(mydb, "SELECT * FROM target_dictionary")
        chembl_tid <- as.character(target_dict$chembl_id)
        names(chembl_tid) <- as.character(target_dict$tid)
        tidDF <- data.frame(
            row.names = names(tid2accession),
            ChEMBL_TID = chembl_tid[names(tid2accession)],
            UniProt_ID = tid2accession,
            Desc = tid2description[names(tid2accession)],
            Organism = tid2organism[names(tid2accession)]
        )

        ## Join mesh and efo indication
        indication <- dbGetQuery(mydb, "SELECT * FROM drug_indication")
        mesh <- paste(indication$mesh_id, indication$mesh_heading, sep = ": ")
        mesh2molregno <- tapply(mesh, as.factor(indication$molregno), paste,
            collapse = ", "
        )
        efo <- paste(indication$efo_id, indication$efo_term, sep = ": ")
        efo2molregno <- tapply(efo, as.factor(indication$molregno), paste,
            collapse = ", "
        )

        ## Assemble everything
        allmech <- data.frame(
            row.names = seq_along(allmech[, 1]),
            allmech,
            tidDF[as.character(allmech$tid), ],
            MeshIndication =
                mesh2molregno[as.character(allmech$molregno)],
            EFOIndication =
                efo2molregno[as.character(allmech$molregno)]
        )
        write.table(allmech, outfile, row.names = FALSE, quote = FALSE, sep = "\t")
    }
}
## Usage:
# drugTargetAnnotTable(outfile="results/drugTargetAnnot.xls", rerun=FALSE)

## Compare with online drug-target table from ChEMBL obtained from manual
## download here: https://www.ebi.ac.uk/chembl/old/drug/targets

## Query functions for drugTargetAnnot.xls
getDrugTarget <- function(dt_file = file.path(
                              config$resultsPath,
                              "drugTargetAnnot.xls"
                          ),
                          queryBy = list(molType = NULL, idType = NULL, ids = NULL),
                          id_mapping = c(
                              chembl = "chembl_id", pubchem = "PubChem_ID",
                              uniprot = "UniProt_ID"
                          ),
                          columns, config = genConfig()) {
    # stop("dt_file: ",dt_file)
    dt_file <- read.delim(dt_file)
    myid <- id_mapping[queryBy$idType]
    .queryFct <- function(dt_file, myid) {
        idlist <- strsplit(as.character(dt_file[, myid]), ", ")
        index_list <- lapply(
            queryBy$ids,
            function(x) {
                which(vapply(
                    seq_along(idlist),
                    function(y) {
                        any(tolower(idlist[[y]]) %in% tolower(x))
                    },
                    logical(1)
                ))
            }
        )
        names(index_list) <- queryBy$ids
        ## To include no matches in result, inject Inf in corresponding slots
        index_list[vapply(index_list, length, integer(1)) == 0] <- Inf
        index_df <- data.frame(
            ids = rep(
                names(index_list),
                vapply(index_list, length, integer(1))
            ),
            rowids = unlist(index_list)
        )
        df <- data.frame(
            QueryIDs = index_df[, 1],
            dt_file[as.numeric(index_df$rowids), ]
        )
        return(df)
    }
    df <- .queryFct(dt_file, myid)
    dfsub <- df[, columns]
    rownames(dfsub) <- NULL
    return(dfsub)
}
## Usage:
# id_mapping <- c(chembl="chembl_id", pubchem="PubChem_ID",
#                   uniprot="UniProt_ID", drugbank="DrugBank_ID")
# queryBy <- list(molType="cmp", idType="chembl",
#                   ids=c("CHEMBL25", "CHEMBL1742471"))
# getDrugTarget(dt_file="results/drugTargetAnnot.xls", queryBy=queryBy,
#               id_mapping, columns=c(1,5,8,16,17,39,46:52))
# queryBy <- list(molType="cmp", idType="pubchem",
#               ids=c("2244", "65869", "2244"))
# getDrugTarget(dt_file="results/drugTargetAnnot.xls", queryBy=queryBy,
#   id_mapping, columns=c(1,5,8,16,17,39,46:52))
# queryBy <- list(molType="protein", idType="uniprot",
#   ids=c("P43166", "P00915", "P43166"))
# getDrugTarget(dt_file="results/drugTargetAnnot.xls", queryBy=queryBy,
#   id_mapping, columns=c(1,5,8,16,17,39,46:52))

#############
## DrugAge ##
#############
## Source file of DrugAge is linked here: http://genomics.senescence.info/drugs
processDrugage <- function(drugagefile = file.path(
                               config$resultsPath,
                               "drugage_id_mapping.xls"
                           ),
                           redownloaddrugage = TRUE, config = genConfig()) {
    if (redownloaddrugage == TRUE) {
        .cacheFileFromZip(
            "http://genomics.senescence.info/drugs/dataset.zip",
            "drugage.csv"
        )
    }
    drugage <- read.csv(.getCacheFile("drugage.csv"))
    # drugage <- read.csv(file.path(config$downloadPath,"drugage.csv"))
    drugage <- drugage[!duplicated(drugage$compound_name), ]
    ## Get mol_dict table from chembl_db
    chembldb <- config$chemblDbPath
    mydb <- dbConnect(SQLite(), chembldb)
    mol_dict <- dbGetQuery(
        mydb,
        "SELECT pref_name,chembl_id
                             FROM molecule_dictionary"
    )
    mol_dict_sub <- mol_dict[!is.na(mol_dict$pref_name), ] # mol_dict from below
    prefname <- as.character(mol_dict_sub$pref_name)
    chemblid <- as.character(mol_dict_sub$chembl_id)
    fact <- tapply(chemblid, factor(prefname), paste, collapse = ", ")
    ## Load ChEMBL to PubChem CID and DrugBank ID mappings
    ## (generated above with downloadUniChem Fct)
    chembl2pubchem <- read.delim(gzfile(.getCacheFile("src1src22.txt.gz")))
    chembl2pubchem_vec <- tapply(as.character(chembl2pubchem[, 2]),
        factor(chembl2pubchem[, 1]), paste,
        collapse = ", "
    )
    chembl2drugbank <- read.delim(gzfile(.getCacheFile("src1src2.txt.gz")))
    chembl2drugbank_vec <- tapply(as.character(chembl2drugbank[, 2]),
        factor(chembl2drugbank[, 1]), paste,
        collapse = ", "
    )
    ## Assemble results
    drugage <- cbind(drugage,
        pref_name = names(fact[toupper(drugage$compound_name)]),
        chembl_id = fact[toupper(drugage$compound_name)]
    )
    drugage <- cbind(drugage,
        pubchem_cid = chembl2pubchem_vec[
            as.character(drugage$chembl_id)
        ],
        drugbank_id = chembl2drugbank_vec[
            as.character(drugage$chembl_id)
        ]
    )
    write.table(drugage, drugagefile, row.names = FALSE, quote = FALSE, sep = "\t")
}
## Usage:
# processDrugage(drugagefile="results/drugage_id_mapping.xls",
#   redownloaddrugage=FALSE)
## Now the missing IDs need to be added manually. A semi-manual approach
##  is to use this web service: https://cts.fiehnlab.ucdavis.edu/batch

########################################################
## Integration with Therapeutic Target Database (TTD) ##
########################################################
## ID crossmatching table
transformTTD <- function(ttdfile = file.path(config$downloadPath, "TTD_IDs.txt"),
                         redownloadTTD = TRUE, config = genConfig()) {
    url <- "https://db.idrblab.org/ttd/sites/default/files/ttd_database/P1-02-TTD_crossmatching.txt"
    if (redownloadTTD == TRUE) {
        .downloadFile(url, ttdfile)
    }
    df <- read.delim(.getCacheFile(ttdfile),
        skip = 12, header = FALSE,
        col.names = c("TTD_ID", "No", "Source", "Name")
    )
    df_sub <- df[grep("DrugName|PubChem CID", df$Source), ]
    list_sub <- split(df_sub, df_sub$TTD_ID)
    list_sub <- list_sub[lapply(list_sub, nrow) == 2]
    list_sub <- lapply(list_sub, t)
    list_sub <- lapply(list_sub, tail, 1)
    df_sub <- do.call("rbind", list_sub)
    row.names(df_sub) <- names(list_sub)
    df_sub <- data.frame(
        TTD_ID = rownames(df_sub), CMP_Name = df_sub[, 1],
        PubChem_ID = gsub("^CID ", "", as.character(df_sub[, 2]))
    )
    return(df_sub)
}
## Usage:
# df_sub <- transformTTD(ttdfile="downloads/TTD_IDs.txt", redownloadTTD=FALSE)
# df_sub[1:4,]


############################
## Query Bioactivity Data ##
############################
## Function to generate CMP ID mappings UniChem (current version very slow)
cmpIdMapping <- function(outfile = file.path(config$resultsPath, "cmp_ids.rds"),
                         rerun = TRUE, config = genConfig()) {
    if (rerun == TRUE) {
        chembldb <- config$chemblDbPath
        mydb <- dbConnect(SQLite(), chembldb)

        ## Drug ID mappings
        chemblid_lookup <- dbGetQuery(mydb, 'SELECT *
                                             FROM chembl_id_lookup
                                             WHERE "entity_type" = "COMPOUND"')
        colnames(chemblid_lookup)[colnames(chemblid_lookup) == "entity_id"] <-
            "molregno"

        ## CMP ID mappings from UniChem (very slow!)
        ## ChEMBL to DrubBank mapping in src1src2.txt.gz:
        chembl2drugbank <- read.delim(gzfile(.getCacheFile("src1src2.txt.gz")))
        chembl2drugbank_vec <- tapply(as.character(chembl2drugbank[, 2]),
            factor(chembl2drugbank[, 1]), paste,
            collapse = ", "
        )
        ## ChEMBL to PubChem CID mapping in src1src22.txt.gz
        chembl2pubchem <- read.delim(gzfile(.getCacheFile("src1src22.txt.gz")))
        chembl2pubchem_vec <- tapply(as.character(chembl2pubchem[, 2]),
            factor(chembl2pubchem[, 1]), paste,
            collapse = ", "
        )
        ## ChEMBL to ChEBI mapping in src1src7.txt.gz
        chembl2chebi <- read.delim(gzfile(.getCacheFile("src1src7.txt.gz")))
        chembl2chebi_vec <- tapply(as.character(chembl2chebi[, 2]),
            factor(chembl2chebi[, 1]), paste,
            collapse = ", "
        )
        ## Create final CMP ID mapping table (very slow!!)
        cmp_ids <- cbind(chemblid_lookup,
            PubChem_ID = chembl2pubchem_vec[
                as.character(chemblid_lookup$chembl_id)
            ],
            DrugBank_ID = chembl2drugbank_vec[
                as.character(chemblid_lookup$chembl_id)
            ]
        )
        saveRDS(cmp_ids, outfile)
    }
}
## Usage:
# cmpIdMapping(outfile="results/cmp_ids.rds", rerun=FALSE)
# cmp_ids <- readRDS("results/cmp_ids.rds")

## Function to query bioactivity data by target or compound ids
drugTargetBioactivity <- function(queryBy = list(
                                      molType = NULL,
                                      idType = NULL, ids = NULL
                                  ),
                                  cmpid_file = file.path(
                                      config$resultsPath,
                                      "cmp_ids.rds"
                                  ),
                                  config = genConfig()) {
    if (any(vapply(queryBy, is.null, logical(1)))) {
        stop(
            "All components in 'queryBy' list need to be populated ",
            "with corresponding character vectors."
        )
    }

    dbpath <- config$chemblDbPath
    mydb <- dbConnect(SQLite(), dbpath)

    ## Input file for following step was generated by cmpIdMapping()
    cmp_ids <- readRDS(cmpid_file)
    rownames(cmp_ids) <- as.character(cmp_ids$molregno)

    ## Query bioactivity
    ## Query by targets
    if (queryBy$molType == "protein" & queryBy$idType == "uniprot") {
        idvec <- paste0("(\"", paste(queryBy$ids, collapse = "\", \""), "\")")

        myquery <- dbSendQuery(mydb, paste(
            "SELECT activities.molregno,
                                                   pref_name,
                                                   activity_id,
                                                   assays.chembl_id AS chembl_assay_id,
                                                   accession AS UniProt_ID,
                                                   component_sequences.description,
                                                   organism,
                                                   activities.standard_value,
                                                   activities.standard_units,
                                                   activities.standard_flag,
                                                   activities.standard_type
                                  FROM activities, assays, target_components,
                                       component_sequences, molecule_dictionary
                                  ON activities.assay_id = assays.assay_id
                                    AND assays.tid = target_components.tid
                                    AND target_components.component_id =
                                         component_sequences.component_id
                                    AND activities.molregno =
                                         molecule_dictionary.molregno
                                    WHERE component_sequences.accession IN",
            idvec
        ))
        activityassays <- dbFetch(myquery)
        dbClearResult(myquery)
    }

    ## Query bioactivity by compounds
    if (queryBy$molType == "cmp") {
        ## ID conversions
        if (queryBy$idType == "molregno") {
            cmpvec <- queryBy$ids
        }
        if (queryBy$idType == "chembl_id") {
            cmp_ids <- cmp_ids[!duplicated(cmp_ids$chembl_id), ]
            cmpvec <- as.character(cmp_ids$molregno)
            names(cmpvec) <- as.character(cmp_ids$chembl_id)
            cmpvec <- cmpvec[queryBy$ids]
        }
        if (queryBy$idType == "PubChem_ID") {
            cmp_ids <- cmp_ids[!duplicated(cmp_ids$PubChem_ID), ]
            cmpvec <- as.character(cmp_ids$molregno)
            names(cmpvec) <- as.character(cmp_ids$PubChem_ID)
            cmpvec <- cmpvec[queryBy$ids]
        }
        if (queryBy$idType == "DrugBank_ID") {
            cmp_ids <- cmp_ids[!duplicated(cmp_ids$DrugBank_ID), ]
            cmpvec <- as.character(cmp_ids$molregno)
            names(cmpvec) <- as.character(cmp_ids$DrugBank_ID)
            cmpvec <- cmpvec[queryBy$ids]
        }
        idvec <- paste0("(\"", paste(cmpvec, collapse = "\", \""), "\")")

        myquery <- dbSendQuery(mydb, paste("SELECT activities.molregno,
                                                   pref_name,
                                                   activity_id,
                                                   assays.chembl_id AS chembl_assay_id,
                                                   accession AS UniProt_ID,
                                                   component_sequences.description,
                                                   organism,
                                                   activities.standard_value,
                                                   activities.standard_units,
                                                   activities.standard_flag,
                                                   activities.standard_type
                                  FROM activities, assays, target_components,
                                       component_sequences, molecule_dictionary
                                  ON activities.assay_id = assays.assay_id
                                    AND assays.tid = target_components.tid
                                    AND target_components.component_id =
                                          component_sequences.component_id
                                    AND activities.molregno =
                                          molecule_dictionary.molregno
                                    WHERE activities.molregno IN", idvec))
        activityassays <- dbFetch(myquery)
        dbClearResult(myquery)
    }
    resultDF <- data.frame(cmp_ids[
        as.character(activityassays$molregno),
        -c(2, 4)
    ], activityassays[, -1])
    resultDF <- resultDF[, colnames(resultDF)!="last_active"] # Added by ThG on 25-Aug-21 to remove undesirable column in >= ChEMBL29
    dbDisconnect(mydb)
    rownames(resultDF) <- NULL
    return(resultDF)
}

## Usage:
# chembldb <- "/bigdata/girkelab/shared/lcshared/chemoinformatics/compoundDBs/
#                 chembl_24/chembl_24_sqlite/chembl_24.db"
# queryBy <- list(molType="protein", idType="uniprot",
#           ids=c("P05979", "P35354", "P33033", "Q8VCT3", "P29475", "P51511"))
# qresult <- drugTargetBioactivity( queryBy, cmpid_file="results/cmp_ids.rds")
# qresult[1:4,]
# queryBy <- list(molType="cmp", idType="molregno",
#                  ids=c("101036", "101137", "1384464"))
# qresult <- drugTargetBioactivity( queryBy, cmpid_file="results/cmp_ids.rds")
# qresult[1:4,]
# queryBy <- list(molType="cmp", idType="DrugBank_ID",
#               ids=c("DB00945", "DB00316", "DB01050"))
# qresult <- drugTargetBioactivity(queryBy, cmpid_file="results/cmp_ids.rds")
# qresult[1:4,]
# queryBy <- list(molType="cmp", idType="PubChem_ID",
#                 ids=c("2244", "3672", "1983"))
# qresult <- drugTargetBioactivity( queryBy, cmpid_file="results/cmp_ids.rds")
# qresult[1:4,]

#################################
## Gene to Protein ID Mappings ##
#################################
## Return for gene or protein IDs a mapping table containing: ENSEMBL Gene IDs,
## Gene Names/Symbols, UniProt IDs and ENSEMBL Protein IDs. The results are
## returned in a list where the first slot contains the ID mapping table and the
## subsequent slots the corresponding named character vectors for: ens_gene_id,
## up_ens_id, and up_gene_id.  Currently, the query IDs can be Gene Names,
## Ensembl Gene IDs and UniProt IDs.
getSymEnsUp <- function(EnsDb = "EnsDb.Hsapiens.v86", ids, idtype) {
    require(EnsDb, character.only = TRUE)
    ## ID columns to return
    idcolumns <- c("gene_id", "gene_name", "uniprot_id", "protein_id")

    ## With gene names
    if (idtype == "GENE_NAME") {
        idDF <- ensembldb::genes(get(EnsDb),
            filter = AnnotationFilter::GeneNameFilter(ids),
            columns = idcolumns
        )
        idDF <- idDF[!is.na(S4Vectors::values(idDF)$uniprot_id), ]
        idDF <- S4Vectors::values(idDF)
        ## With ENSEBML Gene IDs
    } else if (idtype == "ENSEMBL_GENE_ID") {
        idDF <- ensembldb::genes(get(EnsDb),
            filter = AnnotationFilter::GeneIdFilter(ids),
            columns = idcolumns
        )
        idDF <- idDF[!is.na(S4Vectors::values(idDF)$uniprot_id), ]
        idDF <- S4Vectors::values(idDF)
        ## With UniProt IDs
    } else if (idtype == "UNIPROT_ID") {
        idDF <- ensembldb::proteins(get(EnsDb),
            filter = AnnotationFilter::UniprotFilter(ids),
            columns = idcolumns
        )
        idDF <- idDF[!is.na(idDF$uniprot_id), ]
    } else {
        stop("Argument 'idtype' can only be assigned one of:
             'GENE_NAME', 'ENSEMBL_GENE_ID', 'UNIPROT_ID'.")
    }
    ## Assemble ID mapping table
    idDF <- idDF[, idcolumns]
    row.names(idDF) <- NULL

    ## Assemble named char vector: ens_gene_id
    idDF <- as.data.frame(idDF)
    ens_gene_idDF <- idDF[!duplicated(idDF$gene_id), ]
    ens_gene_id <- as.character(ens_gene_idDF$gene_name)
    names(ens_gene_id) <- as.character(ens_gene_idDF$gene_id)

    ## Assemble named char vector: up_ens_id
    up_ens_idDF <- idDF[!duplicated(idDF$uniprot_id), ]
    up_ens_id <- as.character(up_ens_idDF$gene_id)
    names(up_ens_id) <- as.character(up_ens_idDF$uniprot_id)

    ## Assemble named char vector: up_gene_id
    up_gene_idDF <- idDF[!duplicated(idDF$uniprot_id), ]
    up_gene_id <- as.character(up_gene_idDF$gene_name)
    names(up_gene_id) <- as.character(up_gene_idDF$uniprot_id)

    ## Return result as list
    returnList <- list(
        idDF = idDF, ens_gene_id = ens_gene_id,
        up_ens_id = up_ens_id, up_gene_id = up_gene_id
    )
    return(returnList)
}
## Usage:
# gene_name <- c("CA7", "CFTR")
# getSymEnsUp(EnsDb="EnsDb.Hsapiens.v86", ids=gene_name, idtype="GENE_NAME")
# ensembl_gene_id <- c("ENSG00000001626", "ENSG00000168748")
# getSymEnsUp(EnsDb="EnsDb.Hsapiens.v86",
#               ids=ensembl_gene_id, idtype="ENSEMBL_GENE_ID")
# uniprot_id <- c("P43166", "P13569")
# getSymEnsUp(EnsDb="EnsDb.Hsapiens.v86", ids=uniprot_id, idtype="UNIPROT_ID")

##############################################
## Convenience Drug-Target Wrapper Function ##
##############################################
## Meta-function to obtain both drug-target annotation and bioassay data
runDrugTarget_Annot_Bioassay <- function(res_list, up_col_id = "ID",
                                         ens_gene_id,
                                         cmpid_file = file.path(
                                             config$resultsPath,
                                             "cmp_ids.rds"
                                         ),
                                         config = genConfig(), ...) {
    ## (1.1) Check input validity
    ## To be added, once class system has been introduced...
    ## (1.2) Input ID mappings, note: Ensembl query IDs with similar sequences
    ##   will often map to the same related/third Ensemble gene resulting in
    ##   paralog genes/proteins mapping to several query genes. Thus, this
    ##   one-to-many relationship needs to be resolved for both geneids and
    ##   ensids.
    id_list <- lapply(
        names(res_list),
        function(x) {
            unique(stats::na.omit(as.character(res_list[[x]][, up_col_id])))
        }
    )
    names(id_list) <- names(res_list)
    ensids <- tapply(res_list[[2]]$QueryID, res_list[[2]][, up_col_id],
        function(x) as.character(unique(x)),
        simplify = FALSE
    )
    # geneids <- vapply(names(ensids), function(x) ens_gene_id[ensids[[x]]], character(1)) # Replaced by ThG on 25-Aug-21 with lapply
    geneids <- lapply(names(ensids), function(x) ens_gene_id[ensids[[x]]]) # Added by ThG on 25-Aug-21
    names(geneids) <- names(ensids) # Added by ThG on 25-Aug-21
    geneids <- vapply(names(geneids), function(x) paste(geneids[[x]], collapse = ", "), character(1)) # Added by ThG on 25-Aug-21
    geneids <- unlist(geneids[nchar(names(geneids)) > 0])

    ensids <- vapply(names(ensids), function(x) paste(ensids[[x]], collapse = ", "), character(1))
    ensids <- unlist(ensids[nchar(names(ensids)) > 0])

    ## (1.3) Obtain drug-target annotations for IDM
    idm_ids <- id_list$IDM
    if (length(idm_ids) > 0) {
        queryBy <- list(molType = "protein", idType = "UniProt_ID", ids = idm_ids)
        qresultIDM <- drugTargetAnnot(queryBy,
            cmpid_file = cmpid_file, config = config
        )
        qresultIDM <- data.frame(
            GeneName = as.character(
                geneids[as.character(qresultIDM$QueryIDs)]
            ),
            Ensembl_IDs = as.character(
                ensids[as.character(qresultIDM$QueryIDs)]
            ),
            qresultIDM
        )
    } else {
        qresultIDM <- data.frame()
    }

    ## (1.4) Obtain drug-target annotations for SSNN excluding IDM overlaps
    ## (SSNN_noIDM)

    # UniProt IDs only in SSNN
    ssnn_ids <- id_list$SSNN[!id_list$SSNN %in% idm_ids]
    if (length(ssnn_ids) > 0) {
        queryBy <- list(molType = "protein", idType = "UniProt_ID", ids = ssnn_ids)
        qresultSSNN <- drugTargetAnnot(queryBy,
            cmpid_file = cmpid_file, config = config
        )
        qresultSSNN <- data.frame(
            GeneName = as.character(
                geneids[as.character(qresultSSNN$QueryIDs)]
            ),
            Ensembl_IDs = as.character(
                ensids[as.character(qresultSSNN$QueryIDs)]
            ),
            qresultSSNN
        )
        ## Prepend "Query_" to indicate that returned genes IDs are often not
        ## the ones encoding SSNN UniProt IDs
        if (length(qresultSSNN[, "Ensembl_IDs"]) > 0) {
            qresultSSNN[, "Ensembl_IDs"] <- paste0(
                "Query_",
                qresultSSNN[, "Ensembl_IDs"]
            )
        }
        # Fixes Query_NA strings in NA cases
        qresultSSNN[qresultSSNN[, "Ensembl_IDs"] == "Query_NA", "Ensembl_IDs"] <- NA
        if (length(qresultSSNN[, "GeneName"]) > 0) {
            qresultSSNN[, "GeneName"] <- paste0(
                "Query_",
                qresultSSNN[, "GeneName"]
            )
        }
        # Fixes Query_NA strings in NA cases
        qresultSSNN[qresultSSNN[, "GeneName"] == "Query_NA", "GeneName"] <- NA
    } else {
        qresultSSNN <- data.frame()
    }

    ## (1.5) Combine IDM and SSNN_noIDM results
    qresult <- rbind(
        cbind(
            ID_Mapping_Type = rep(
                "IDM",
                nrow(qresultIDM)
            ),
            qresultIDM
        ),
        cbind(
            ID_Mapping_Type = rep(
                "SSNN_noIDM",
                nrow(qresultSSNN)
            ),
            qresultSSNN
        )
    )
    qresult <- qresult[order(
        gsub("Query_", "", qresult$GeneName),
        qresult$ID_Mapping_Type
    ), ]
    colnames(qresult) <- c(
        "ID_Mapping_Type", "GeneName", "Ensembl_IDs",
        "UniProt_QueryIDs", "CHEMBL_CMP_ID", "Molregno",
        "PubChem_ID", "DrugBank_ID", "Drug_Name", "MOA",
        "Action_Type", "First_Approval", "ChEMBL_TID",
        "TID", "UniProt_ID", "Target_Desc", "Organism",
        "Mesh_Indication"
    )
    rownames(qresult) <- NULL

    ## (2.1) Obtain bioassay data for IDM
    queryBy <- list(molType = "protein", idType = "uniprot", ids = idm_ids)
    qresultBAIDM <- drugTargetBioactivity(queryBy,
        cmpid_file = cmpid_file,
        config = config
    )
    qresultBAIDM <- data.frame(
        GeneName = as.character(
            geneids[as.character(qresultBAIDM$UniProt_ID)]
        ),
        Ensembl_IDs = as.character(
            ensids[as.character(qresultBAIDM$UniProt_ID)]
        ),
        qresultBAIDM
    )

    ## (2.2) Obtain bioassay data for SSNN excluding IDM overlaps (SSNN_noIDM)
    queryBy <- list(molType = "protein", idType = "uniprot", ids = ssnn_ids)
    qresultBASSNN <- suppressWarnings(drugTargetBioactivity(
        queryBy,
        cmpid_file = cmpid_file, config = config
    ))
    qresultBASSNN <- data.frame(
        GeneName = geneids[
            as.character(qresultBASSNN$UniProt_ID)
        ],
        Ensembl_IDs = ensids[
            as.character(qresultBASSNN$UniProt_ID)
        ],
        qresultBASSNN
    )
    ## Prepend "Query_" to indicate that returned genes IDs are often not the
    ## ones encoding SSNN UniProt IDs
    if (length(qresultBASSNN[, "Ensembl_IDs"]) > 0) {
        qresultBASSNN[, "Ensembl_IDs"] <- paste0(
            "Query_",
            qresultBASSNN[, "Ensembl_IDs"]
        )
    }
    # Fixes Query_NA strings in NA cases
    qresultBASSNN[qresultBASSNN[, "Ensembl_IDs"] == "Query_NA", "Ensembl_IDs"] <- NA
    if (length(qresultBASSNN[, "GeneName"]) > 0) {
        qresultBASSNN[, "GeneName"] <- paste0(
            "Query_",
            qresultBASSNN[, "GeneName"]
        )
    }
    # Fixes Query_NA strings in NA cases
    qresultBASSNN[qresultBASSNN[, "GeneName"] == "Query_NA", "GeneName"] <- NA

    ## (2.3) Combine IDM and SSNN_noIDM results
    qresultBA <- rbind(
        cbind(
            ID_Mapping_Type = rep("IDM", nrow(qresultBAIDM)),
            qresultBAIDM
        ),
        cbind(
            ID_Mapping_Type = rep(
                "SSNN_noIDM",
                nrow(qresultBASSNN)
            ),
            qresultBASSNN
        )
    )
    qresultBA <- qresultBA[order(
        gsub("Query_", "", qresultBA$GeneName),
        qresultBA$ID_Mapping_Type
    ), ]
    colnames(qresultBA) <- c(
        "ID_Mapping_Type", "GeneName", "Ensembl_IDs",
        "CHEMBL_CMP_ID", "Molregno", "PubChem_ID",
        "DrugBank_ID", "Drug_Name", "Activity_ID",
        "CHEMBL_Assay_ID", "UniProt_ID", "Target_Desc",
        "Organism", "Standard_Value", "Standard_Units",
        "Standard_Flag", "Standard_Type"
    )
    rownames(qresultBA) <- NULL

    ## (3.1) Export annotation and bioassay results in list
    drug_target_list <- list(Annotation = qresult, Bioassay = qresultBA)
    return(drug_target_list)
}
## Usage:
# ensembl_gene_id <- c("ENSG00000001626", "ENSG00000168748")
# idMap <- getSymEnsUp(EnsDb="EnsDb.Hsapiens.v86",
#              ids=ensembl_gene_id, idtype="ENSEMBL_GENE_ID")
# ens_gene_id <- idMap$ens_gene_id
# res_list90 <- getUniprotIDs(taxId=9606, kt="ENSEMBL",
#                keys=names(ens_gene_id), seq_cluster="UNIREF90", chunksize=1)
# queryBy <- list(molType="gene", idType="ensembl_gene_id",
#                     ids=names(ens_gene_id))
# res_list <- getParalogs(queryBy)
# drug_target_list <- runDrugTarget_Annot_Bioassay(res_list=res_list,
#                         up_col_id="ID_up_sp", ens_gene_id,
#                         chembldb, cmp_ids.rds)
girke-lab/drugTargetInteractions documentation built on Oct. 10, 2022, 10:35 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
girke-lab/drugTargetInteractions
Drug-Target Interactions

R/drugTargetAnnotations_Fct.R
In girke-lab/drugTargetInteractions: Drug-Target Interactions

Documented in cmpIdMapping downloadChemblDb downloadUniChem drugTargetAnnot drugTargetAnnotTable drugTargetBioactivity genConfig getDrugTarget getParalogs getSymEnsUp getUniprotIDs processDrugage runDrugTarget_Annot_Bioassay transformTTD

R Package Documentation

Browse R Packages

We want your feedback!

girke-lab/drugTargetInteractions Drug-Target Interactions

R/drugTargetAnnotations_Fct.R In girke-lab/drugTargetInteractions: Drug-Target Interactions

Documented in cmpIdMapping downloadChemblDb downloadUniChem drugTargetAnnot drugTargetAnnotTable drugTargetBioactivity genConfig getDrugTarget getParalogs getSymEnsUp getUniprotIDs processDrugage runDrugTarget_Annot_Bioassay transformTTD

R Package Documentation

Browse R Packages

We want your feedback!

girke-lab/drugTargetInteractions
Drug-Target Interactions

R/drugTargetAnnotations_Fct.R
In girke-lab/drugTargetInteractions: Drug-Target Interactions