data-raw/getPertInfo.R

## Function to get PubChem CIDs by name
name2pubchemCID <- function(x) {
  #print(x)
  url <- paste0('"https://pubchem.ncbi.nlm.nih.gov/compound/', x, '"')
  system2("wget", paste(url, '-O tmp_file --quiet'))
  pc_page <- readLines("tmp_file", warn=FALSE)
  unlink("tmp_file")
  cid_line <- pc_page[grepl("meta name=\"pubchem_uid_value\" content=", pc_page)][1]
  cid <- gsub("(^.*content=\")|\">$", "", cid_line)
  if(is.na(cid[1])){
    return("NotFound")
  } else {
    return(cid)
  }
}
## Usage:
# ids <- c("rhodomyrtoxin-b", "SD-169", "not_avail_test", "baccatin-III")
# vapply(ids, function(i) name2pubchemCID(i), character(1))

# Get compound information from CLUE API, can be replaced by directly downloading
# the Touchstone compound annotation table at https://clue.io/touchstone#
# after selecting `Name`, `MoA`, `Target` columns and `Compounds` perturbagen type.
getPertInfo <- function(pert_iname, user_key){
  pert_info = NULL
  col = c("canonical_smiles", "description", "inchi_key", "inchi_string", "moa", 
          "molecular_formula", "pert_id", "pert_iname", "pert_type", "pubchem_cid", "target")

  for(iname in pert_iname){
    json <- system(paste0('curl -X GET --globoff --header "Accept: application/json" --header "user_key: ',
    user_key, '" "https://api.clue.io/api/perts?filter=%7B%22where%22%3A%7B%22pert_iname%22%3A%22',
    iname,'%22%7D%7D"'), intern = TRUE, ignore.stderr = TRUE)
    
    if(length(json)==0){
      message("Curl error happens on ", iname)
      next()
    }
    if(grepl("error",json) | json=="[]"){
      message("API error happens on ", iname)
      next()
    }
    require(jsonlite)
    jsonr <- fromJSON(json)
    # print(i)
    # i=i+1
    # rownames(jsonr) <- pert_id
    for(item in col){
      if(! item %in% colnames(jsonr)){
        new <- data.frame(NA)
        colnames(new) <- item
        jsonr = cbind(jsonr, new)
      }
    }
    pert_info <- rbind(pert_info, jsonr[,col])
  }
  return(pert_info)
}

# Function to query known drug annotations in ChEMBL db by providing drug
# "chembl_id", "PubChem_ID" or "DrugBank_ID". 
# The result annotation table contains the following columns:
# 
# "Query ID", "ChEMBL ID", "molregno", "PubChem_ID", "DrugBank_ID", "Preferred Name",
# "Max Phase", "Therapeutic Flag", "Molecule Type",  "First Approval", "Oral", "Parenteral",
# "Topical", "Natural Product", "Inorganic Flag", "USAN Year", "Availability Type", 
# "USAN Stem", "USAN Stem Definition", "Indication Class", "Withdrawn Flag", 
# "Withdrawn Year", "Withdrawn Country", "Withdrawn Reason", "Withdrawn Class",
# "mechanism_of_action", "Action Type", "Direct Interaction", "Molecular Mechanism",
# "Disease Efficacy", "Mechanism Comment", "Selectivity Comment".
# 
drugAnnot <- function(queryBy = list(molType = NULL, idType = NULL, ids = NULL),
                      cmpid_file = file.path(config$resultsPath, "cmp_ids.rds"),
                      config = genConfig()) {
  if (any(names(queryBy) != c("molType", "idType", "ids"))) {
    stop(
      "All three list components in 'queryBy' (named: 'molType',",
      " 'idType' and 'ids') need to be present."
    )
  }
  if (any(vapply(queryBy, length, integer(1)) == 0)) {
    stop(
      "All components in 'queryBy' list need to be populated with ",
      "corresponding character vectors."
    )
  }
  
  ## Load ChEMBL SQLite
  ## ChEMBL SQLite downloaded from here: ftp://ftp.ebi.ac.uk/pub/databases/
  ##    chembl/ChEMBLdb/latest/chembl_24_1_sqlite.tar.gz
  ## after unpacking you get chembl_xx.db
  library(RSQLite)
  dbpath <- config$chemblDbPath
  mydb <- dbConnect(SQLite(), dbpath)
  
  ## Input file for following step was generated by cmpIdMapping()
  cmp_ids <- readRDS(cmpid_file)
  rownames(cmp_ids) <- as.character(cmp_ids$molregno)
  
  ## Query by compounds
  if (queryBy$molType == "cmp") {
    ## ID conversions
    if (queryBy$idType == "molregno") {
      cmpvec <- queryBy$ids
    }
    if (queryBy$idType == "chembl_id") {
      cmp_ids <- cmp_ids[!duplicated(cmp_ids$chembl_id), ]
      cmpvec <- as.character(cmp_ids$molregno)
      names(cmpvec) <- as.character(cmp_ids$chembl_id)
      cmpvec <- cmpvec[queryBy$ids]
    }
    if (queryBy$idType == "PubChem_ID") {
      cmp_ids <- cmp_ids[!duplicated(cmp_ids$PubChem_ID), ]
      cmpvec <- as.character(cmp_ids$molregno)
      names(cmpvec) <- as.character(cmp_ids$PubChem_ID)
      cmpvec <- cmpvec[queryBy$ids]
    }
    if (queryBy$idType == "DrugBank_ID") {
      cmp_ids <- cmp_ids[!duplicated(cmp_ids$DrugBank_ID), ]
      cmpvec <- as.character(cmp_ids$molregno)
      names(cmpvec) <- as.character(cmp_ids$DrugBank_ID)
      cmpvec <- cmpvec[queryBy$ids]
    }
    idvec <- paste0("(\"", paste(cmpvec, collapse = "\", \""), "\")")
    
    # "Query ID",  
    # "Direct Interaction", "Molecular Mechanism",
    # "Disease Efficacy", "Mechanism Comment", "Selectivity Comment".
    query <- paste(
      "SELECT a.molregno, a.chembl_id, a.pref_name, a.max_phase,
                a.therapeutic_flag, a.molecule_type, a.first_approval, a.oral,
                a.parenteral, a.topical, a.natural_product, a.inorganic_flag,
                a.usan_year, a.availability_type, a.usan_stem, a.usan_stem_definition,
                a.indication_class, a.withdrawn_flag, a.withdrawn_year, a.withdrawn_country,
                a.withdrawn_reason, a.withdrawn_class,
                b.mechanism_of_action, b.action_type, b.direct_interaction,
                b.molecular_mechanism, b.disease_efficacy, b.mechanism_comment,
                b.selectivity_comment
                      FROM molecule_dictionary AS a
                      LEFT JOIN drug_mechanism AS b ON a.molregno = b.molregno
                      WHERE a.molregno IN", idvec,
      "GROUP BY a.molregno
                       ORDER BY a.molregno, a.pref_name"
    )
    myquery <- dbSendQuery(mydb, query)
    activityassays <- dbFetch(myquery)
    dbClearResult(myquery)
  }
  resultDF <- data.frame(cmp_ids[as.character(activityassays$molregno), -c(2, 4)], 
                         activityassays[, -c(1, 2)])
  dbDisconnect(mydb)
  
  ## Remove rows with identical values in all fields
  resultDF <- resultDF[!duplicated(apply(resultDF, 1, paste, collapse = "_")), ]
  ## Add query column and sort rows in result table according to query
  resultDF <- data.frame(
    QueryIDs = names(cmpvec),
    resultDF[match(names(cmpvec), resultDF[[queryBy$idType]]), ]
  )
  rownames(resultDF) <- NULL
  return(tibble::tibble(resultDF))
}
girke-lab/signatureSearch documentation built on Feb. 21, 2024, 8:32 a.m.