##
## clue.R: collect data from clue.io
##
## NOTE: clue.io API access requires registration
##
## API key is assigned to the clue.io user profile.
## This script reads it from CLUE_USER_KEY environment variable.
##
## Usage: call the main() from command line:
## R -e 'source("R/clue.R"); main()'
library(assertthat)
library(data.table)
library(dplyr)
library(glue)
library(googlesheets4)
library(httr)
library(jsonlite)
library(readr)
library(whisker)
##
## Settings of global variables
##
## clue.io API key, read from the CLUE_USER_KEY environment variable.
USER_KEY <- Sys.getenv("CLUE_USER_KEY")
## quick verification: Sys.getenv() returns "" (never NULL) for an unset
## variable, so an empty string is the only "missing key" case to detect.
assertthat::assert_that(
  nzchar(USER_KEY),
  msg = "CLUE_USER_KEY environment variable is not set or is empty"
)
## TSV file with the target list; main() reads it when TARGET_LIST_ID is unset.
TARGET.INPUT <- "data/target_list.tsv"
## Id of the Google spreadsheet holding the target list ("" when unset).
TARGET.LIST.ID <- Sys.getenv("TARGET_LIST_ID")
## Name of the environment variable that contains the JSON token used for
## Google Sheets authentication.
SERVICE_TOKEN_JSON_VAR_NAME <- "SERVICE_TOKEN_JSON"
## Common prefix of every clue.io API request URL.
API_BASE <- "https://api.clue.io/api/"
## Extra httr configuration passed to each GET request (NULL = none).
VERBOSE <- NULL
## for verbose httr requests use the following instead:
## VERBOSE <- verbose()
## Output directory and the files written into it.
OUTPUT <- "OUTPUT"
CLUE.RDS <- glue::glue("{OUTPUT}/clue.rds")
CLUE.COLLAPSED.TSV <- glue::glue("{OUTPUT}/clueCollapsed.tsv")
PERTS.TSV <- glue::glue("{OUTPUT}/perts.tsv")
## result of the perts API call with all of its selected columns
PERTS_WIDE.TSV <- glue::glue("{OUTPUT}/perts_wide.tsv")
TARGET_LIST.RDS <- glue::glue("{OUTPUT}/targetList.rds")
# TODO: do we need information from these endpoints as well?
# - rep_fda_product
# - rep_fda_orange-book_term
# - rep_fda_exclusivity
#' Main function
#'
#' Reads the list of targets (from the Google spreadsheet identified by
#' TARGET.LIST.ID when it is non-empty, otherwise from the TSV file
#' TARGET.INPUT), downloads the matching data from clue.io and writes the
#' outputs: the joined result as RDS, the collapsed table as TSV and the
#' cleaned target list as RDS.
#'
#' @return The value of the final `saveRDS()` call; the function is called
#'   for its side effects.
main <- function() {
  ## read input list of targets: spreadsheet when an id is configured,
  ## local TSV file otherwise
  targetList <- if (nzchar(TARGET.LIST.ID)) {
    message(glue::glue("reading data from spreadsheet..."))
    sheet <- readGoogleSpreadSheet(SERVICE_TOKEN_JSON_VAR_NAME, TARGET.LIST.ID)
    message(glue::glue("reading data from spreadsheet done"))
    sheet
  } else {
    message(glue::glue("reading data from {TARGET.INPUT}..."))
    tsv <- readr::read_tsv(TARGET.INPUT)
    message(glue::glue("reading from {TARGET.INPUT} done"))
    tsv
  }

  ## the first three columns get fixed names; rows without a HUGO symbol
  ## are dropped
  targetList <- targetList %>%
    rename(HUGO = 1, Label = 2, UNIPROT_KB_ID = 3) %>%
    select(HUGO, Label, UNIPROT_KB_ID) %>%
    dplyr::filter(!is.na(HUGO))
  targetListVector <- targetList[["HUGO"]]
  print(glue::glue("Input list: {targetListVector}"))

  # prepare output directory
  dir.create(OUTPUT, recursive = TRUE, showWarnings = FALSE)

  message("downloading data from clue.io...")
  result <- download(targetListVector)
  message("download finished")

  # create output files ---------------------------------------------------
  # full joined result as RDS
  saveRDS(result, CLUE.RDS)
  message(glue::glue("{CLUE.RDS} created"))

  ## collapsed table as TSV
  resultCollapsed <- collapseResult(result)
  data.table::fwrite(resultCollapsed, CLUE.COLLAPSED.TSV, sep = "\t")
  message(glue::glue("{CLUE.COLLAPSED.TSV} created"))

  ## cleaned target list as RDS
  saveRDS(targetList, TARGET_LIST.RDS)
}
#' Read Google SpreadSheet
#'
#' Authenticates against Google Sheets with read-only scope, using the value
#' of the given environment variable as credentials, then reads the
#' spreadsheet.
#'
#' @param authTokenEnvName name of the OS environment variable containing
#'   the JSON token
#' @param driveFileId identifier of the spreadsheet, passed to
#'   `googlesheets4::read_sheet()`
#'
#' @return the result of `googlesheets4::read_sheet()` called without a
#'   sheet argument
readGoogleSpreadSheet <- function(authTokenEnvName, driveFileId) {
  readOnlyScope <- "https://www.googleapis.com/auth/spreadsheets.readonly"
  googlesheets4::gs4_auth(
    path = Sys.getenv(authTokenEnvName),
    scopes = readOnlyScope
  )
  googlesheets4::read_sheet(driveFileId)
}
#' Get drug-targets information from clue.io
#'
#' Client side of the rep_drug_targets API endpoint. Records are filtered
#' by their `name` field.
#'
#' @param ... official HUGO gene symbols (character values, combined
#'   with `c()`)
#'
#' @return data.frame with "name", "pert_iname",
#' "private_source" and "source" columns.
rep_drug_targets <- function(...) {
  endpoint <- "rep_drug_targets"
  ## filter of the form {"where": {"name": {"inq": [...]}}}
  whereClause <- list(name = list(inq = c(...)))
  filterJson <- jsonlite::toJSON(list(where = whereClause))
  ## concatenate URL with parameters
  url <- glue::glue("{API_BASE}{endpoint}?filter={filterJson}")
  getJSONContentAsDataFrame(getWithUserKey(url))
}
#' Get sample information from clue.io
#'
#' Client side of the rep_samples API endpoint. Records are filtered by
#' their `pert_iname` field.
#'
#' @param ... perturbagen names (`pert_iname` values, combined with `c()`)
#'
#' @return data.frame with "broad_id", "catalog_no",
#' "expected_mass", "id", "InChIKey", "pert_id", "pert_iname", "pubchem_cid",
#' "purity", "qc_filename", "qc_ID", "qc_result", "sample_ID", "smiles",
#' "vendor" and "vendor_name" columns.
rep_samples <- function(...) {
  endpoint <- "rep_samples"
  ## filter of the form {"where": {"pert_iname": {"inq": [...]}}}
  whereClause <- list(pert_iname = list(inq = c(...)))
  filterJson <- jsonlite::toJSON(list(where = whereClause))
  ## concatenate URL with parameters
  url <- glue::glue("{API_BASE}{endpoint}?filter={filterJson}")
  getJSONContentAsDataFrame(getWithUserKey(url))
}
#' MoAs from clue.io
#'
#' Client side of rep_drug_moas API endpoint. Records are filtered by
#' their `pert_iname` field.
#'
#' @param ... perturbagen names (`pert_iname` values, combined with `c()`)
#'
#' @return data.frame built from the JSON response; `download()` relies on
#'   its `pert_iname`, `name` and `id` columns.
rep_drug_moas <- function(...) {
  endpoint <- "rep_drug_moas"
  ## filter of the form {"where": {"pert_iname": {"inq": [...]}}}
  whereClause <- list(pert_iname = list(inq = c(...)))
  filterJson <- jsonlite::toJSON(list(where = whereClause))
  ## concatenate URL with parameters
  url <- glue::glue("{API_BASE}{endpoint}?filter={filterJson}")
  getJSONContentAsDataFrame(getWithUserKey(url))
}
#' Drug-disease/indication from clue.io
#'
#' Client side of rep_drug_indications API endpoint. Records are filtered
#' by their `pert_iname` field.
#'
#' @param ... perturbagen names (`pert_iname` values, combined with `c()`)
#'
#' @return data.frame of perturbagens with associated indications and diseases.
rep_drug_indications <- function(...) {
  endpoint <- "rep_drug_indications"
  ## filter of the form {"where": {"pert_iname": {"inq": [...]}}}
  whereClause <- list(pert_iname = list(inq = c(...)))
  filterJson <- jsonlite::toJSON(list(where = whereClause))
  ## concatenate URL with parameters
  url <- glue::glue("{API_BASE}{endpoint}?filter={filterJson}")
  getJSONContentAsDataFrame(getWithUserKey(url))
}
#' Get drug information from clue.io
#'
#' Client side of rep_drugs API endpoint. Records are filtered by their
#' `pert_iname` field; no field restriction is sent (see the disabled
#' `fields` list below).
#'
#' @param ... perturbagen names (`pert_iname` values, combined with `c()`)
#'
#' @return data.frame of the details.
rep_drugs <- function(...) {
  endpoint <- "rep_drugs"
  ## Disabled explicit field selection, kept for reference:
  # fields = list(
  #   pert_iname = TRUE,
  #   synonyms = TRUE,
  #   final_status = TRUE,
  #   status_source = TRUE,
  #   orange_book = TRUE, # TODO: bug report to CLUE.IO: cannot be enforced to
  #   drugbank_id = TRUE, # include "orange_book" field(s), if it has no
  #   chembl_id = TRUE,   # values in the result set
  #   iuphar_id = TRUE,
  #   ttd_id = TRUE,
  #   clinical_notes = TRUE,
  #   id = TRUE,
  #   cid = TRUE,
  #   animal_only = TRUE,
  #   in_cmap = TRUE
  # ),
  ## filter of the form {"where": {"pert_iname": {"inq": [...]}}}
  whereClause <- list(pert_iname = list(inq = c(...)))
  filterJson <- jsonlite::toJSON(list(where = whereClause))
  ## concatenate URL with parameters
  url <- glue::glue("{API_BASE}{endpoint}?filter={filterJson}")
  getJSONContentAsDataFrame(getWithUserKey(url))
}
#' Get perturbation data from clue.io.
#'
#' Client side of perts API endpoint. Records are filtered by their
#' `target` field and reduced to a fixed set of columns.
#'
#' @param ... target names to look up (character values, combined with
#'   `c()`); `download()` passes HUGO gene symbols here.
#'
#' @return data.frame with the columns "alt_name", "pert_id", "inchi_key",
#'   "pert_url", "pcl_membership", "target", "pert_iname", "moa",
#'   "description", "status" and "pubchem_cid". Columns missing from the
#'   response are filled with NA. An empty response yields a zero-row
#'   data.frame with the same columns.
#' NOTE: Targets with no matches excluded from the result table.
#' @import dplyr
perts <- function(...) {
  columnNames <- c(
    # id,
    # other fields?
    "alt_name",
    "pert_id",
    "inchi_key",
    "pert_url",
    # pert_summary, # maybe "description" is enough
    "pcl_membership",
    "target",
    "pert_iname",
    "moa",
    "description",
    "status",
    "pubchem_cid"
  )
  apiFunction <- "perts"
  filterParams <- list(
    where = list(
      target = list(inq = c(...))
    )
  )
  filterParams <- jsonlite::toJSON(filterParams)
  ## concatenate URL with parameters
  requestUrl <- glue::glue("{API_BASE}{apiFunction}?filter={filterParams}")
  response <- getWithUserKey(requestUrl)
  responseFrame <- getJSONContentAsDataFrame(response)

  ## An empty response arrives as a data.frame without rows. Assigning a
  ## length-one value to a new column of such a frame with `$<-` fails
  ## ("replacement has 1 row, data has 0"), so the empty result is built
  ## directly: zero rows, all expected columns.
  if (nrow(responseFrame) == 0) {
    emptyColumns <- rep(list(logical(0)), length(columnNames))
    names(emptyColumns) <- columnNames
    return(as.data.frame(emptyColumns, stringsAsFactors = FALSE))
  }

  ## if the response does not contain each column,
  ## it is augmented by NA vectors
  missingColumns <- setdiff(columnNames, names(responseFrame))
  if (length(missingColumns) > 0) {
    responseFrame[, missingColumns] <- NA
  }

  ## selection of relevant fields, in the order given by columnNames
  responseFrame %>%
    select(all_of(columnNames))
}
#' GET request
#'
#' Sends an HTTP GET with the clue.io key (global USER_KEY) in the
#' `user_key` request header. The global VERBOSE is passed along as
#' additional httr configuration.
#'
#' @param requestUrl complete request URL, including the query string
#'
#' @return list object that represents HTTP response data
getWithUserKey <- function(requestUrl) {
  authHeader <- httr::add_headers(user_key = USER_KEY)
  httr::GET(url = requestUrl, config = authHeader, VERBOSE)
}
#' Convert HTTP response body to data.frame
#'
#' Utility function, used by clue.io API calls. The body is read as text,
#' parsed as JSON and coerced with `as.data.frame()`.
#'
#' @param httrResponse result of an HTTP request performed by an httr function
#'
#' @return A data.frame object that contains data coerced from JSON response.
getJSONContentAsDataFrame <- function(httrResponse) {
  bodyText <- httr::content(httrResponse, as = "text")
  parsed <- jsonlite::fromJSON(bodyText)
  as.data.frame(parsed)
}
#' Download data from endpoints.
#'
#' Download and join data from different clue.io REST API endpoints
#' (perts, rep_drug_targets, rep_drug_moas, rep_drug_indications, rep_drugs
#' and rep_samples). The drug-target table is the left-hand side of every
#' join. As a side effect the perts data is written to PERTS_WIDE.TSV and
#' PERTS.TSV.
#'
#' @param ... HUGO names of genes, forwarded to `perts()` and
#'   `rep_drug_targets()`.
#'
#' @return Final data.frame composed from multiple datasets, sorted by HUGO.
#' @import dplyr
download <- function(...) {
  ## helper function, applied element-wise through Vectorize():
  ## a NULL element becomes NA, lists and vectors are returned unchanged
  null.to.na <- function(x) {
    if (is.list(x) || is.vector(x)) return(x)
    if (is.null(x)) x <- NA
    return(x)
  }
  null.to.na <- Vectorize(null.to.na)

  ## function to transform and save perts data
  pertsSaveAndExport <- function(pertsData) {
    ## wide export: all columns of the perts result
    ## (works on the argument, not on a variable of the enclosing function)
    wide <- pertsData %>%
      mutate(
        alt_name = null.to.na(alt_name),
        pert_url = null.to.na(pert_url),
        moa = null.to.na(moa)
      )
    data.table::fwrite(wide, file = PERTS_WIDE.TSV, sep = "\t")
    message(glue::glue("{PERTS_WIDE.TSV} created"))

    ## narrow export: targets of a row pasted into one "|"-separated string
    narrow <- pertsData %>%
      rowwise() %>%
      mutate(target = paste(target, collapse = "|")) %>%
      ## what are the other fields?
      select(target, pert_iname, pubchem_cid) %>%
      arrange(target, pert_iname)
    data.table::fwrite(narrow, file = PERTS.TSV, sep = "\t")
    message(glue::glue("{PERTS.TSV} created"))
    invisible(NULL)
  }

  ## get data from "perts" endpoint; the local name differs from the
  ## perts() function so that the function is not shadowed
  pertsFrame <- perts(...)
  pertsSaveAndExport(pertsFrame)
  # Renaming to avoid conflict with moa from repDrugMoAs
  pertsFrame <- pertsFrame %>%
    rename(
      moa_from_perts = moa,
      pubchem_cid_from_perts = pubchem_cid
    )
  # TODO: alt_name excluded since its complicated list values with NULLs
  # select(-c(alt_name))

  repDrugTargets <- rep_drug_targets(...) %>%
    rename(HUGO = name) %>%
    select(-c(id))
  repDrugMoAs <- rep_drug_moas(repDrugTargets$pert_iname) %>%
    select(pert_iname, moa = name, -c(id))
  repDrugIndications <- rep_drug_indications(repDrugTargets$pert_iname) %>%
    rename(indication_source = source) %>%
    select(-c(id))
  repDrugs <- rep_drugs(repDrugTargets$pert_iname) %>%
    mutate(rep_drugs.pert_iname = pert_iname) %>%
    select(-c(id))
  repSamples <- rep_samples(repDrugTargets$pert_iname) %>%
    # In this case "pert_id" the join key to "pert" dataset
    select(pert_iname, pert_id, InChIKey, pubchem_cid, broad_id) %>%
    rename(
      # rep_samples.pert_iname = pert_iname,
      rep_samples.pert_id = pert_id,
      # rep_samples.pubchem_cid = pubchem_cid,
      rep_samples.InChIKey = InChIKey
    )

  ## joining tables; no `by` is given, so each join uses the columns the
  ## two tables have in common
  result <- repDrugTargets %>%
    left_join(repDrugs) %>%
    left_join(repDrugMoAs) %>%
    left_join(repDrugIndications) %>%
    left_join(repSamples) %>%
    left_join(pertsFrame) %>%
    arrange(HUGO) %>%
    mutate(
      source = null.to.na(source),
      orange_book = null.to.na(orange_book),
      # variables from perts
      alt_name = null.to.na(alt_name),
      moa_from_perts = null.to.na(moa_from_perts),
      pcl_membership = null.to.na(pcl_membership),
      pert_url = null.to.na(pert_url)
    ) %>%
    ## re-position and exclusion of columns
    select(
      HUGO,
      pert_iname,
      pubchem_cid,
      synonyms,
      moa,
      moa_from_perts,
      final_status,
      chembl_id,
      source,
      ttd_id,
      drugbank_id,
      status_source, ## source of final_status
      clinical_notes,
      orange_book,
      disease_area,
      indication,
      indication_source,
      alt_name,
      pert_id, # BRD-...
      rep_samples.pert_id,
      inchi_key,
      rep_samples.InChIKey,
      pert_url,
      pcl_membership,
      ## exclude some optional columns; any_of() tolerates their absence,
      ## a bare column name would raise an error in that case
      -any_of(c("in_cmap", "iuphar_id", "animal_only"))
    )
  result
}
#' Collapse table from clue
#'
#' Remove redundancies resulted by left joins on clue.io sub-tables.
#' Within each (HUGO, pert_iname) group the distinct values of
#' `drugbank_id`, `pubchem_cid` and `chembl_id` are pasted into a single
#' "|"-separated string each; duplicated rows are then dropped.
#'
#' @param result tibble composition of clue.io sub-tables; must contain the
#'   columns HUGO, pert_iname, moa, final_status, status_source,
#'   drugbank_id, chembl_id and pubchem_cid.
#'
#' @return collapsed table (ungrouped) with the eight columns listed above
collapseResult <- function(result) {
  result %>%
    select(
      HUGO,
      pert_iname,
      moa,
      final_status,
      status_source,
      drugbank_id,
      chembl_id,
      pubchem_cid
    ) %>%
    group_by(HUGO, pert_iname) %>%
    mutate(
      drugbank_id = paste(unique(drugbank_id), collapse = "|"),
      pubchem_cid = paste(unique(pubchem_cid), collapse = "|"),
      ## `collapse` belongs to paste(), not to unique(); otherwise the ids
      ## are never joined into one string
      chembl_id = paste(unique(chembl_id), collapse = "|")
    ) %>%
    distinct() %>%
    ## grouping is only needed for the collapsing step
    ungroup()
}
## Add the following code to your website.
## For more information on customizing the embed code, read Embedding Snippets.