#' Placeholder
#' @description Placeholder
#' @param placeholder
#' @return TBD
#' @export
#'
process_structures <- function(...){
dfs <- purrr::map(list(...), dplyr::mutate_all, as.character)
structures <- dplyr::bind_rows(dfs)
inchikey_smiles <- structures %>%
dplyr::select(inchikey, inchi, smiles) %>%
dplyr::group_by(inchikey) %>%
dplyr::slice(1) %>%
dplyr::ungroup()
#TIf there are more than one InChI or SMILES per InChIKey group (most likely from some of the hand-curated data sources), take the first InChI/SMILES in each group; this will result in a preference for the the
#ChEMBL-assigned InChI/SMILES, if it exists, which is what I prefer.
message('validating smiles...')
parser <- rcdk::get.smiles.parser()
valid.smiles<- pbapply::pbsapply(inchikey_smiles$smiles, webchem::is.smiles)
valid.smiles <- as.data.frame(valid.smiles)
valid.smiles$smiles <- inchikey_smiles$smiles
valid.smiles$inchikey <- inchikey_smiles$inchikey
valid.smiles <- dplyr::distinct(valid.smiles)
message('standardizing valid smiles...')
valid <- valid.smiles$smiles[valid.smiles$valid.smiles==TRUE]
if(!reticulate::py_module_available('molvs')){
message("molvs python module not available. install using reticulate::py_install('molvs')")
}else{
molvs <- reticulate::import('molvs')
standardized_smiles <- pbapply::pbsapply(unique(valid), function(x){
tryCatch({
molvs$standardize_smiles(reticulate::r_to_py(x))
}, warning = function(w) {
message(paste("structure",x,"has an issue"))
}, error = function(e) {
message(paste("structure",x,"is invalid"))
})
})
names(standardized_smiles) <- unique(valid)
valid.df <- standardized_smiles %>%
plyr::ldply() %>%
purrr::set_names(c("smiles", "std_smiles")) %>%
dplyr::filter(!is.na(std_smiles)) %>%
dplyr::inner_join(valid.smiles) %>%
dplyr::select(smiles, std_smiles, inchikey)
structure_ids <- dplyr::select(structures, database, external_id, inchi, inchikey) %>%
dplyr::distinct()
structures_with_std_smiles <- dplyr::inner_join(structure_ids, valid.df)
}
}
#' Function to generate fingerprints.
#' @description This function is best run on a beefy cloud instance (I tested on aws c5d.4xlarge). It will take a while, and use a lot of memory. You really must run "options(java.parameters = '-Xmx16g')" prior to loading rJava package (may require starting a new session), to prevent running into OOM errors.
#' @param placeholder
#' @return TBD
#' @export
#'
generate_fingerprints <- function(structure_df, type){
javamem <- getOption("java.parameters") %>% stringr::str_extract("\\d+.")
if(grepl(".+g", javamem)){
valid <- as.character(structure_df$std_smiles)
parser <- rcdk::get.smiles.parser()
message("parsing smiles")
input.mol <- rcdk::parse.smiles(valid, smiles.parser = parser)
message("doing typing")
pbapply::pblapply(input.mol, rcdk::do.typing)
message("doing aromaticity")
pbapply::pblapply(input.mol, rcdk::do.aromaticity)
message("doing isotopes")
pbapply::pblapply(input.mol, rcdk::do.isotopes)
message("generating fingerprints")
output <- pbapply::pblapply(input.mol, rcdk::get.fingerprint, type = type)
names(output) <- structure_df$inchikey
output
}else{
message(glue::glue("not enough memory allocated for java, try restarting R and running `options(java.parameters = '-Xmx##g')` - where ## is the number of GB of memory you can provide to java. 8 gb minimum. "))
}
}
#' Placeholder
#' @description Placeholder
#' @param placeholder
#' @return TBD
#' @export
#'
process_activities <- function(processed_structures, ...){
ext_id_inchikey <- dplyr::select(processed_structures, external_id, inchikey) %>%
dplyr::distinct()
dfs <- purrr::map(list(...), function(x){
x %>%
dplyr::mutate_all(as.character)
})
activities <- dplyr::bind_rows(dfs) %>%
dplyr::select(external_id, hugo_gene, pchembl_value, standard_relation, standard_value, standard_units, standard_type, evidence_type) %>%
dplyr::mutate(dplyr::across(c('pchembl_value','standard_value'), as.numeric))
message("processing quantitative values...")
##quantitative data
quant <- activities %>% dplyr::filter(evidence_type == "quantitative")
message("summarizing pchembl values...")
pchembl_summary <- quant %>%
dplyr::left_join(x=ext_id_inchikey, y = .) %>%
dplyr::select(inchikey, hugo_gene, pchembl_value) %>%
dplyr::filter(!is.na(pchembl_value)) %>%
dplyr::filter(pchembl_value != "NULL") %>%
dplyr::group_by(inchikey, hugo_gene) %>%
dplyr::summarize("n_quantitative" = dplyr::n(),
"mean_pchembl" = mean(pchembl_value, na.rm = T),
"cv" = raster::cv(pchembl_value),
"sd" = sd(pchembl_value)) %>%
dplyr::ungroup()
message("summarizing assay values...")
chembl_assaytype_summary <- quant %>%
dplyr::left_join(x=ext_id_inchikey, y = .) %>%
dplyr::select(inchikey, hugo_gene, standard_value, standard_type) %>%
dplyr::filter(standard_type %in% c("IC50", "AC50", "EC50", "C50", "Potency", "Ki", "Kd")) %>%
dplyr::filter(!is.na(standard_type)) %>%
dplyr::group_by(inchikey, hugo_gene, standard_type) %>%
dplyr::summarize(mean_value = mean(standard_value)) %>%
dplyr::ungroup() %>%
tidyr::spread(standard_type, mean_value) %>%
dplyr::select(inchikey, hugo_gene, IC50, AC50, EC50, Potency, Ki, Kd) %>%
purrr::set_names(c("inchikey", "hugo_gene", "IC50_nM","AC50_nM",
"EC50_nM", "Potency_nM", "Ki_nM", "Kd_nM"))
quant_data <- dplyr::full_join(pchembl_summary, chembl_assaytype_summary)
##qualitative data
message("summarizing qualitative associations...")
qual <- activities %>% dplyr::filter(evidence_type == "qualitative")
qual_data <- qual %>%
dplyr::left_join(x=ext_id_inchikey, y = .) %>%
dplyr::select(inchikey, hugo_gene) %>%
dplyr::group_by(inchikey, hugo_gene) %>%
dplyr::count(name = "n_qualitative") %>%
dplyr::ungroup() %>%
dplyr::filter(!is.na(hugo_gene)) %>%
dplyr::distinct()
full.db <- dplyr::full_join(quant_data, qual_data, by = c("hugo_gene", "inchikey"))
####need to add confidence calculations...
message('calculating confidence and selectivity scores...')
total_qualitative <- sum(full.db$n_qualitative, na.rm = T)
total_quantitative <- sum(full.db$n_quantitative, na.rm = T)
full.db.scored <- full.db %>%
dplyr::group_by(inchikey, hugo_gene) %>%
dplyr::mutate(total_n = sum(n_quantitative,n_qualitative, na.rm=T)) %>%
dplyr::ungroup()
sd.n <- sd(full.db.scored$total_n, na.rm = T)
mean.n <- mean(full.db.scored$total_n, na.rm = T)
full.db.scored <- full.db.scored %>%
dplyr::mutate(confidence = (total_n-mean.n)/sd.n) %>%
dplyr::group_by(inchikey) %>%
dplyr::mutate(pchembl_d = sum(mean_pchembl, na.rm = T)) %>%
dplyr::ungroup() %>%
dplyr::group_by(hugo_gene) %>%
dplyr::mutate(pchembl_t = sum(mean_pchembl, na.rm = T)) %>%
dplyr::ungroup() %>%
dplyr::group_by(inchikey, hugo_gene) %>%
# mutate(ksi_dt = mean_pchembl/((pchembl_t+pchembl_d)-mean_pchembl)) %>% ##normalizes for target frequency, perhaps not useful for "drug" selectivity
dplyr::mutate(known_selectivity_index = mean_pchembl/pchembl_d) %>%
dplyr::mutate(confidence = signif(confidence,3)) %>%
dplyr::mutate(mean_pchembl = signif(mean_pchembl,3)) %>%
dplyr::mutate(known_selectivity_index = signif(known_selectivity_index,3)) %>%
dplyr::ungroup()
}
#' Placeholder
#' @description Placeholder
#' @param placeholder
#' @return TBD
#' @export
#'
process_names <- function(processed_structures, processed_activities, ...){
ext_id_inchikey <- dplyr::select(processed_structures, external_id, inchikey) %>%
dplyr::distinct() %>%
dplyr::filter(inchikey %in% processed_activities$inchikey)
dfs <- purrr::map(list(...), function(x){
x %>%
dplyr::mutate_all(as.character)
})
names <- dplyr::bind_rows(dfs) %>%
dplyr::inner_join(ext_id_inchikey)
synonyms <- dplyr::select(names, inchikey, cmpd_name) %>%
dplyr::distinct() %>%
dplyr::group_by(inchikey) %>%
dplyr::mutate(pref_name = cmpd_name[1]) %>% ##first occurrence of name is the "preferred name"
dplyr::mutate(upper=toupper(cmpd_name),
dupl=duplicated(upper,fromLast=FALSE)) %>%
dplyr::filter(dupl!=TRUE) %>%
dplyr::select(-upper,-dupl) %>%
dplyr::rename(synonym = cmpd_name) %>%
dplyr::ungroup()
distinct_names <- synonyms %>%
dplyr::select(inchikey, pref_name) %>%
dplyr::distinct()
list("names" = distinct_names, "synonyms" = synonyms)
}
#' Placeholder
#' @description Placeholder
#' @param placeholder
#' @return TBD
#' @export
#'
filter_structures <- function(processed_structures, processed_activities){
processed_structures %>%
dplyr::select(inchikey, inchi, std_smiles) %>%
dplyr::filter(inchikey %in% processed_activities$inchikey) %>%
dplyr::distinct()
}
#' Placeholder
#' @description Placeholder
#' @param placeholder
#' @return TBD
#' @export
#'
process_external_ids <- function(processed_structures, processed_activities){
ext_id_inchikey <- dplyr::select(processed_structures, external_id, inchikey, database) %>%
dplyr::filter(inchikey %in% processed_activities$inchikey) %>%
dplyr::distinct()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.