knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
library(ChemLogic)
## first we load our receptor 1 data, this is for the androgen receptor receptor_1_data <- load_chem_data(path = "inst/extdata/Curated_AR_TOX21.xlsx", file_type = "xlsx", filter = TRUE, filter_type = "Pubchem") ## first we load our receptor 2 data, this is for the estrogen receptor receptor_2_data <- load_chem_data(path = "inst/extdata/Curated_ER_TOX21.xlsx", file_type = "xlsx", filter = TRUE, filter_type = "Pubchem") ## load the chemical labels that correspond to the pubchem fingerprint columns pubchem_fingerprints_cleaned <- load_chem_data(path = "inst/extdata/pubchem_fingerprints_cleaned.txt", file_type = "txt", filter = FALSE) ## get an array of the fingerprints fingerprints <- pubchem_fingerprints_cleaned[,2] ## load test data for applying results to pfas_data <- load_chem_data(path = "inst/extdata/pfas.csv", file_type = "csv", filter = TRUE, filter_type = "Pubchem")
receptor1_binding_type_data <- load_chem_data(path = "inst/extdata/AR_with_outcome.csv", file_type = "csv", filter = FALSE) receptor2_binding_type_data <- load_chem_data(path = "inst/extdata/ER_with_outcome.csv", file_type = "csv", filter = FALSE)
table(receptor_1_data$Name %in% receptor1_binding_type_data$Name) receptor_1_data_w_binding <- merge(receptor_1_data, receptor1_binding_type_data, by = "Name") table(receptor_1_data_w_binding$Label_names) receptor_2_data_w_binding <- merge(receptor_2_data, receptor2_binding_type_data, by = "Name") table(receptor_2_data_w_binding$Label_names)
Clean up the outcome data
relabel_outcomes <- function(x) { x$Label_names <- tolower(x$Label_names) x$Label_names <- ifelse(x$Label_names == "inconclusive", "inactive", x$Label_names) x$Label_names <- ifelse(x$Label_names == "inactive", 0, 1) x$Label_names <- as.factor(x$Label_names) return(x) } receptor_1_data <- relabel_outcomes(receptor_1_data) receptor_2_data <- relabel_outcomes(receptor_2_data) receptor_1_data_w_binding <- relabel_outcomes(receptor_1_data_w_binding) receptor_2_data_w_binding <- relabel_outcomes(receptor_2_data_w_binding) table(receptor_1_data$Label_names) table(receptor_2_data$Label_names)
## one of our outcome data variables has blanks for agonists/antagonists, let's change that for better plotting receptor_1_data_w_binding$ASSAY_OUTCOME <- ifelse(receptor_1_data_w_binding$ASSAY_OUTCOME =="", "no data", receptor_1_data_w_binding$ASSAY_OUTCOME) receptor_1_data_w_binding$ASSAY_OUTCOME <-as.factor(receptor_1_data_w_binding$ASSAY_OUTCOME) ## estrogen receptor receptor_2_data_w_binding$ASSAY_OUTCOME <- ifelse(receptor_2_data_w_binding$ASSAY_OUTCOME =="", "no data", receptor_2_data_w_binding$ASSAY_OUTCOME) receptor_2_data_w_binding$ASSAY_OUTCOME <-as.factor(receptor_2_data_w_binding$ASSAY_OUTCOME) table(receptor_2_data_w_binding$ASSAY_OUTCOME)
receptor_1_cluster_results <- cluster_chem_data(receptor_type = "Androgen", data = receptor_1_data, rank = 5, filter_type = "Pubchem") receptor_2_cluster_results <- cluster_chem_data(receptor_type = "Estrogen", data = receptor_2_data, rank = 5, filter_type = "Pubchem") receptor_1_cluster_results$kmean_plot receptor_2_cluster_results$kmean_plot
receptor_1_SVD_grps_results <- plot_SVD_kmeans(receptor_type = "Androgen", log_SVD_results = receptor_1_cluster_results$logsvd_results, binary_data = receptor_1_data, ## binary data k = 2, ## number of k-means clusters from plot above outcome_label = quo(Label_names), endo_ligand = TRUE, ## plot endogenous ligand? endo_ligand_id = '144207887', ## chemical name for endogenous ligand of interest additional_label = FALSE, ## if you want to colorize by a binding type label = TRUE binding_label = "ASSAY_OUTCOME") ## label name for alternate label receptor_1_SVD_grps_results$plot_kmeans receptor_1_SVD_grps_results$plot_outcome_label receptor_1_data_wPCs_kgrps <- receptor_1_SVD_grps_results$updated_data receptor_2_SVD_grps_results <- plot_SVD_kmeans(receptor_type = "Estrogen", log_SVD_results = receptor_2_cluster_results$logsvd_results, binary_data = receptor_2_data, k = 3, outcome_label = quo(Label_names), endo_ligand = TRUE, endo_ligand_id = '144213948', additional_label = FALSE, binding_label = "ASSAY_OUTCOME") # is the name for testosterone the endogenous ligand receptor_2_SVD_grps_results$plot_kmeans receptor_2_SVD_grps_results$plot_outcome_label receptor_2_data_wPCs_kgrps <- receptor_2_SVD_grps_results$updated_data receptor_1_data_wPCs_kgrps$log_SVM_group <-as.factor(receptor_1_data_wPCs_kgrps$log_SVM_group) receptor_2_data_wPCs_kgrps$log_SVM_group <-as.factor(receptor_2_data_wPCs_kgrps$log_SVM_group)
Running the MCMC logic regression
niter <- 10000 n_burn_in <- 1000 mx_size <- 50 top_fraction <- 0.5 ## run the logic MCMC on the androgen receptor data receptor_1_logic_MCMC_results <- run_logic_MCMC( outcome_label = quo(Label_names), filter_type = "Pubchem", binary_data = receptor_1_data_wPCs_kgrps, subgroups = TRUE, subgroup_label = quo(log_SVM_group), mx_size = mx_size, ntrees = 4, n_burn_in = n_burn_in, niter = niter, outcome_permuted = FALSE, top_fraction = top_fraction, find_pop_tree = FALSE ) ## run the logic MCMC on the estrogen receptor data receptor_2_logic_MCMC_results <- run_logic_MCMC( outcome_label = quo(Label_names), filter_type = "Pubchem", binary_data = receptor_2_data_wPCs_kgrps, subgroups = TRUE, subgroup_label = quo(log_SVM_group), mx_size = mx_size, ntrees = 1, n_burn_in = n_burn_in, niter = niter, hyperparameters = log(2), outcome_permuted = FALSE, top_fraction = top_fraction )
## collecting results for MCMC logic using all active as positive outcomes vs. non-binding for AR receptor_1_MCMC_results_main_outcome <- purrr::map(.x = receptor_1_logic_MCMC_results[2], .f = collect_logic_MCMC_results, fingerprints = fingerprints, niter = niter, binary_data = receptor_1_data_wPCs_kgrps, ## still need to carry the new data with groups in for rule making filter_type = "Pubchem", top_fraction = top_fraction) ## collecting results for MCMC logic using all active as positive outcomes vs. non-binding for ER receptor_2_MCMC_results_main_outcome <- purrr::map(.x = receptor_2_logic_MCMC_results[2], .f = collect_logic_MCMC_results, fingerprints = fingerprints, niter = niter, binary_data = receptor_2_data_wPCs_kgrps, filter_type = "Pubchem", top_fraction = top_fraction) ## collecting results for MCMC logic using subgroups as positive outcomes vs. non-binding receptor_1_MCMC_results_subgroups <- purrr::map(.x = receptor_1_logic_MCMC_results$MCMC_subgroups_analysis, .f = collect_logic_MCMC_results, fingerprints = fingerprints, niter = niter, binary_data = receptor_1_data_wPCs_kgrps, filter_type = "Pubchem", top_fraction = top_fraction) receptor_2_MCMC_results_subgroups <- purrr::map(.x = receptor_2_logic_MCMC_results$MCMC_subgroups_analysis, .f = collect_logic_MCMC_results, fingerprints = fingerprints, niter = niter, binary_data = receptor_2_data_wPCs_kgrps, filter_type = "Pubchem", top_fraction = top_fraction)
plot_model_size(receptor_1_MCMC_results_main_outcome$MCMC_main_outcomes_analysis, receptor = "Androgen") plot_model_size(receptor_2_MCMC_results_main_outcome$MCMC_main_outcomes_analysis, receptor = "Estrogen")
AR_var_use <- plot_variables_in_MCMC(MCMC_results = receptor_1_MCMC_results_main_outcome$MCMC_main_outcomes_analysis, receptor = "Androgen", top = 20, iters = niter) AR_var_use ER_var_use <- plot_variables_in_MCMC(MCMC_results = receptor_2_MCMC_results_main_outcome$MCMC_main_outcomes_analysis, receptor = "Estrogen", top = 20, iters = niter) ER_var_use
MCMC_var_use_comparisons <-plot_compare_vars_in_MCMC(MCMC_results_1 = receptor_1_MCMC_results_main_outcome$MCMC_main_outcomes_analysis, MCMC_results_2 = receptor_2_MCMC_results_main_outcome$MCMC_main_outcomes_analysis, receptor_1 = "Androgen", receptor_2 = "Estrogen", top = 20, iters = niter) MCMC_var_use_comparisons$top_receptor_1_agst_2 MCMC_var_use_comparisons$top_receptor_2_agst_1
subgroup_plots <- plot_compare_vars_subgroups(receptor_1_MCMC_results_subgroups) subgroup_plots$`1` subgroup_plots$`2` subgroup_plots$`3` subgroup_plots$`4`
receptor_1_MCMC_couples_triples <- extract_MCMC_couples_triples(receptor_1_MCMC_results_main_outcome$MCMC_main_outcomes_analysis, receptor = "Androgen", couple_thresh = 0.6, triple_thresh = 0.5) receptor_2_MCMC_couples_triples <- extract_MCMC_couples_triples(receptor_2_MCMC_results_main_outcome$MCMC_main_outcomes_analysis, receptor = "Estrogen", couple_thresh = 0.6, triple_thresh = 0.5) recepter1_receptor2_comparison_couples <- rbind(receptor_1_MCMC_couples_triples$MCMC_couples, receptor_2_MCMC_couples_triples$MCMC_couples) receptor2_receptor1_comparison_triples <- rbind(receptor_1_MCMC_couples_triples$MCMC_triples, receptor_2_MCMC_couples_triples$MCMC_triples)
androgen_subgroups_couples_triples <- extract_MCMC_couples_triples_subgroups(MCMC_results = receptor_1_MCMC_results_subgroups, couple_thresh = 0.7, triple_thresh = 0.5) estrogen_subgroups_couples_triples <- extract_MCMC_couples_triples_subgroups(MCMC_results = receptor_2_MCMC_results_subgroups, couple_thresh = 0.7, triple_thresh = 0.5)
mcmc_pop_tree_main <-create_popular_tree_var(receptor_1_logic_MCMC_results$main_pop_mcmc_tree_results, binary_data = receptor_1_data_wPCs_kgrps, filter_type = "Pubchem") receptor_1_data_wPCs_kgrps$mcmc_main_tree <- mcmc_pop_tree_main$pop_tree_var$mcmc_pop_tree_var mcmc_pop_tree_subgroups <- purrr::map(.x = receptor_1_logic_MCMC_results$subgroup_pop_mcmc_tree_results, .f = create_popular_tree_var, binary_data = receptor_1_data_wPCs_kgrps, filter_type = "Pubchem") trees <-lapply(mcmc_pop_tree_subgroups, function (x) x[c('pop_tree_var')]) for (i in 1:length(trees)) { tree_name <- as.character(paste("mcmc_pop_tree", i, sep = "")) receptor_1_data_wPCs_kgrps[,tree_name] <- trees[[i]]$pop_tree_var$mcmc_pop_tree_var } receptor_1_data_wPCs_kgrps$mcmc_grove <- ifelse( receptor_1_data_wPCs_kgrps$mcmc_pop_tree3 == 1 | receptor_1_data_wPCs_kgrps$mcmc_pop_tree4 == 1 | receptor_1_data_wPCs_kgrps$mcmc_main_tree == 1, 1, 0) mcmc_grove_results <- caret::confusionMatrix(table(receptor_1_data_wPCs_kgrps$mcmc_grove, receptor_1_data_wPCs_kgrps$Label_names)) mcmc_main_results <- caret::confusionMatrix(table(receptor_1_data_wPCs_kgrps$mcmc_main_tree, receptor_1_data_wPCs_kgrps$Label_names))
main_mol_boolean_itx_results <- purrr::map(.x = receptor_1_MCMC_results_main_outcome, .f=create_molecular_tree_var, MCMC_result, couple_thresh = 0.5, triple_thresh = 0.5, top_MCMC_vars_thresh = 0.5, binary_data = receptor_1_data_wPCs_kgrps, make_logic_tree = TRUE, expand_boolean_grid = FALSE, filter_type = "Pubchem", fingerprints = fingerprints, outcome_label = quo(Label_names), iters = 10, subgroup = FALSE, mcmc = FALSE) subgroup_mol_boolean_itx_results <- purrr::pmap(list(receptor_1_MCMC_results_subgroups, receptor_1_logic_MCMC_results$subgroup_outcomes, receptor_1_logic_MCMC_results$subgroup_exposures, receptor_1_logic_MCMC_results$subgroup_names, group = c(1:length(receptor_1_logic_MCMC_results$subgroup_outcomes))), .f=create_molecular_tree_var, couple_thresh = 0.1, triple_thresh = 0.1, top_MCMC_vars_thresh = 0.1, binary_data = receptor_1_data_wPCs_kgrps, make_logic_tree = TRUE, expand_boolean_grid = FALSE, filter_type = "Pubchem", fingerprints = fingerprints, outcome_label = quo(log_SVM_group), iters = 10, n_burn_in = 1000, subgroup = TRUE, mcmc = FALSE)
Setting up Python environment in order to calculate molecular similarity scores and physical property scores
library(reticulate) conda_list() use_condaenv(condaenv = 'my-rdkit-env', required = TRUE)
rdkit needs to be installed in a virtual environment, we then use the use_condaenv to use python/anaconda in this environment which has access to the rdkit tools:
```{python my-rdkit-env} import os import pandas as pd import numpy as np from rdkit import Chem, DataStructs from rdkit.Chem import Descriptors,Crippen
rdkit_data = pd.read_csv("/Users/davidmccoy/Documents/PhD/ChemLogic/inst/extdata/AR_with_outcome.csv", engine='python')
rdkit_data.head()
```{python rdkit} from rdkit.Chem import MACCSkeys from rdkit.Chem import rdMolDescriptors from rdkit import Chem, DataStructs from rdkit.Chem import Descriptors,Crippen target_ligand = Chem.MolFromSmiles('CC12CCC3C(C1CCC2O)CCC4=CC(=O)CCC34C') ## canonical smiles for Testosterone target_ligand = Chem.RDKFingerprint(target_ligand) for i in rdkit_data.index: mol=Chem.MolFromSmiles(rdkit_data.loc[i,'SMILES']) print(rdkit_data.loc[i,'SMILES']) print(i) rdkit_data.loc[i,'mol']=mol rdkit_data.loc[i,'ExactMolWt']=Descriptors.ExactMolWt(mol) rdkit_data.loc[i,'FractionCSP3']=Descriptors.FractionCSP3(mol) rdkit_data.loc[i,'HeavyAtomCount']=Descriptors.HeavyAtomCount(mol) rdkit_data.loc[i,'LabuteASA']=Descriptors.LabuteASA(mol) rdkit_data.loc[i,'MolLogP']=Descriptors.MolLogP(mol) rdkit_data.loc[i,'MolWt']=Descriptors.MolWt(mol) rdkit_data.loc[i,'NHOHCount']=Descriptors.NHOHCount(mol) rdkit_data.loc[i,'NOCount']=Descriptors.NOCount(mol) rdkit_data.loc[i,'NumAliphaticCarbocycles']=Descriptors.NumAliphaticCarbocycles(mol) rdkit_data.loc[i,'NumAliphaticHeterocycles']=Descriptors.NumAliphaticHeterocycles(mol) rdkit_data.loc[i,'NumAliphaticRings']=Descriptors.NumAliphaticRings(mol) rdkit_data.loc[i,'NumAromaticCarbocycles']=Descriptors.NumAromaticCarbocycles(mol) rdkit_data.loc[i,'NumAromaticHeterocycles']=Descriptors.NumAromaticHeterocycles(mol) rdkit_data.loc[i,'NumAromaticRings']=Descriptors.NumAromaticRings(mol) rdkit_data.loc[i,'NumHAcceptors']=Descriptors.NumHAcceptors(mol) rdkit_data.loc[i,'NumHDonors']=Descriptors.NumHDonors(mol) rdkit_data.loc[i,'NumRotatableBonds']=Descriptors.NumRotatableBonds(mol) rdkit_data.loc[i,'NumSaturatedCarbocycles']=Descriptors.NumSaturatedCarbocycles(mol) rdkit_data.loc[i,'NumSaturatedHeterocycles']=Descriptors.NumSaturatedHeterocycles(mol) rdkit_data.loc[i,'NumSaturatedRings']=Descriptors.NumSaturatedRings(mol) rdkit_data.loc[i,'RingCount']=Descriptors.RingCount(mol) rdkit_data.loc[i,'TPSA']=Descriptors.TPSA(mol) rdkit_data.loc[i,'NumAmideBonds']=rdMolDescriptors.CalcNumAmideBonds(mol) rdkit_data.loc[i,'NumBridgeheadAtoms']=rdMolDescriptors.CalcNumBridgeheadAtoms(mol) rdkit_data.loc[i,'NumSpiroAtom']=rdMolDescriptors.CalcNumSpiroAtoms(mol) rdkit_data.loc[i,'Tanimoto']=DataStructs.FingerprintSimilarity(Chem.RDKFingerprint(mol),target_ligand)
physical_data <- py$rdkit_data
subgroup_mol_boolean_itx_results <- subgroup_mol_boolean_itx_results[-1] tables <- make_groves(subgroup_tree_formula_results = subgroup_mol_boolean_itx_results, main_tree_formula_results = main_mol_boolean_itx_results, input_data = receptor_1_data_wPCs_kgrps, filter_type = "Pubchem", outcome_label = quo(Label_names)) updated_data <- tables$updated_data tables <- c(tables$main, tables$subgroup, tables$subgroup_main) matt_coef <- function(conf_matrix) { TP <- conf_matrix$table[1,1] TN <- conf_matrix$table[2,2] FP <- conf_matrix$table[1,2] FN <- conf_matrix$table[2,1] mcc_num <- (TP*TN - FP*FN) mcc_den <- as.double((TP+FP))*as.double((TP+FN))*as.double((TN+FP))*as.double((TN+FN)) mcc_final <- mcc_num/sqrt(mcc_den) return(mcc_final) } purrr::map(.x = tables, .f = matt_coef)
tree_var_w_names <- updated_data %>% select(Name, contains("mol_tree"), Label_names, subgroup_tree_w_main) full_data <- merge(physical_data, tree_var_w_names, by.x = "Name", by.y = "Name") AR_data_PCs_grps_tree_var_cln <- full_data %>% select(-c(Name, SMILES, Label, mol, ASSAY_OUTCOME, `Unnamed: 0`)) AR_data_PCs_grps_tree_var_cln$Label_names <- as.factor(AR_data_PCs_grps_tree_var_cln$Label_names) mol_ens <- pre(Label_names ~ ., data = AR_data_PCs_grps_tree_var_cln, family = binomial) mol.cv <- cvpre(mol_ens) predictions <- mol.cv$cvpreds predictions_binary <- ifelse(predictions > 0.5, 1,0) prediction_cm <- table(predictions_binary, AR_data_PCs_grps_tree_var_cln$Label_names) caret::confusionMatrix(prediction_cm) plot(mol_ens, nterms = 5, cex = .5) coefs <- coef(mol_ens) imps <- importance(mol_ens, round = 4) expl <- explain(mol_ens, newdata =AR_data_PCs_grps_tree_var_cln, cex = .8)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.