When developing an assay for chemical concentration–response screening, it is important and useful to understand the noise level of the assay under development. Inspecting the response variation in vehicle control data using distribution-based metrics such as the SD or MAD is a commonly used approach. It is often useful to identify a threshold above which a chemical (based on its concentration–response data) could be considered active. However, the SD and MAD metrics are based on a distribution, not on concentration–response data, and it is often hard to decide how many SDs or MADs are needed. Also, how outliers in the distribution are defined can greatly shift the SD value.
For toxicologists, this threshold is particularly important because it could represent the point of departure (potency). Previously, we provided an approach to identify this threshold based on bootstrap samples at the tested concentrations and Curvep (a noise-filtering program for concentration–response data). The identified threshold is the one at which the variance in potency estimation is substantially reduced.
The method has shown its usefulness in work with the NTP 80 compounds screened in the in vitro and alternative animal assays. However, questions remain, such as whether the bootstrap (n = 1000) on 80 compounds is really needed, since it takes time. Parallelization could be a solution, but it would also be useful to understand whether the number of bootstrap samples/compounds can be reduced. Reducing the number of chemicals could be particularly useful for researchers who are developing assays, since usually only a small number of compounds are used.
In one of the assays in the NTP 80 screen, additional 32 compounds were screened. The same threshold was identified using either the 32-compound screen or NTP 80 screen.
By controlling the two parameters (number of compounds and number of bootstrap samples), we investigate the stability of the threshold identification methods (i.e., using raw, non-fitted values, or using fitted values from exponential fitting with starting values from an lm fit or from SSasymp).
The stored bootstrap sample results (n = 1000) for the 32 compounds
library(here) library(Rcurvep) library(tibble) library(dplyr) library(tidyr) library(stringr) library(purrr) library(ggplot2)
# Load the stored bootstrap results (n = 1000) for the 32-compound screen.
zfishdev_boot <- readRDS(
  here("data-raw", "big_dataset", "biobide32_percent_boot_1000.rds")
)
bootd <- zfishdev_boot[["percent_affected_96"]]

# Make the old format match the new one: rename the threshold column and
# split the composite id "endpoint#chemical#directionality@direction",
# dropping the redundant directionality field.
bootd <- bootd %>%
  rename(threshold = thres) %>%
  separate(dduid, c("endpoint", "chemical", "direction"), sep = "#") %>%
  separate(direction, c("directionality", "direction"), sep = "@") %>%
  select(-directionality)

# Per-bootstrap activity records (POD etc.).
acts <- extract_curvep_data(bootd, "act")
sum_act <- extract_curvep_data(bootd, "summary", "INVERSE")

# Average hit confidence across bootstrap samples for each
# chemical/endpoint/direction combination.
sum_act_hit <- sum_act %>%
  group_by(chemical, endpoint, direction) %>%
  summarize(hit_confidence = mean(hit_confidence))

# Split into putatively active (> 0.5) and inactive (<= 0.5) compounds.
actived <- sum_act_hit %>% filter(hit_confidence > 0.5)
inactived <- sum_act_hit %>% filter(hit_confidence <= 0.5)

# Distribution of hit confidence across compounds.
p1 <- ggplot(sum_act_hit, aes(x = hit_confidence)) +
  geom_histogram()
p1
In total, regardless of the Curvep threshold used, there are `r nrow(actived)` active compounds out of `r nrow(sum_act_hit)` compounds. The inactive compound names are: `r inactived %>% pull(chemical)`.
#' Subsample the bootstrap results and re-identify the baseline-noise threshold.
#'
#' @param dd Bootstrap activity data (one row per chemical x repeat_id).
#' @param n_chemical Number of chemicals to sample (without replacement).
#' @param n_sample Number of bootstrap samples to keep.
#' @param n_total Total number of bootstrap samples available in `dd`
#'   (default 1000, matching the stored dataset; previously hard-coded).
#' @return A list with the threshold table (`thres_d`), the threshold
#'   summary (`thres_o`), and the sampled chemical names / sample ids.
subset_boot_data <- function(dd, n_chemical, n_sample, n_total = 1000) {
  # Keep all bootstrap samples unless a strict subset is requested.
  sel_sample_ids <- seq_len(n_total)
  if (n_sample != n_total) {
    sel_sample_ids <- sample(seq_len(n_total), n_sample)
  }

  chem_names <- dd %>% pull(chemical) %>% unique()
  sel_chem_names <- sample(chem_names, n_chemical)

  dd_p <- dd %>%
    filter(chemical %in% sel_chem_names) %>%
    filter(repeat_id %in% sel_sample_ids)

  # Inactive curves have no POD; substitute the highest tested concentration
  # so they contribute a right-censored potency value.
  acts <- dd_p %>%
    mutate(POD = ifelse(is.na(POD), conc_highest, POD))

  result <- acts %>% identify_basenoise_threshold(., plot = FALSE)

  list(
    thres_d = result[[1]],
    thres_o = result[[2]],
    sel_chem = sel_chem_names,
    sel_sample = sel_sample_ids
  )
}
#' Repeatedly subsample the bootstrap data and save the threshold results.
#'
#' @param n_chemical Number of chemicals per subsample.
#' @param n_sample Number of bootstrap samples per subsample.
#' @param times Number of independent subsampling repetitions.
#' @param original_data Full bootstrap activity data passed to
#'   subset_boot_data().
#' @param dir Output directory; results are saved as
#'   "<n_chemical>-<n_sample>-<times>.rds".
save_subset_result <- function(n_chemical, n_sample, times, original_data, dir) {
  # seq_len() is safe for times = 0; the dead inner assignment is removed
  # (map() already collects each iteration's value).
  result <- map(seq_len(times), function(z) {
    subset_boot_data(
      dd = original_data,
      n_chemical = n_chemical,
      n_sample = n_sample
    )
  })
  saveRDS(
    result,
    file.path(dir, paste0(n_chemical, "-", n_sample, "-", times, ".rds"))
  )
}
#' Re-run knee-point detection on stored permutation results and save a copy.
#'
#' Each element of the stored list is one permutation run: [[1]] holds the
#' threshold/variance table, [[2]] the knee-point summary, which is
#' recomputed here with the current cal_knee_point().
#'
#' @param filn File name of the stored .rds permutation result.
#' @param indir Directory containing `filn`.
#' @param outdir Directory where the updated object is written (same name).
update_perm_rds <- function(filn, indir, outdir) {
  d_l <- readRDS(file.path(indir, filn))

  # BUG FIX: the previous version assigned `d_l_c[[y]][[2]] <- ...` inside
  # map()'s anonymous function; `<-` there binds a *local* copy, so the
  # recomputed summaries were silently discarded and the unmodified object
  # was saved. Build the updated list functionally instead.
  d_l_c <- map(d_l, function(run) {
    new_comment <- run[[1]] %>%
      nest(-endpoint, -direction) %>%
      mutate(
        knee_out = purrr::map(
          data,
          function(x) cal_knee_point(x, "threshold", "pooled_variance")[[2]]
        )
      ) %>%
      select(-data) %>%
      unnest()
    run[[2]] <- new_comment
    run
  })

  # Keep the original file name; appending another ".rds" (as before) created
  # "*.rds.rds" files, breaking the "<n_chemical>-<n_perm>-<n_times>.rds"
  # parsing in read_perm_rds().
  saveRDS(d_l_c, file.path(outdir, filn))
}
#' Read one stored permutation result and tag it with its run parameters.
#'
#' The file name encodes the parameters as
#' "<n_chemical>-<n_perm>-<n_times>.rds".
#'
#' @param filn File name of the .rds permutation result.
#' @param dir Directory containing `filn`.
#' @return A tibble of knee-point summaries (element [[2]] of each
#'   permutation) with perm_id, n_chemical, and n_perm columns added.
read_perm_rds <- function(filn, dir) {
  # Anchor and escape the suffix: the previous pattern ".rds" let "." match
  # any character and replaced only the first occurrence anywhere.
  pars <- str_replace(filn, "\\.rds$", "") %>%
    str_split(., "-") %>%
    unlist() %>%
    as.numeric() %>%
    set_names(c("n_chemical", "n_perm", "n_times"))

  result <- readRDS(file.path(dir, filn)) %>%
    map_df(., ~ .x[[2]], .id = "perm_id") %>%
    mutate(
      n_chemical = pars[["n_chemical"]],
      n_perm = pars[["n_perm"]]
    )

  return(result)
}
#' Summarize permutation results into threshold-stability metrics.
#'
#' @param dd Combined knee-point summaries (from read_perm_rds()).
#' @return One row per (n_chemical, n_perm, endpoint, direction) with the
#'   fraction of flagged runs and the mean/SE of the identified thresholds.
calculate_metrics <- function(dd) {
  grp <- c("n_chemical", "n_perm", "endpoint", "direction")

  # Fractions of runs flagged by the knee-point diagnostics (all runs).
  permd_sum1 <- dd %>%
    group_by(n_chemical, n_perm, endpoint, direction) %>%
    summarise(
      fraction_check = sum(thresComment == "check") / n(),
      fraction_badfit = sum(cor_exp_fit < 0.95 | is.na(cor_exp_fit)) / n(),
      fraction_caution = sum(thresComment == "cautionary") / n()
    )

  # Threshold location and spread, excluding runs flagged as "check".
  permd_sum2 <- dd %>%
    filter(thresComment != "check") %>%
    group_by(n_chemical, n_perm, endpoint, direction) %>%
    summarize(
      mean_thresDist_raw = mean(thresDist_raw, na.rm = TRUE),
      se_thresDist_raw = sd(thresDist_raw, na.rm = TRUE) / sqrt(n()),
      mean_thresDist_exp = mean(thresDist_exp, na.rm = TRUE),
      se_thresDist_exp = sd(thresDist_exp, na.rm = TRUE) / sqrt(n())
    )

  # Explicit join keys: the default by = NULL natural join could silently
  # pick up unexpected shared columns.
  permd_sum <- inner_join(permd_sum1, permd_sum2, by = grp)
  return(permd_sum)
}
# Reproducible subsampling: every combination of chemical count (5..30 by 5)
# and bootstrap-sample count (10, 100, 1000), 1000 repetitions each.
set.seed(19800925)

paras <- expand.grid(seq(5, 30, by = 5), c(10, 100, 1000)) %>%
  set_names(c("n_chemical", "n_sample"))

system.time(
  pmap(
    paras,
    save_subset_result,
    times = 1000,
    dir = here("data-raw", "big_dataset", "perm_percent"),
    original_data = acts
  )
)
# Recompute the knee-point summaries of the v0.5.1 results and write the
# updated objects to the main perm_percent directory. This is pure side
# effect, so iterate with walk() rather than map_df() (which would try to
# row-bind the NULL returned by saveRDS()).
permd1 <- list.files(
  here("data-raw", "big_dataset", "perm_percent", "v0.5.1"),
  pattern = "rds"
)
walk(permd1, function(x) {
  update_perm_rds(
    filn = x,
    indir = here("data-raw", "big_dataset", "perm_percent", "v0.5.1"),
    outdir = here("data-raw", "big_dataset", "perm_percent")
  )
})
# Load the knee-point summaries for both fitting strategies:
# SSasymp-start results live in perm_percent/, lm-start results in v0.5/.
permd1 <- list.files(here("data-raw", "big_dataset", "perm_percent"), pattern = "rds")
permd1 <- map_df(
  permd1,
  ~ read_perm_rds(filn = .x, dir = here("data-raw", "big_dataset", "perm_percent"))
)

permd2 <- list.files(here("data-raw", "big_dataset", "perm_percent", "v0.5"), pattern = "rds")
permd2 <- map_df(
  permd2,
  ~ read_perm_rds(filn = .x, dir = here("data-raw", "big_dataset", "perm_percent", "v0.5"))
)
# Label each result set by the exponential-fit starting-value strategy,
# stack them, and add "ver" to the existing grouping.
permd_sum1 <- calculate_metrics(permd1) %>% mutate(ver = "SSasymp")
permd_sum2 <- calculate_metrics(permd2) %>% mutate(ver = "lmstart")

permd_sum <- bind_rows(permd_sum1, permd_sum2) %>%
  group_by(ver, add = TRUE)
# Fraction of runs flagged "check" vs. number of chemicals, colored by the
# number of bootstrap samples, one panel per fitting strategy.
p1 <- ggplot(
  permd_sum,
  aes(x = n_chemical, y = fraction_check, color = as.factor(n_perm))
) +
  geom_point() +
  geom_line() +
  facet_grid(ver ~ ., scales = "free")
p1
# Same layout as p1, but for the fraction of runs flagged "cautionary".
p2 <- ggplot(
  permd_sum,
  aes(x = n_chemical, y = fraction_caution, color = as.factor(n_perm))
) +
  geom_point() +
  geom_line() +
  facet_grid(ver ~ ., scales = "free")
p2
# Reshape to long format so the raw-value and exponential-fit thresholds can
# be faceted together, then attach the matching standard errors by joining
# on the shared grouping columns plus thres_type.
mean_long <- permd_sum %>%
  select(mean_thresDist_raw, mean_thresDist_exp) %>%
  gather(thres_type, mean_thres, mean_thresDist_raw, mean_thresDist_exp) %>%
  mutate(thres_type = str_replace(thres_type, "mean_", ""))

se_long <- permd_sum %>%
  select(se_thresDist_raw, se_thresDist_exp) %>%
  gather(thres_type, se_thres, se_thresDist_raw, se_thresDist_exp) %>%
  mutate(thres_type = str_replace(thres_type, "se_", ""))

permd_sum_thres <- inner_join(mean_long, se_long)
# Mean identified threshold (+/- SE) vs. number of chemicals; the horizontal
# line at 25 marks the threshold identified from the full data set.
p3 <- ggplot(
  permd_sum_thres,
  aes(x = n_chemical, y = mean_thres, color = as.factor(n_perm))
) +
  geom_point() +
  geom_line() +
  facet_grid(thres_type ~ ver, scales = "free") +
  geom_hline(yintercept = 25) +
  scale_y_continuous(breaks = seq(18, 36, by = 2)) +
  scale_x_continuous(breaks = seq(5, 80, by = 5)) +
  geom_errorbar(
    aes(ymin = mean_thres - se_thres, ymax = mean_thres + se_thres),
    width = 0.2
  )
p3
# Rank the n_perm = 100 runs by how often the knee point was flagged "check".
permd_sum %>%
  filter(n_perm == 100) %>%
  arrange(fraction_check)
# Rank the n_perm = 100 runs by the fraction of poor exponential fits.
permd_sum %>%
  filter(n_perm == 100) %>%
  arrange(fraction_badfit)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.