This notebook analyzes feature distributions. We categorize features as being multimodal, skewed but unimodel, or symmetric and unimodal. A statistic (Hartigan's dip test statistic) is used to check for multimodality. Unimodal features with skewness > 2 are categorized as "skewed". All other features are categorized as symmetric and unimodal. This is reported in a table in the notebook and also available as a CSV file. Additionally, we plot the histogram of each feature, saved as a PNG. Correlations between features are also reported.
knitr::opts_chunk$set(echo = TRUE)
library(cytominergallery) library(dplyr) library(ggplot2) library(magrittr) library(outliers) library(purrr) library(readr) library(stringr) library(tidyr) library(tibble) library(reshape2) library(rbenchmark) library(robustbase)
set.seed(42)
```{sh eval=FALSE} mkdir -p ~/Downloads/BBBC022_workspace
cd ~/Downloads/BBBC022_workspace
wget https://s3.amazonaws.com/imaging-platform-collaborator/2016_09_09_cytominer_workshop/backend_BBBC022_20646.tar.gz
wget https://s3.amazonaws.com/imaging-platform-collaborator/2016_09_09_cytominer_workshop/metadata_BBBC022.tar.gz
tar xvzf backend_BBBC022_20646.tar.gz
tar xvzf metadata_BBBC022.tar.gz
## Load database backend ```r workspace_dir <- file.path(Sys.getenv("HOME"), "Downloads", "BBBC022_workspace") batch_id <- "BBBC022_2013" plate_id <- "20646" plate_backend <- file.path(workspace_dir, paste0("backend/", batch_id, "/", plate_id, "/", plate_id,".sqlite")) db <- src_sqlite(path = plate_backend)
barcode_platemap <- suppressMessages(read_csv(file.path(workspace_dir, paste0("metadata/", batch_id, "/barcode_platemap.csv")))) metadata <- paste0( file.path(workspace_dir, paste0("metadata/", batch_id,"/platemap/")), barcode_platemap$Plate_Map_Name %>% unique(), ".txt" ) %>% map_df(function(x) suppressMessages(read_tsv(x))) %>% rename(Plate_Map_Name = plate_map_name) %>% inner_join(barcode_platemap, by = c("Plate_Map_Name")) %>% mutate(Plate = Assay_Plate_Barcode, Well = well_position) %>% mutate(broad_sample = ifelse(is.na(broad_sample), "DMSO", broad_sample)) names(metadata) %<>% str_c("Metadata", ., sep = "_") if (db_has_table(db$con, table = "metadata")) { db$con %>% db_drop_table(table = "metadata") } metadata <- dplyr::copy_to(db, metadata)
frac_cells_per_image <- .8 images_per_well <- 6 # sample images from DMSO wells sampled_images <- metadata %>% filter(Metadata_broad_sample == "DMSO") %>% inner_join(tbl(db, "Image"), by = c("Metadata_Plate" = "Image_Metadata_Plate", "Metadata_Well" = "Image_Metadata_Well")) %>% select(matches("Metadata_|TableNumber|ImageNumber")) %>% collect() %>% group_by(Metadata_Plate, Metadata_Well) %>% sample_n(images_per_well) %>% ungroup() if (db_has_table(db$con, table = "sampled_images")) { db$con %>% db_drop_table(table = "sampled_images") } sampled_images <- dplyr::copy_to(db, sampled_images) # sample cells from the sampled images sampled_objects <- sampled_images %>% inner_join( tbl(db, "Cells") %>% select(TableNumber, ImageNumber, ObjectNumber), by = c("TableNumber", "ImageNumber")) %>% collect() %>% group_by(TableNumber, ImageNumber) %>% sample_frac(frac_cells_per_image) %>% ungroup() if (db_has_table(db$con, table = "sampled_objects")) { db$con %>% db_drop_table(table = "sampled_objects") } sampled_objects <- dplyr::copy_to(db, sampled_objects) sampled_objects %<>% inner_join(tbl(db, "Cells"), by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>% inner_join(tbl(db, "Cytoplasm"), by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>% inner_join(tbl(db, "Nuclei"), by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>% collect(n = Inf)
remove_outliers <- function(x) { upper_end_outlier <- outliers::scores(x, type = "iqr", lim = 1.5) lower_end_outlier <- outliers::scores(-x, type = "iqr", lim = 1.5) x[which(!(upper_end_outlier | lower_end_outlier))] } robust_dip_test <- function(x) { diptest::dip.test(remove_outliers(x))[["p.value"]] } diptest_p_values <- sampled_objects %>% select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>% map(robust_dip_test) %>% as_data_frame() %>% gather(feature, diptest_p_value) diptest_p_values$diptest_p_value_adjusted <- p.adjust(diptest_p_values$diptest_p_value, method = "hochberg")
skewnesses <- sampled_objects %>% select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>% map(robustbase::mc) %>% as_data_frame() %>% gather(feature, skewness)
feature_ranges <- sampled_objects %>% select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>% gather(feature, value) %>% group_by(feature) %>% summarise_at(vars(value), funs(min = min(., na.rm = TRUE), q01 = quantile(., probs = 0.01, na.rm = TRUE), q99 = quantile(., probs = 0.99, na.rm = TRUE), max = max(., na.rm = TRUE)))
feature_outlier_ranges <- sampled_objects %>% select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>% gather(feature, value) %>% group_by(feature) %>% summarise_at(vars(value), funs(lower_outlier_thr = quantile(., 0.25, na.rm = TRUE) - 1.5 * IQR(., na.rm = TRUE), upper_outlier_thr = quantile(., 0.75, na.rm = TRUE) + 1.5 * IQR(., na.rm = TRUE)))
features_statistics <- inner_join(diptest_p_values, skewnesses, by = c("feature")) %>% inner_join(feature_ranges, by = c("feature")) %>% inner_join(feature_outlier_ranges, by = c("feature")) %>% mutate(is_multimodal = diptest_p_value < 0.1) %>% mutate(is_skewed = abs(skewness) > .5) %>% mutate(is_skewed = ifelse(is_multimodal, NA, is_skewed)) # Feature skewness summary features_statistics %>% group_by(is_skewed) %>% tally() %>% print(digits = 2) # Feature multimodality summary features_statistics %>% group_by(is_multimodal) %>% tally() %>% print(digits = 2) # Feature multimodality table features_statistics %>% mutate(neg_log_p_value_dip = -log(diptest_p_value, base = 10)) %>% filter(diptest_p_value < 0.10) %>% select(feature, neg_log_p_value_dip) %>% arrange(desc(neg_log_p_value_dip)) %>% print(digits = 3) # Feature skewness table features_statistics %>% select(feature, skewness) %>% na.omit() %>% arrange(desc(abs(skewness))) %>% print(digits = 3) # Feature range table features_statistics %>% select(feature, min, q01, q99, max) %>% print(digits = 3) # Feature outlier threshold table feature_outlier_ranges %>% select(feature, lower_outlier_thr, upper_outlier_thr) %>% print(digits = 3) features_statistics %>% write_csv("feature_statistics.csv") feature_outlier_ranges %>% write_csv("feature_outlier_ranges.csv")
if(!dir.exists("feature_densities")) { dir.create("feature_densities") } feature_names <- names(sampled_objects) %>% str_subset("Cells_|Cytoplasm_|Nuclei_") for (feature_name in feature_names) { g <- ggplot(sampled_objects, aes_string(feature_name)) + stat_density(adjust = 0.8, alpha = 0.5) ggsave(plot = g, filename = sprintf("feature_densities/%s.png", feature_name), width = 5, height = 5) }
splits <- 2 cores <- 4 # how many top correlated feature to each given feature is shown n_top_cor <- 5 correlations <- sampled_objects %>% select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>% cytostats::parallel_cor(splits = splits, cores = cores, cov_fun = "two_pass_multi_covar") %>% as.data.frame %>% rownames_to_column("feature_1") %>% gather(feature_2, cor, -feature_1) %>% filter(feature_1 != feature_2) %>% na.omit %>% arrange(desc(abs(cor))) top_correlations_per_feature <- correlations %>% group_by(feature_1) %>% slice(1:n_top_cor) %>% ungroup # Correlations between features top_correlations_per_feature %>% print(digits = 3) correlations %>% write_csv("correlations.csv")
cor
and parallel_cor
splits <- 4 cores <- 4 sample_fraction <- 1/8 rbenchmark::benchmark( sampled_objects %>% select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>% sample_frac(sample_fraction) %>% cytostats::parallel_cor(splits = splits, cores = cores, cov_fun = "two_pass_multi_covar"), replications = 3) %>% select(-test) %>% gather(key, value) # for `sample_fraction = 1`, `splits = 32`, `cores = 32` # on an `m4.16xlarge` AWS instance: #> replications elapsed relative user.self sys.self user.child sys.child # 3 7.356 1 3.768 1.996 70.176 6.253 rbenchmark::benchmark( sampled_objects %>% sample_frac(sample_fraction) %>% select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>% cor, replications = 3 ) %>% select(-test) %>% gather(key, value) # for `sample_fraction = 1`, `splits = 32`, `cores = 32` # on an `m4.16xlarge` AWS instance: #> replications elapsed relative user.self sys.self user.child sys.child #> 3 47.54 1 47.35 0.175 0 0
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.