In cytomining/cytominergallery: A Gallery of Cytominer Vignettes

This notebook analyzes feature distributions. We categorize features as being multimodal, skewed but unimodel, or symmetric and unimodal. A statistic (Hartigan's dip test statistic) is used to check for multimodality. Unimodal features with skewness > 2 are categorized as "skewed". All other features are categorized as symmetric and unimodal. This is reported in a table in the notebook and also available as a CSV file. Additionally, we plot the histogram of each feature, saved as a PNG. Correlations between features are also reported.

knitr::opts_chunk$set(echo = TRUE)

library(cytominergallery)
library(dplyr)
library(ggplot2)
library(magrittr)
library(outliers)
library(purrr)
library(readr)
library(stringr)
library(tidyr)
library(tibble)
library(reshape2)
library(rbenchmark)
library(robustbase)

set.seed(42)

Download files

```{sh eval=FALSE} mkdir -p ~/Downloads/BBBC022_workspace

cd ~/Downloads/BBBC022_workspace

wget https://s3.amazonaws.com/imaging-platform-collaborator/2016_09_09_cytominer_workshop/backend_BBBC022_20646.tar.gz

wget https://s3.amazonaws.com/imaging-platform-collaborator/2016_09_09_cytominer_workshop/metadata_BBBC022.tar.gz

tar xvzf backend_BBBC022_20646.tar.gz

tar xvzf metadata_BBBC022.tar.gz

## Load database backend

```r

workspace_dir <- file.path(Sys.getenv("HOME"), "Downloads", "BBBC022_workspace")

batch_id <- "BBBC022_2013"

plate_id <- "20646"

plate_backend <-
  file.path(workspace_dir,
            paste0("backend/", batch_id, "/", plate_id, "/", plate_id,".sqlite"))

db <- src_sqlite(path = plate_backend)

Load metadata

barcode_platemap <-
  suppressMessages(read_csv(file.path(workspace_dir, paste0("metadata/", batch_id, "/barcode_platemap.csv"))))

metadata <-
paste0(
  file.path(workspace_dir, paste0("metadata/", batch_id,"/platemap/")),
  barcode_platemap$Plate_Map_Name %>% unique(),
  ".txt"
  ) %>%
  map_df(function(x) suppressMessages(read_tsv(x))) %>%
  rename(Plate_Map_Name = plate_map_name) %>%
  inner_join(barcode_platemap, by = c("Plate_Map_Name")) %>%
  mutate(Plate = Assay_Plate_Barcode,
         Well = well_position) %>%
  mutate(broad_sample = ifelse(is.na(broad_sample), "DMSO", broad_sample))

names(metadata) %<>% str_c("Metadata", ., sep = "_")

if (db_has_table(db$con, table = "metadata")) {
  db$con %>% db_drop_table(table = "metadata")
}
metadata <- dplyr::copy_to(db, metadata)

Sample cells from DMSO wells

frac_cells_per_image <- .8

images_per_well <- 6

# sample images from DMSO wells
sampled_images <-
  metadata %>%
  filter(Metadata_broad_sample == "DMSO") %>%
  inner_join(tbl(db, "Image"), by = c("Metadata_Plate" = "Image_Metadata_Plate",
                         "Metadata_Well" = "Image_Metadata_Well")) %>%
  select(matches("Metadata_|TableNumber|ImageNumber")) %>%
  collect() %>%
  group_by(Metadata_Plate, Metadata_Well) %>%
  sample_n(images_per_well) %>%
  ungroup()

if (db_has_table(db$con, table = "sampled_images")) {
  db$con %>% db_drop_table(table = "sampled_images")
}

sampled_images <- dplyr::copy_to(db, sampled_images)

# sample cells from the sampled images
sampled_objects <-
  sampled_images %>%
  inner_join(
    tbl(db, "Cells") %>% select(TableNumber, ImageNumber, ObjectNumber),
    by = c("TableNumber", "ImageNumber")) %>%
  collect() %>%
  group_by(TableNumber, ImageNumber) %>%
  sample_frac(frac_cells_per_image) %>%
  ungroup()

if (db_has_table(db$con, table = "sampled_objects")) {
  db$con %>% db_drop_table(table = "sampled_objects")
}

sampled_objects <- dplyr::copy_to(db, sampled_objects)

sampled_objects %<>%
  inner_join(tbl(db, "Cells"), by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>%
  inner_join(tbl(db, "Cytoplasm"), by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>%
  inner_join(tbl(db, "Nuclei"), by = c("TableNumber", "ImageNumber", "ObjectNumber")) %>%
  collect(n = Inf)

Peform Hartigan's dip test for multimodality

remove_outliers <- function(x) {
  upper_end_outlier <- outliers::scores(x, type = "iqr", lim = 1.5)
  lower_end_outlier <- outliers::scores(-x, type = "iqr", lim = 1.5)
  x[which(!(upper_end_outlier | lower_end_outlier))]
}

robust_dip_test <- function(x) {
  diptest::dip.test(remove_outliers(x))[["p.value"]]
}

diptest_p_values <-
  sampled_objects %>%
  select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>%
  map(robust_dip_test) %>%
  as_data_frame() %>%
  gather(feature, diptest_p_value)

diptest_p_values$diptest_p_value_adjusted <-
  p.adjust(diptest_p_values$diptest_p_value, method = "hochberg")

Compute skewness

skewnesses <-
  sampled_objects %>%
  select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>%
  map(robustbase::mc) %>%
  as_data_frame() %>%
  gather(feature, skewness)

Compute feature ranges

feature_ranges <-
  sampled_objects %>%
  select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>%
  gather(feature, value) %>%
  group_by(feature) %>%
  summarise_at(vars(value),
               funs(min = min(., na.rm = TRUE),
                    q01 = quantile(., probs = 0.01, na.rm = TRUE),
                    q99 = quantile(., probs = 0.99, na.rm = TRUE),
                    max = max(., na.rm = TRUE)))

Find thresholds for outlier detection

feature_outlier_ranges <-
  sampled_objects %>%
  select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>%
  gather(feature, value) %>%
  group_by(feature) %>%
  summarise_at(vars(value),
               funs(lower_outlier_thr = quantile(., 0.25, na.rm = TRUE) - 1.5 * IQR(., na.rm = TRUE),
                    upper_outlier_thr = quantile(., 0.75, na.rm = TRUE) + 1.5 * IQR(., na.rm = TRUE)))

Collect, summarize and save statistics

features_statistics <-
  inner_join(diptest_p_values, skewnesses, by = c("feature")) %>%
  inner_join(feature_ranges, by = c("feature")) %>%
  inner_join(feature_outlier_ranges, by = c("feature")) %>%
  mutate(is_multimodal = diptest_p_value < 0.1) %>%
  mutate(is_skewed = abs(skewness) > .5) %>%
  mutate(is_skewed = ifelse(is_multimodal, NA, is_skewed))

# Feature skewness summary
features_statistics %>%
  group_by(is_skewed) %>%
  tally() %>%
  print(digits = 2)

# Feature multimodality summary
features_statistics %>%
  group_by(is_multimodal) %>%
  tally() %>%
  print(digits = 2)

# Feature multimodality table
features_statistics %>%
  mutate(neg_log_p_value_dip = -log(diptest_p_value, base = 10)) %>%
  filter(diptest_p_value < 0.10) %>%
  select(feature, neg_log_p_value_dip) %>%
  arrange(desc(neg_log_p_value_dip)) %>%
  print(digits = 3)

# Feature skewness table
features_statistics %>%
  select(feature, skewness) %>%
  na.omit() %>%
  arrange(desc(abs(skewness))) %>%
  print(digits = 3)

# Feature range table
features_statistics %>%
  select(feature, min, q01, q99, max) %>%
  print(digits = 3)

# Feature outlier threshold table
feature_outlier_ranges %>%
  select(feature, lower_outlier_thr, upper_outlier_thr) %>%
  print(digits = 3)

features_statistics %>% write_csv("feature_statistics.csv")

feature_outlier_ranges %>% write_csv("feature_outlier_ranges.csv")

Plot densities of features

if(!dir.exists("feature_densities")) {
  dir.create("feature_densities")
}

feature_names <- names(sampled_objects) %>%
  str_subset("Cells_|Cytoplasm_|Nuclei_")

for (feature_name in feature_names) {
  g <- ggplot(sampled_objects, aes_string(feature_name)) +
    stat_density(adjust = 0.8, alpha = 0.5)

  ggsave(plot = g,
         filename = sprintf("feature_densities/%s.png", feature_name),
         width = 5, height = 5)
}

Report the correlations using a distributed version of the correlation function

splits <- 2
cores <- 4
# how many top correlated feature to each given feature is shown
n_top_cor <- 5

correlations <-
  sampled_objects %>%
  select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>%
  cytostats::parallel_cor(splits = splits,
                          cores = cores,
                          cov_fun = "two_pass_multi_covar") %>%
  as.data.frame %>%
  rownames_to_column("feature_1") %>%
  gather(feature_2, cor, -feature_1) %>%
  filter(feature_1 != feature_2) %>%
  na.omit %>%
  arrange(desc(abs(cor)))

top_correlations_per_feature <-
  correlations %>%
  group_by(feature_1) %>%
  slice(1:n_top_cor) %>%
  ungroup

# Correlations between features
top_correlations_per_feature %>%
  print(digits = 3)

correlations %>% write_csv("correlations.csv")

Benchmark `cor` and `parallel_cor`

splits <- 4
cores <- 4
sample_fraction <- 1/8

rbenchmark::benchmark(
  sampled_objects %>%
    select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>%
    sample_frac(sample_fraction) %>%
    cytostats::parallel_cor(splits = splits,
                            cores = cores,
                            cov_fun = "two_pass_multi_covar"), replications = 3) %>%
  select(-test) %>%
  gather(key, value)

# for `sample_fraction = 1`, `splits = 32`, `cores = 32`
# on an `m4.16xlarge` AWS instance:
#>  replications elapsed relative user.self sys.self user.child sys.child
#            3   7.356        1     3.768    1.996     70.176     6.253


rbenchmark::benchmark(
  sampled_objects %>%
    sample_frac(sample_fraction) %>%
    select(matches("^Cells_|^Cytoplasm_|^Nuclei_")) %>%
    cor,
  replications = 3
  ) %>%
  select(-test) %>%
  gather(key, value)

# for `sample_fraction = 1`, `splits = 32`, `cores = 32`
# on an `m4.16xlarge` AWS instance:
#>  replications elapsed relative user.self sys.self user.child sys.child
#>            3   47.54        1     47.35    0.175          0         0