In alex-kalinka-cruk/perept: Performance estimation using repeated tests.

Setup

options(warn=-1)
suppressMessages(library(knitr))
suppressMessages(library(tidyr))
suppressMessages(library(magrittr))
suppressMessages(library(ggplot2))
suppressMessages(library(dplyr))
suppressMessages(library(moments))

data_root <- "~/data/az_cruk"

# Pan-cancer (Sanger), Bagel, and ribosomal proteins.
load("../data/ess_genes.rda")

Overview

The overall goal is to develop a set of standard repeated-tests CRISPR datasets that can be used in conjunction with the Maximum Likelihood estimates of false-positive and false-negative rates (implemented in the R package perept) to compare the performance of algorithm changes for the AZ-CRUK CRISPR analysis pipeline.

Three data sources will be used:

HT29 essentiality screens performed at Sanger.
EGFR treatment-control screens performed at AZ.
Veneto... treatment-control screen.

The HT29 data is not ideal as they are essentiality screens and the effect sizes will be large and not so relevant to drug treatment screens. However, there are a lot of replicates potentially providing power for discriminating the performance of different algorithms.

While the drug screens (2 and 3) are ideal screen types, they have very few replicates and will require sub-sampling of sgRNAs to create pseudo-replicates (respecting screen and sample origins). This is not ideal as it creates dependencies between the tests.

HT29 essentiality screens provided by Francesco Iorio (Sanger)

Approach

The data will be broken up into pairs of replicates since this is the structure of the data that will be used in the in-house CRISPR screens.
Replicate pairs will always be taken from the same screen to respect the original structure of the data.
There will be no overlaps between replicate pairs except in terms of the two plasmid count samples.

Caveats

Only two sets of plasmid counts (named ERS717283.plasmid and CRISPR_C6596666.sample) are used throughout the screens which undermines the independence between different tests.

Creating datasets

path_ht29 <- file.path(data_root,"ht29-francesco-iorio","01_normalised_and_FCs")

load_normalised_counts <- function(path){
  load(path)
  return(normed)
}

# Assemble the sgRNAs, genes, and the two plasmid count replicates.
c903 <- load_normalised_counts(file.path(path_ht29,"HT29_c903.tsv_normCounts.RData"))
c904 <- load_normalised_counts(file.path(path_ht29,"HT29_c904.tsv_normCounts.RData"))
c905 <- load_normalised_counts(file.path(path_ht29,"HT29_c905.tsv_normCounts.RData"))
c906 <- load_normalised_counts(file.path(path_ht29,"HT29_c906.tsv_normCounts.RData"))
c907 <- load_normalised_counts(file.path(path_ht29,"HT29_c907.tsv_normCounts.RData"))
c908 <- load_normalised_counts(file.path(path_ht29,"HT29_c908.tsv_normCounts.RData"))

sgrna_comm <- Reduce(intersect, list(c903$sgRNA, c904$sgRNA, c905$sgRNA, c906$sgRNA, c907$sgRNA, c908$sgRNA))

c903 %<>% filter(sgRNA %in% sgrna_comm)
c904 %<>% filter(sgRNA %in% sgrna_comm)
c905 %<>% filter(sgRNA %in% sgrna_comm)
c906 %<>% filter(sgRNA %in% sgrna_comm)
c907 %<>% filter(sgRNA %in% sgrna_comm)
c908 %<>% filter(sgRNA %in% sgrna_comm)

common <- data.frame(c903[,1:3], c905$CRISPR_C6596666.sample)
colnames(common)[3:4] <- c("plasmid.1","plasmid.2")

Two replicates for 'treatments'

# c903 - 6 replicates, break into 3 non-overlapping sets of 2 replicates.
write.table(data.frame(common,c903[,4:5]), file = file.path(params$out,"ht29-perept-data-c903-1.tsv"),
            row.names = F, quote=F, sep="\t")
write.table(data.frame(common,c903[,6:7]), file = file.path(params$out,"ht29-perept-data-c903-2.tsv"),
            row.names = F, quote=F, sep="\t")
write.table(data.frame(common,c903[,8:9]), file = file.path(params$out,"ht29-perept-data-c903-3.tsv"),
            row.names = F, quote=F, sep="\t")
# c904 - 3 replicates, break into 2.
write.table(data.frame(common,c904[,4:5]), file = file.path(params$out,"ht29-perept-data-c904-1.tsv"),
            row.names = F, quote=F, sep="\t")
# c905 - 9 replicates, break into 4 sets of 2 replicates.
write.table(data.frame(common,c905[,4:5]), file = file.path(params$out,"ht29-perept-data-c905-1.tsv"),
            row.names = F, quote=F, sep="\t")
write.table(data.frame(common,c905[,6:7]), file = file.path(params$out,"ht29-perept-data-c905-2.tsv"),
            row.names = F, quote=F, sep="\t")
write.table(data.frame(common,c905[,8:9]), file = file.path(params$out,"ht29-perept-data-c905-3.tsv"),
            row.names = F, quote=F, sep="\t")
write.table(data.frame(common,c905[,10:11]), file = file.path(params$out,"ht29-perept-data-c905-4.tsv"),
            row.names = F, quote=F, sep="\t")
# c906 - 6 replicates, break into 3 sets of 2 replicates.
write.table(data.frame(common,c906[,4:5]), file = file.path(params$out,"ht29-perept-data-c906-1.tsv"),
            row.names = F, quote=F, sep="\t")
write.table(data.frame(common,c906[,6:7]), file = file.path(params$out,"ht29-perept-data-c906-2.tsv"),
            row.names = F, quote=F, sep="\t")
write.table(data.frame(common,c906[,8:9]), file = file.path(params$out,"ht29-perept-data-c906-3.tsv"),
            row.names = F, quote=F, sep="\t")
# c907 - 3 replicates, break into 2.
write.table(data.frame(common,c907[,4:5]), file = file.path(params$out,"ht29-perept-data-c907-1.tsv"),
            row.names = F, quote=F, sep="\t")
# c908 - 3 replicates, break into 2.
write.table(data.frame(common,c908[,4:5]), file = file.path(params$out,"ht29-perept-data-c908-1.tsv"),
            row.names = F, quote=F, sep="\t")

Six screens as in the original structure

Just need to add both plasmids to each screen.

# c903.
write.table(data.frame(common,c903[,4:9]), file = file.path(params$out_orig,"ht29-perept-data-c903.tsv"),
            row.names = F, quote=F, sep="\t")
# c904.
write.table(data.frame(common,c904[,4:6]), file = file.path(params$out_orig,"ht29-perept-data-c904.tsv"),
            row.names = F, quote=F, sep="\t")
# c905.
write.table(data.frame(common,c905[,4:12]), file = file.path(params$out_orig,"ht29-perept-data-c905.tsv"),
            row.names = F, quote=F, sep="\t")
# c906.
write.table(data.frame(common,c906[,4:9]), file = file.path(params$out_orig,"ht29-perept-data-c906.tsv"),
            row.names = F, quote=F, sep="\t")
# c907.
write.table(data.frame(common,c907[,4:6]), file = file.path(params$out_orig,"ht29-perept-data-c907.tsv"),
            row.names = F, quote=F, sep="\t")
# c908.
write.table(data.frame(common,c908[,4:6]), file = file.path(params$out_orig,"ht29-perept-data-c908.tsv"),
            row.names = F, quote=F, sep="\t")

Creating algorithm-specific gene matrices

Mageck

# Read in Mageck 'gene_summary' files (ranked by negative hits) and process.
process_mageck_output <- function(file_list){
  ret <- NULL
  for(file in file_list){
    screen_name <- gsub("^(.*?)\\.gene_summ.*$","\\1",tail(unlist(strsplit(file,"/")),1))
    ts <- read.table(file, header=T, stringsAsFactors = F) %>%
      arrange(id) %>%
      rowwise() %>%
      mutate(neg_hit = ifelse(neg.fdr < params$fdr,1,0),
             pos_hit = ifelse(pos.fdr < params$fdr,1,0),
             neg_or_pos_hit = ifelse((neg_hit==1 || pos_hit==1),1,0)) %>%
      ungroup() %>%
      select(id,neg_hit,pos_hit,neg_or_pos_hit)
    if(is.null(ret)){
      ret <- ts
    }else{
      ret <- cbind(ret, ts %>% select(-id))
    }
    # Rename columns.
    ret %<>%
      rename(!!paste(screen_name,".neg_hit",sep="") := neg_hit, 
             !!paste(screen_name,".pos_hit",sep="") := pos_hit, 
             !!paste(screen_name,".neg_or_pos_hit",sep="") := neg_or_pos_hit)
  }
  return(ret)
}

# Paired replicates.
files <- list.files(file.path(params$root,"output","mageck","paired_replicates"), 
                    pattern = "gene_summary.txt$", full.names = T)
ht29.mageck_hit_matrix <- process_mageck_output(files)

ht29.mageck_neg <- ht29.mageck_hit_matrix %>%
  select(id, contains("neg_hit")) %>%
  mutate(pos_count = rowSums(.[2:14]))

# Original screen structure.
files.orig <- list.files(file.path(params$root,"output","mageck","original_screen_structure","cleanr"), 
                    pattern = "gene_summary.txt$", full.names = T)
ht29.mageck_hit_matrix.orig <- process_mageck_output(files.orig)

ht29.mageck_neg.orig <- ht29.mageck_hit_matrix.orig %>%
  select(id, contains("neg_hit")) %>%
  mutate(pos_count = rowSums(.[2:7])) %>%
  rowwise() %>%
  mutate(ess.pan_cancer = ifelse(id%in%ess_genes$gene,
                                 ifelse(ess_genes$PanCancer_Core_Fitness[ess_genes$gene==id],T,F),F),
         ess.bagel = ifelse(id%in%ess_genes$gene,
                            ifelse(ess_genes$bagel_essential[ess_genes$gene==id],T,F),F),
         ess.ribosomal_proteins = ifelse(id%in%ess_genes$gene,
                                         ifelse(ess_genes$Ribosomal_Proteins[ess_genes$gene==id],T,F),F)) %>%
  ungroup()

cat("Pan-cancer aggregate sensitivity:")
sum(ht29.mageck_neg.orig$pos_count[ht29.mageck_neg.orig$ess.pan_cancer])/(sum(ht29.mageck_neg.orig$ess.pan_cancer)*6)

drugZ

# Read in drugZ output files (ranked by negative hits) and process.
process_drugz_output <- function(file_list){
  ret <- NULL
  for(file in file_list){
    screen_name <- gsub("^(.*?)-drugz\\.txt$","\\1",tail(unlist(strsplit(file,"/")),1))
    ts <- read.table(file, header=T, stringsAsFactors = F) %>%
      arrange(GENE) %>%
      rowwise() %>%
      mutate(neg_hit = ifelse(fdr_synth < params$fdr,1,0)) %>%
      ungroup() %>%
      select(GENE,neg_hit)
    if(is.null(ret)){
      ret <- ts
    }else{
      ret <- cbind(ret, ts %>% select(-GENE))
    }
    # Rename columns.
    ret %<>%
      dplyr::rename(!!paste(screen_name,".neg_hit",sep="") := neg_hit)
  }
  return(ret)
}

# Paired replicates.
files.drugz <- list.files(file.path(params$root,"output","drugz","paired_replicates"), 
                    pattern = "txt$", full.names = T)
ht29.drugz_hit_matrix <- process_drugz_output(files.drugz)

ht29.drugz_neg <- ht29.drugz_hit_matrix %>%
  select(GENE, contains("neg_hit")) %>%
  mutate(pos_count = rowSums(.[2:14]))

# Original screen structure.
files.drugz.orig <- list.files(file.path(params$root,"output","drugz","original_screen_structure","cleanr"), 
                    pattern = "txt$", full.names = T)
ht29.drugz_hit_matrix.orig <- process_drugz_output(files.drugz.orig)

ht29.drugz_neg.orig <- ht29.drugz_hit_matrix.orig %>%
  select(GENE, contains("neg_hit")) %>%
  mutate(pos_count = rowSums(.[2:7])) %>%
  rowwise() %>%
  mutate(ess.pan_cancer = ifelse(GENE%in%ess_genes$gene,
                                 ifelse(ess_genes$PanCancer_Core_Fitness[ess_genes$gene==GENE],T,F),F),
         ess.bagel = ifelse(GENE%in%ess_genes$gene,
                            ifelse(ess_genes$bagel_essential[ess_genes$gene==GENE],T,F),F),
         ess.ribosomal_proteins = ifelse(GENE%in%ess_genes$gene,
                                         ifelse(ess_genes$Ribosomal_Proteins[ess_genes$gene==GENE],T,F),F)) %>%
  ungroup()

cat("Pan-cancer aggregate sensitivity:")
sum(ht29.drugz_neg.orig$pos_count[ht29.drugz_neg.orig$ess.pan_cancer])/(sum(ht29.drugz_neg.orig$ess.pan_cancer)*6)

# Original screen structure - median z-score gene summary.
files.drugz.orig <- list.files(file.path(params$root,"output","drugz","original_screen_structure","cleanr_medzscore"), 
                    pattern = "txt$", full.names = T)
ht29.drugz_hit_matrix.orig <- process_drugz_output(files.drugz.orig)

ht29.drugz_neg.orig <- ht29.drugz_hit_matrix.orig %>%
  select(GENE, contains("neg_hit")) %>%
  mutate(pos_count = rowSums(.[2:7])) %>%
  rowwise() %>%
  mutate(ess.pan_cancer = ifelse(GENE%in%ess_genes$gene,
                                 ifelse(ess_genes$PanCancer_Core_Fitness[ess_genes$gene==GENE],T,F),F),
         ess.bagel = ifelse(GENE%in%ess_genes$gene,
                            ifelse(ess_genes$bagel_essential[ess_genes$gene==GENE],T,F),F),
         ess.ribosomal_proteins = ifelse(GENE%in%ess_genes$gene,
                                         ifelse(ess_genes$Ribosomal_Proteins[ess_genes$gene==GENE],T,F),F)) %>%
  ungroup()

cat("Pan-cancer aggregate sensitivity:")
sum(ht29.drugz_neg.orig$pos_count[ht29.drugz_neg.orig$ess.pan_cancer])/(sum(ht29.drugz_neg.orig$ess.pan_cancer)*6)


# Original screen structure - DESeq2 lfc shrinkage ashr.
files.drugz.orig <- list.files(file.path(params$root,"output","drugz","original_screen_structure","cleanr_lfc_ashr_deseq2-medz"), 
                    pattern = "txt$", full.names = T)
ht29.drugz_hit_matrix.orig <- process_drugz_output(files.drugz.orig)

ht29.drugz_neg.orig <- ht29.drugz_hit_matrix.orig %>%
  select(GENE, contains("neg_hit")) %>%
  mutate(pos_count = rowSums(.[2:7])) %>%
  rowwise() %>%
  mutate(ess.pan_cancer = ifelse(GENE%in%ess_genes$gene,
                                 ifelse(ess_genes$PanCancer_Core_Fitness[ess_genes$gene==GENE],T,F),F),
         ess.bagel = ifelse(GENE%in%ess_genes$gene,
                            ifelse(ess_genes$bagel_essential[ess_genes$gene==GENE],T,F),F),
         ess.ribosomal_proteins = ifelse(GENE%in%ess_genes$gene,
                                         ifelse(ess_genes$Ribosomal_Proteins[ess_genes$gene==GENE],T,F),F)) %>%
  ungroup()

cat("Pan-cancer aggregate sensitivity:")
sum(ht29.drugz_neg.orig$pos_count[ht29.drugz_neg.orig$ess.pan_cancer])/(sum(ht29.drugz_neg.orig$ess.pan_cancer)*6)

Analysing disagreements

# Add both plasmids into each normalised count data frame.
c903 <- data.frame(common,c903[,4:ncol(c903)])
c904 <- data.frame(common,c904[,4:ncol(c904)])
c905 <- data.frame(common,c905[,4:ncol(c905)])
c906 <- data.frame(common,c906[,4:ncol(c906)])
c907 <- data.frame(common,c907[,4:ncol(c907)])
c908 <- data.frame(common,c908[,4:ncol(c908)])

# All 6 positive in Mageck, all 6 negative in drugZ.
gm6 <- (ht29.mageck_neg.orig %>% filter(ess.pan_cancer & pos_count == 6))$id
gd0 <- (ht29.drugz_neg.orig %>% filter(ess.pan_cancer & pos_count == 0))$GENE
mp.dn.genes <- intersect(gm6, gd0)

# All 6 positive in both.
gd6 <- (ht29.drugz_neg.orig %>% filter(ess.pan_cancer & pos_count == 6))$GENE
mp.dp.genes <- intersect(gm6, gd6)

# Calculate log2 FC per guide (un-paired) and Sarle's bimodality index divided by the min(abs(logfc)):
# large values indicate a small minimum logfc but a large skew (due to an outlier with low logfc).
quantify_lowlf_skew <- function(data, screen_name){
  # drugz steps:
  # 1. normalize by total read count * 10e6.
  # 2. (unpaired mode). Calculate log2fc using average across replicates.
  nc <- ncol(data)
  data %<>%
    mutate(mean_plasmid = rowMeans(.[3:4]),
           mean_treat = rowMeans(.[5:nc])) %>%
    group_by(gene, sgRNA) %>%
    mutate(logfc = log2(mean_treat/mean_plasmid)) %>%
    ungroup() %>%
    group_by(gene) %>%
    summarise(screen = screen_name,
              num_grnas = n(),
              sarles_bimod = (1+skewness(logfc)^2)/kurtosis(logfc),
              min_logfc = min(abs(logfc)),
              max_logfc = max(abs(logfc)),
              median_logfc = median(logfc),
              logfc_min_norm = median_logfc/min_logfc,
              low_lfc_skew = sarles_bimod*(1/max_logfc))
  return(data)
}

lowlfc_skew <- rbind(quantify_lowlf_skew(c903, "c903"),
                     quantify_lowlf_skew(c904, "c904"),
                     quantify_lowlf_skew(c905, "c905"),
                     quantify_lowlf_skew(c906, "c906"),
                     quantify_lowlf_skew(c907, "c907"),
                     quantify_lowlf_skew(c908, "c908")) %>%
  mutate(type = case_when(gene %in% mp.dn.genes ~ "Call_Only_Mageck",
                          gene %in% mp.dp.genes ~ "Call_Both",
                          TRUE ~ "Other"))

Using logfc EB shrinkage in `DESeq2`

run_deseq2_lfc_shrink <- function(counts, shrink_method){
  # Prep for DESeq2.
  counts <- counts %>%
    select(-gene) %>%
    mutate_if(is.numeric, round)
  rownames(counts) <- counts$sgRNA
  counts %<>% select(-sgRNA)
  # Treatment info.
  ns <- ncol(counts)
  samp <- data.frame(sample = colnames(counts), treatment = c("ctrl","ctrl",rep("drug",ns-2)), stringsAsFactors = F)
  data <- DESeqDataSetFromMatrix(counts, colData=samp, design=~treatment)
  dds <- DESeq(data)
  # EB shrinkage of logFC.
  slf <- lfcShrink(dds, coef="treatment_drug_vs_ctrl", type=shrink_method)
  slf.df <- as.data.frame(slf@listData) %>%
    mutate(sgRNA=slf@rownames,
           gene = gsub("^(.*?)_.*$","\\1",sgRNA)) %>%
    arrange(log2FoldChange)
  return(slf.df)
}

# Prep for input into 'drugz.py'.
prep_dz <- function(data){
  data %<>%
    select(gene,sgRNA,log2FoldChange) %>%
    dplyr::rename(GENE = gene, zscore = log2FoldChange)
  return(data)
}

c903.slf.ashr <- run_deseq2_lfc_shrink(c903, "ashr")
c904.slf.ashr <- run_deseq2_lfc_shrink(c904, "ashr")
c905.slf.ashr <- run_deseq2_lfc_shrink(c905, "ashr")
c906.slf.ashr <- run_deseq2_lfc_shrink(c906, "ashr")
c907.slf.ashr <- run_deseq2_lfc_shrink(c907, "ashr")
c908.slf.ashr <- run_deseq2_lfc_shrink(c908, "ashr")

write.table(prep_dz(c903.slf.ashr), "~/data/az_cruk/perept_data/ht29-iorio/original_screen_structure/lfc_ashr_deseq2/c903-lfc-deseq2.txt",
            row.names=F, quote=F, sep="\t")
write.table(prep_dz(c904.slf.ashr), "~/data/az_cruk/perept_data/ht29-iorio/original_screen_structure/lfc_ashr_deseq2/c904-lfc-deseq2.txt",
            row.names=F, quote=F, sep="\t")
write.table(prep_dz(c905.slf.ashr), "~/data/az_cruk/perept_data/ht29-iorio/original_screen_structure/lfc_ashr_deseq2/c905-lfc-deseq2.txt",
            row.names=F, quote=F, sep="\t")
write.table(prep_dz(c906.slf.ashr), "~/data/az_cruk/perept_data/ht29-iorio/original_screen_structure/lfc_ashr_deseq2/c906-lfc-deseq2.txt",
            row.names=F, quote=F, sep="\t")
write.table(prep_dz(c907.slf.ashr), "~/data/az_cruk/perept_data/ht29-iorio/original_screen_structure/lfc_ashr_deseq2/c907-lfc-deseq2.txt",
            row.names=F, quote=F, sep="\t")
write.table(prep_dz(c908.slf.ashr), "~/data/az_cruk/perept_data/ht29-iorio/original_screen_structure/lfc_ashr_deseq2/c908-lfc-deseq2.txt",
            row.names=F, quote=F, sep="\t")

# Read in drugZ output files (ranked by negative hits) and process.
process_deseq2_output <- function(...){
  data_list <- list(...)
  names <- names(data_list)
  ret <- NULL
  for(i in 1:length(data_list)){
    screen_name <- names[i]
    ts <- data_list[[i]] %>%
      arrange(gene) %>%
      group_by(gene) %>%
      summarise(neg_hit = ifelse((median(padj) < params$fdr && median(log2FoldChange) < 0),1,0)) %>%
      ungroup()
    if(is.null(ret)){
      ret <- ts
    }else{
      ret <- cbind(ret, ts %>% dplyr::select(-gene))
    }
    # Rename columns.
    ret %<>%
      dplyr::rename(!!paste(screen_name,".neg_hit",sep="") := neg_hit)
  }
  return(ret)
}

ht29.deseq2_hit_matrix.orig <- process_deseq2_output(c903.slf.ashr, c904.slf.ashr, c905.slf.ashr, c906.slf.ashr, 
                                                    c907.slf.ashr, c908.slf.ashr)