options(warn=-1) suppressMessages(library(knitr)) suppressMessages(library(tidyr)) suppressMessages(library(magrittr)) suppressMessages(library(ggplot2)) suppressMessages(library(dplyr)) suppressMessages(library(moments)) data_root <- "~/data/az_cruk" # Pan-cancer (Sanger), Bagel, and ribosomal proteins. load("../data/ess_genes.rda")
The overall goal is to develop a set of standard repeated-tests CRISPR datasets that can be used in conjunction with the Maximum Likelihood estimates of false-positive and false-negative rates (implemented in the R package perept
) to compare the performance of algorithm changes for the AZ-CRUK CRISPR analysis pipeline.
Three data sources will be used:
The HT29 data is not ideal as they are essentiality screens and the effect sizes will be large and not so relevant to drug treatment screens. However, there are a lot of replicates potentially providing power for discriminating the performance of different algorithms.
While the drug screens (2 and 3) are ideal screen types, they have very few replicates and will require sub-sampling of sgRNAs to create pseudo-replicates (respecting screen and sample origins). This is not ideal as it creates dependencies between the tests.
Approach
Caveats
ERS717283.plasmid
and CRISPR_C6596666.sample
) are used throughout the screens which undermines the independence between different tests.path_ht29 <- file.path(data_root,"ht29-francesco-iorio","01_normalised_and_FCs") load_normalised_counts <- function(path){ load(path) return(normed) } # Assemble the sgRNAs, genes, and the two plasmid count replicates. c903 <- load_normalised_counts(file.path(path_ht29,"HT29_c903.tsv_normCounts.RData")) c904 <- load_normalised_counts(file.path(path_ht29,"HT29_c904.tsv_normCounts.RData")) c905 <- load_normalised_counts(file.path(path_ht29,"HT29_c905.tsv_normCounts.RData")) c906 <- load_normalised_counts(file.path(path_ht29,"HT29_c906.tsv_normCounts.RData")) c907 <- load_normalised_counts(file.path(path_ht29,"HT29_c907.tsv_normCounts.RData")) c908 <- load_normalised_counts(file.path(path_ht29,"HT29_c908.tsv_normCounts.RData")) sgrna_comm <- Reduce(intersect, list(c903$sgRNA, c904$sgRNA, c905$sgRNA, c906$sgRNA, c907$sgRNA, c908$sgRNA)) c903 %<>% filter(sgRNA %in% sgrna_comm) c904 %<>% filter(sgRNA %in% sgrna_comm) c905 %<>% filter(sgRNA %in% sgrna_comm) c906 %<>% filter(sgRNA %in% sgrna_comm) c907 %<>% filter(sgRNA %in% sgrna_comm) c908 %<>% filter(sgRNA %in% sgrna_comm) common <- data.frame(c903[,1:3], c905$CRISPR_C6596666.sample) colnames(common)[3:4] <- c("plasmid.1","plasmid.2")
# c903 - 6 replicates, break into 3 non-overlapping sets of 2 replicates. write.table(data.frame(common,c903[,4:5]), file = file.path(params$out,"ht29-perept-data-c903-1.tsv"), row.names = F, quote=F, sep="\t") write.table(data.frame(common,c903[,6:7]), file = file.path(params$out,"ht29-perept-data-c903-2.tsv"), row.names = F, quote=F, sep="\t") write.table(data.frame(common,c903[,8:9]), file = file.path(params$out,"ht29-perept-data-c903-3.tsv"), row.names = F, quote=F, sep="\t") # c904 - 3 replicates, break into 2. write.table(data.frame(common,c904[,4:5]), file = file.path(params$out,"ht29-perept-data-c904-1.tsv"), row.names = F, quote=F, sep="\t") # c905 - 9 replicates, break into 4 sets of 2 replicates. write.table(data.frame(common,c905[,4:5]), file = file.path(params$out,"ht29-perept-data-c905-1.tsv"), row.names = F, quote=F, sep="\t") write.table(data.frame(common,c905[,6:7]), file = file.path(params$out,"ht29-perept-data-c905-2.tsv"), row.names = F, quote=F, sep="\t") write.table(data.frame(common,c905[,8:9]), file = file.path(params$out,"ht29-perept-data-c905-3.tsv"), row.names = F, quote=F, sep="\t") write.table(data.frame(common,c905[,10:11]), file = file.path(params$out,"ht29-perept-data-c905-4.tsv"), row.names = F, quote=F, sep="\t") # c906 - 6 replicates, break into 3 sets of 2 replicates. write.table(data.frame(common,c906[,4:5]), file = file.path(params$out,"ht29-perept-data-c906-1.tsv"), row.names = F, quote=F, sep="\t") write.table(data.frame(common,c906[,6:7]), file = file.path(params$out,"ht29-perept-data-c906-2.tsv"), row.names = F, quote=F, sep="\t") write.table(data.frame(common,c906[,8:9]), file = file.path(params$out,"ht29-perept-data-c906-3.tsv"), row.names = F, quote=F, sep="\t") # c907 - 3 replicates, break into 2. write.table(data.frame(common,c907[,4:5]), file = file.path(params$out,"ht29-perept-data-c907-1.tsv"), row.names = F, quote=F, sep="\t") # c908 - 3 replicates, break into 2. write.table(data.frame(common,c908[,4:5]), file = file.path(params$out,"ht29-perept-data-c908-1.tsv"), row.names = F, quote=F, sep="\t")
Just need to add both plasmids to each screen.
# c903. write.table(data.frame(common,c903[,4:9]), file = file.path(params$out_orig,"ht29-perept-data-c903.tsv"), row.names = F, quote=F, sep="\t") # c904. write.table(data.frame(common,c904[,4:6]), file = file.path(params$out_orig,"ht29-perept-data-c904.tsv"), row.names = F, quote=F, sep="\t") # c905. write.table(data.frame(common,c905[,4:12]), file = file.path(params$out_orig,"ht29-perept-data-c905.tsv"), row.names = F, quote=F, sep="\t") # c906. write.table(data.frame(common,c906[,4:9]), file = file.path(params$out_orig,"ht29-perept-data-c906.tsv"), row.names = F, quote=F, sep="\t") # c907. write.table(data.frame(common,c907[,4:6]), file = file.path(params$out_orig,"ht29-perept-data-c907.tsv"), row.names = F, quote=F, sep="\t") # c908. write.table(data.frame(common,c908[,4:6]), file = file.path(params$out_orig,"ht29-perept-data-c908.tsv"), row.names = F, quote=F, sep="\t")
# Read in Mageck 'gene_summary' files (ranked by negative hits) and process. process_mageck_output <- function(file_list){ ret <- NULL for(file in file_list){ screen_name <- gsub("^(.*?)\\.gene_summ.*$","\\1",tail(unlist(strsplit(file,"/")),1)) ts <- read.table(file, header=T, stringsAsFactors = F) %>% arrange(id) %>% rowwise() %>% mutate(neg_hit = ifelse(neg.fdr < params$fdr,1,0), pos_hit = ifelse(pos.fdr < params$fdr,1,0), neg_or_pos_hit = ifelse((neg_hit==1 || pos_hit==1),1,0)) %>% ungroup() %>% select(id,neg_hit,pos_hit,neg_or_pos_hit) if(is.null(ret)){ ret <- ts }else{ ret <- cbind(ret, ts %>% select(-id)) } # Rename columns. ret %<>% rename(!!paste(screen_name,".neg_hit",sep="") := neg_hit, !!paste(screen_name,".pos_hit",sep="") := pos_hit, !!paste(screen_name,".neg_or_pos_hit",sep="") := neg_or_pos_hit) } return(ret) } # Paired replicates. files <- list.files(file.path(params$root,"output","mageck","paired_replicates"), pattern = "gene_summary.txt$", full.names = T) ht29.mageck_hit_matrix <- process_mageck_output(files) ht29.mageck_neg <- ht29.mageck_hit_matrix %>% select(id, contains("neg_hit")) %>% mutate(pos_count = rowSums(.[2:14])) # Original screen structure. files.orig <- list.files(file.path(params$root,"output","mageck","original_screen_structure","cleanr"), pattern = "gene_summary.txt$", full.names = T) ht29.mageck_hit_matrix.orig <- process_mageck_output(files.orig) ht29.mageck_neg.orig <- ht29.mageck_hit_matrix.orig %>% select(id, contains("neg_hit")) %>% mutate(pos_count = rowSums(.[2:7])) %>% rowwise() %>% mutate(ess.pan_cancer = ifelse(id%in%ess_genes$gene, ifelse(ess_genes$PanCancer_Core_Fitness[ess_genes$gene==id],T,F),F), ess.bagel = ifelse(id%in%ess_genes$gene, ifelse(ess_genes$bagel_essential[ess_genes$gene==id],T,F),F), ess.ribosomal_proteins = ifelse(id%in%ess_genes$gene, ifelse(ess_genes$Ribosomal_Proteins[ess_genes$gene==id],T,F),F)) %>% ungroup() cat("Pan-cancer aggregate sensitivity:") sum(ht29.mageck_neg.orig$pos_count[ht29.mageck_neg.orig$ess.pan_cancer])/(sum(ht29.mageck_neg.orig$ess.pan_cancer)*6)
# Read in drugZ output files (ranked by negative hits) and process. process_drugz_output <- function(file_list){ ret <- NULL for(file in file_list){ screen_name <- gsub("^(.*?)-drugz\\.txt$","\\1",tail(unlist(strsplit(file,"/")),1)) ts <- read.table(file, header=T, stringsAsFactors = F) %>% arrange(GENE) %>% rowwise() %>% mutate(neg_hit = ifelse(fdr_synth < params$fdr,1,0)) %>% ungroup() %>% select(GENE,neg_hit) if(is.null(ret)){ ret <- ts }else{ ret <- cbind(ret, ts %>% select(-GENE)) } # Rename columns. ret %<>% dplyr::rename(!!paste(screen_name,".neg_hit",sep="") := neg_hit) } return(ret) } # Paired replicates. files.drugz <- list.files(file.path(params$root,"output","drugz","paired_replicates"), pattern = "txt$", full.names = T) ht29.drugz_hit_matrix <- process_drugz_output(files.drugz) ht29.drugz_neg <- ht29.drugz_hit_matrix %>% select(GENE, contains("neg_hit")) %>% mutate(pos_count = rowSums(.[2:14])) # Original screen structure. files.drugz.orig <- list.files(file.path(params$root,"output","drugz","original_screen_structure","cleanr"), pattern = "txt$", full.names = T) ht29.drugz_hit_matrix.orig <- process_drugz_output(files.drugz.orig) ht29.drugz_neg.orig <- ht29.drugz_hit_matrix.orig %>% select(GENE, contains("neg_hit")) %>% mutate(pos_count = rowSums(.[2:7])) %>% rowwise() %>% mutate(ess.pan_cancer = ifelse(GENE%in%ess_genes$gene, ifelse(ess_genes$PanCancer_Core_Fitness[ess_genes$gene==GENE],T,F),F), ess.bagel = ifelse(GENE%in%ess_genes$gene, ifelse(ess_genes$bagel_essential[ess_genes$gene==GENE],T,F),F), ess.ribosomal_proteins = ifelse(GENE%in%ess_genes$gene, ifelse(ess_genes$Ribosomal_Proteins[ess_genes$gene==GENE],T,F),F)) %>% ungroup() cat("Pan-cancer aggregate sensitivity:") sum(ht29.drugz_neg.orig$pos_count[ht29.drugz_neg.orig$ess.pan_cancer])/(sum(ht29.drugz_neg.orig$ess.pan_cancer)*6) # Original screen structure - median z-score gene summary. files.drugz.orig <- list.files(file.path(params$root,"output","drugz","original_screen_structure","cleanr_medzscore"), pattern = "txt$", full.names = T) ht29.drugz_hit_matrix.orig <- process_drugz_output(files.drugz.orig) ht29.drugz_neg.orig <- ht29.drugz_hit_matrix.orig %>% select(GENE, contains("neg_hit")) %>% mutate(pos_count = rowSums(.[2:7])) %>% rowwise() %>% mutate(ess.pan_cancer = ifelse(GENE%in%ess_genes$gene, ifelse(ess_genes$PanCancer_Core_Fitness[ess_genes$gene==GENE],T,F),F), ess.bagel = ifelse(GENE%in%ess_genes$gene, ifelse(ess_genes$bagel_essential[ess_genes$gene==GENE],T,F),F), ess.ribosomal_proteins = ifelse(GENE%in%ess_genes$gene, ifelse(ess_genes$Ribosomal_Proteins[ess_genes$gene==GENE],T,F),F)) %>% ungroup() cat("Pan-cancer aggregate sensitivity:") sum(ht29.drugz_neg.orig$pos_count[ht29.drugz_neg.orig$ess.pan_cancer])/(sum(ht29.drugz_neg.orig$ess.pan_cancer)*6) # Original screen structure - DESeq2 lfc shrinkage ashr. files.drugz.orig <- list.files(file.path(params$root,"output","drugz","original_screen_structure","cleanr_lfc_ashr_deseq2-medz"), pattern = "txt$", full.names = T) ht29.drugz_hit_matrix.orig <- process_drugz_output(files.drugz.orig) ht29.drugz_neg.orig <- ht29.drugz_hit_matrix.orig %>% select(GENE, contains("neg_hit")) %>% mutate(pos_count = rowSums(.[2:7])) %>% rowwise() %>% mutate(ess.pan_cancer = ifelse(GENE%in%ess_genes$gene, ifelse(ess_genes$PanCancer_Core_Fitness[ess_genes$gene==GENE],T,F),F), ess.bagel = ifelse(GENE%in%ess_genes$gene, ifelse(ess_genes$bagel_essential[ess_genes$gene==GENE],T,F),F), ess.ribosomal_proteins = ifelse(GENE%in%ess_genes$gene, ifelse(ess_genes$Ribosomal_Proteins[ess_genes$gene==GENE],T,F),F)) %>% ungroup() cat("Pan-cancer aggregate sensitivity:") sum(ht29.drugz_neg.orig$pos_count[ht29.drugz_neg.orig$ess.pan_cancer])/(sum(ht29.drugz_neg.orig$ess.pan_cancer)*6)
# Add both plasmids into each normalised count data frame. c903 <- data.frame(common,c903[,4:ncol(c903)]) c904 <- data.frame(common,c904[,4:ncol(c904)]) c905 <- data.frame(common,c905[,4:ncol(c905)]) c906 <- data.frame(common,c906[,4:ncol(c906)]) c907 <- data.frame(common,c907[,4:ncol(c907)]) c908 <- data.frame(common,c908[,4:ncol(c908)]) # All 6 positive in Mageck, all 6 negative in drugZ. gm6 <- (ht29.mageck_neg.orig %>% filter(ess.pan_cancer & pos_count == 6))$id gd0 <- (ht29.drugz_neg.orig %>% filter(ess.pan_cancer & pos_count == 0))$GENE mp.dn.genes <- intersect(gm6, gd0) # All 6 positive in both. gd6 <- (ht29.drugz_neg.orig %>% filter(ess.pan_cancer & pos_count == 6))$GENE mp.dp.genes <- intersect(gm6, gd6) # Calculate log2 FC per guide (un-paired) and Sarle's bimodality index divided by the min(abs(logfc)): # large values indicate a small minimum logfc but a large skew (due to an outlier with low logfc). quantify_lowlf_skew <- function(data, screen_name){ # drugz steps: # 1. normalize by total read count * 10e6. # 2. (unpaired mode). Calculate log2fc using average across replicates. nc <- ncol(data) data %<>% mutate(mean_plasmid = rowMeans(.[3:4]), mean_treat = rowMeans(.[5:nc])) %>% group_by(gene, sgRNA) %>% mutate(logfc = log2(mean_treat/mean_plasmid)) %>% ungroup() %>% group_by(gene) %>% summarise(screen = screen_name, num_grnas = n(), sarles_bimod = (1+skewness(logfc)^2)/kurtosis(logfc), min_logfc = min(abs(logfc)), max_logfc = max(abs(logfc)), median_logfc = median(logfc), logfc_min_norm = median_logfc/min_logfc, low_lfc_skew = sarles_bimod*(1/max_logfc)) return(data) } lowlfc_skew <- rbind(quantify_lowlf_skew(c903, "c903"), quantify_lowlf_skew(c904, "c904"), quantify_lowlf_skew(c905, "c905"), quantify_lowlf_skew(c906, "c906"), quantify_lowlf_skew(c907, "c907"), quantify_lowlf_skew(c908, "c908")) %>% mutate(type = case_when(gene %in% mp.dn.genes ~ "Call_Only_Mageck", gene %in% mp.dp.genes ~ "Call_Both", TRUE ~ "Other"))
DESeq2
run_deseq2_lfc_shrink <- function(counts, shrink_method){ # Prep for DESeq2. counts <- counts %>% select(-gene) %>% mutate_if(is.numeric, round) rownames(counts) <- counts$sgRNA counts %<>% select(-sgRNA) # Treatment info. ns <- ncol(counts) samp <- data.frame(sample = colnames(counts), treatment = c("ctrl","ctrl",rep("drug",ns-2)), stringsAsFactors = F) data <- DESeqDataSetFromMatrix(counts, colData=samp, design=~treatment) dds <- DESeq(data) # EB shrinkage of logFC. slf <- lfcShrink(dds, coef="treatment_drug_vs_ctrl", type=shrink_method) slf.df <- as.data.frame(slf@listData) %>% mutate(sgRNA=slf@rownames, gene = gsub("^(.*?)_.*$","\\1",sgRNA)) %>% arrange(log2FoldChange) return(slf.df) } # Prep for input into 'drugz.py'. prep_dz <- function(data){ data %<>% select(gene,sgRNA,log2FoldChange) %>% dplyr::rename(GENE = gene, zscore = log2FoldChange) return(data) } c903.slf.ashr <- run_deseq2_lfc_shrink(c903, "ashr") c904.slf.ashr <- run_deseq2_lfc_shrink(c904, "ashr") c905.slf.ashr <- run_deseq2_lfc_shrink(c905, "ashr") c906.slf.ashr <- run_deseq2_lfc_shrink(c906, "ashr") c907.slf.ashr <- run_deseq2_lfc_shrink(c907, "ashr") c908.slf.ashr <- run_deseq2_lfc_shrink(c908, "ashr") write.table(prep_dz(c903.slf.ashr), "~/data/az_cruk/perept_data/ht29-iorio/original_screen_structure/lfc_ashr_deseq2/c903-lfc-deseq2.txt", row.names=F, quote=F, sep="\t") write.table(prep_dz(c904.slf.ashr), "~/data/az_cruk/perept_data/ht29-iorio/original_screen_structure/lfc_ashr_deseq2/c904-lfc-deseq2.txt", row.names=F, quote=F, sep="\t") write.table(prep_dz(c905.slf.ashr), "~/data/az_cruk/perept_data/ht29-iorio/original_screen_structure/lfc_ashr_deseq2/c905-lfc-deseq2.txt", row.names=F, quote=F, sep="\t") write.table(prep_dz(c906.slf.ashr), "~/data/az_cruk/perept_data/ht29-iorio/original_screen_structure/lfc_ashr_deseq2/c906-lfc-deseq2.txt", row.names=F, quote=F, sep="\t") write.table(prep_dz(c907.slf.ashr), "~/data/az_cruk/perept_data/ht29-iorio/original_screen_structure/lfc_ashr_deseq2/c907-lfc-deseq2.txt", row.names=F, quote=F, sep="\t") write.table(prep_dz(c908.slf.ashr), "~/data/az_cruk/perept_data/ht29-iorio/original_screen_structure/lfc_ashr_deseq2/c908-lfc-deseq2.txt", row.names=F, quote=F, sep="\t")
# Read in drugZ output files (ranked by negative hits) and process. process_deseq2_output <- function(...){ data_list <- list(...) names <- names(data_list) ret <- NULL for(i in 1:length(data_list)){ screen_name <- names[i] ts <- data_list[[i]] %>% arrange(gene) %>% group_by(gene) %>% summarise(neg_hit = ifelse((median(padj) < params$fdr && median(log2FoldChange) < 0),1,0)) %>% ungroup() if(is.null(ret)){ ret <- ts }else{ ret <- cbind(ret, ts %>% dplyr::select(-gene)) } # Rename columns. ret %<>% dplyr::rename(!!paste(screen_name,".neg_hit",sep="") := neg_hit) } return(ret) } ht29.deseq2_hit_matrix.orig <- process_deseq2_output(c903.slf.ashr, c904.slf.ashr, c905.slf.ashr, c906.slf.ashr, c907.slf.ashr, c908.slf.ashr)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.