knitr::opts_chunk$set(echo = TRUE, dev = "CairoPNG")
This is a reproduction using the MAESTER script 3.3_TenX_CellLineMix_variants.R
Available here: https://github.com/petervangalen/MAESTER-2021/blob/main/3_Cell_line_mixes_variants/3.3_TenX_CellLineMix_variants.R
We load packages necessary for the MAESTER script.
options(stringsAsFactors = FALSE) options(scipen = 999) suppressPackageStartupMessages(library(tidyverse)) suppressPackageStartupMessages(library(SummarizedExperiment)) suppressPackageStartupMessages(library(Seurat)) suppressPackageStartupMessages(library(data.table)) suppressPackageStartupMessages(library(Matrix)) suppressPackageStartupMessages(library(ComplexHeatmap)) suppressPackageStartupMessages(library(ggrastr)) suppressPackageStartupMessages(library(readxl)) # Custom Functions cutf <- function(x, f=1, d="/") sapply(strsplit(x, d), function(i) paste(i[f], collapse=d)) computeAFMutMatrix <- function(SE){ cov <- assays(SE)[["coverage"]]+ 0.000001 ref_allele <- as.character(rowRanges(SE)$refAllele) getMutMatrix <- function(letter){ mat <- (assays(SE)[[paste0(letter, "_counts_fw")]] + assays(SE)[[paste0(letter, "_counts_rev")]]) / cov rownames(mat) <- paste0(as.character(1:dim(mat)[1]), "_", toupper(ref_allele), ">", letter) return(mat[toupper(ref_allele) != letter,]) } rbind(getMutMatrix("A"), getMutMatrix("C"), getMutMatrix("G"), getMutMatrix("T")) } popcol.tib <- read_excel("MAESTER_colors.xlsx") mycol.ch <- popcol.tib$hex names(mycol.ch) <- popcol.tib$name
# Import data for 10X 3' v3 Cell Line Mix (available at https://vangalenlab.bwh.harvard.edu/maester-2021/) seu <- readRDS("TenX_CellLineMix_Seurat_Keep.rds") maegatk_all.rse <- readRDS("TenX_CellLineMix_All_mr3_maegatk.rds") # Combine information common.cells <- intersect(colnames(seu), colnames(maegatk_all.rse)) # Put all objects in the same order seu <- seu[,common.cells] maegatk.rse <- maegatk_all.rse[,common.cells] # Prepare allele frequency matrix. af.dm <- data.matrix(computeAFMutMatrix(maegatk.rse))*100 # Collect cell information cells.tib <- tibble(cell = common.cells, orig.ident = seu$orig.ident, CellType_RNA = seu$CellType, UMAP_1 = seu$UMAP_1, UMAP_2 = seu$UMAP_2, Mean_Cov = maegatk.rse$depth) # Make groups of cell ids CellSubsets.ls <- list(unionCells = cells.tib$cell, K562 = filter(cells.tib, CellType_RNA == "K562")$cell, BT142 = filter(cells.tib, CellType_RNA == "BT142")$cell) # Get the mean allele frequency and coverage for every cell subset. mean_af.ls <- lapply(CellSubsets.ls, function(x) rowMeans(af.dm[,x])) mean_cov.ls <- lapply(CellSubsets.ls, function(x) rowMeans(assays(maegatk.rse)[["coverage"]][,x])[as.numeric(cutf(rownames(af.dm), d = "_"))]) names(mean_af.ls) <- paste0("mean_af.", names(mean_af.ls)) names(mean_cov.ls) <- paste0("mean_cov.", names(mean_cov.ls)) # Get the quantiles of the VAFs of each variant in each cell subset quantiles <- c("q01" = 0.01, "q10" = 0.1, "q50" = 0.5, "q90" = 0.9, "q99" = 0.99) quantiles.ls <- lapply(quantiles, function(x) lapply(CellSubsets.ls, function(y) apply(af.dm[,y], 1, quantile, x) )) # Get the mean quality for each variant. assays.ls <- lapply(maegatk.rse@assays$data, function(x) as.matrix(x)) qual.num <- sapply(rownames(af.dm), function(x) { #x <- "2141_T>C" pos <- as.numeric( cutf(x, d = "_") ) mut <- cutf(x, d = ">", f = 2) # Get the mean quality of reads for this call (only use cells in which the base was sequenced) - forward covered_fw <- assays.ls[[str_c(mut, "_counts_fw")]][pos,] > 0 qual_fw <- assays.ls[[str_c(mut, "_qual_fw")]][pos, covered_fw] # Same for reverse covered_rev <- assays.ls[[str_c(mut, "_counts_rev")]][pos,] > 0 qual_rev <- assays.ls[[str_c(mut, "_qual_rev")]][pos, covered_rev] qual <- mean(c(qual_fw, qual_rev)) return(qual) }) # Collect all information in a tibble vars.tib <- as_tibble(do.call(cbind, c(mean_af.ls, mean_cov.ls, unlist(quantiles.ls, recursive = F))), rownames = "var") vars.tib <- add_column(vars.tib, quality = qual.num, .before = 2) # Save for fast loading next time write_tsv(vars.tib, "TenX_CellLineMix_Variants1.txt")
Now, we select the variants of interest.
# Variants 20-fold coverage, VAF of <10% in the bottom 10% of cells and VAF of >90% in top 10% of cells. voi.ch <- vars.tib %>% filter(mean_cov.unionCells > 20, quality > 30, q10.unionCells < 10, q90.unionCells > 90) %>% .$var
# Generate matrices with coverage, allele frequency and reference / variant reads cov_voi.mat <- assays(maegatk.rse)[["coverage"]][as.numeric(cutf(voi.ch, d = "_")),] af_voi.mat <- af.dm[voi.ch,] # Add coverage and allele frequency info from variants of interest to cells.tib for (voi in voi.ch) { cells.tib <- cells.tib %>% left_join(as_tibble(assays(maegatk.rse)[["coverage"]][as.numeric(cutf(voi, d = "_")),], rownames = "cell"), by = "cell") %>% left_join(as_tibble(af.dm[voi,], rownames = "cell"), by = "cell") %>% rename(value.x = str_c("cov_", str_replace(voi, ">", ".")), value.y = str_c("af_", str_replace(voi, ">", "."))) } # Function to sum up the reads supporting either reference or mutant allele for a particulal variant of interest. supporting_calls <- function(voi, maegatk) { #voi <- voi.ch[1] #maegatk <- maegatk.rse # From the variant, determine position, reference and mutant allele pos <- as.numeric(cutf(voi, d = "_")) ref <- cutf(voi, d = "_|>|\\.", f = 2) mut <- cutf(voi, d = ">|\\.", f = 2) # Put all the counts for this variant from the maegatk object in a data frame counts.df <- do.call(cbind, lapply(assays(maegatk)[grepl("counts", names(assays(maegatk)))], function(x) x[pos,])) # Add up reads supporting reference or mutated allele support.df <- data.frame(ref = rowSums(counts.df[,grepl(paste0("^", ref), colnames(counts.df))]), mut = rowSums(counts.df[,grepl(paste0("^", mut), colnames(counts.df))])) support.df } # Calculate number of supporting reads for each variant; then combine supporting_calls.ls <- lapply(voi.ch, function(x) supporting_calls(x, maegatk.rse)) names(supporting_calls.ls) <- voi.ch combined_supporting_calls.df <- do.call(cbind, supporting_calls.ls) # Add up supporting reads: is it a K562 or BT142 cell? supporting_calls.tib <- tibble(cell = rownames(combined_supporting_calls.df), K562_supporting_calls = rowSums(combined_supporting_calls.df[,c("709_G>A.mut", "1888_G>A.mut", "6249_G>A.mut", "8697_G>A.mut", "11719_G>A.mut", "15452_C>A.mut", "1420_T>C.mut", "2141_T>C.mut", "4216_T>C.mut", "6524_T>C.mut", "9117_T>C.mut", "11251_A>G.mut", "11764_A>G.mut", "11812_A>G.mut", "15607_A>G.mut", "7990_C>T.ref", "12741_C>T.mut")]), BT142_supporting_calls = rowSums(combined_supporting_calls.df[,c("709_G>A.ref", "1888_G>A.ref", "6249_G>A.ref", "8697_G>A.ref", "11719_G>A.ref", "15452_C>A.ref", "1420_T>C.ref", "2141_T>C.ref", "4216_T>C.ref", "6524_T>C.ref", "9117_T>C.ref", "11251_A>G.ref", "11764_A>G.ref", "11812_A>G.ref", "15607_A>G.ref", "7990_C>T.mut", "12741_C>T.ref")])) # Combine information cells.tib <- left_join(cells.tib, supporting_calls.tib, by = "cell") # Label cells as K562, BT142, contaminated or NoCoverage based on the number of supporting reads cells.tib <- cells.tib %>% mutate(CellType_MT = ifelse(K562_supporting_calls + BT142_supporting_calls < 30, yes = "NoCoverage", no = ifelse(K562_supporting_calls > 3 & K562_supporting_calls > 10*BT142_supporting_calls, yes = "K562", no = ifelse(BT142_supporting_calls > 3 & BT142_supporting_calls > 2*K562_supporting_calls, yes = "BT142", no = "Contaminated")))) %>% mutate(CellType_MT = factor(CellType_MT, levels = c("Contaminated", "BT142", "K562", "NoCoverage"))) cells.ch <- cells.tib %>% filter(CellType_MT %in% c("K562", "BT142")) %>% select(cell, starts_with("cov_")) %>% filter(apply(.[,-1], 1, function(x) all(x > 3))) %>% .$cell # Create heatmap annotation (RNA-seq clusters) ha <- HeatmapAnnotation(CellType_RNA = gsub("BT142_Cycling", "BT142", cells.tib$CellType_RNA[cells.ch]), col = list(CellType_RNA = c("K562" = "#7BF581", "BT142" = "#8E87F5"))) # Visualize: clustered VAFs Heatmap(af_voi.mat[,cells.ch], col=c("#FCFCFC","#FFEDB0","#FFDF5F","#FEC510","#FA8E24","#F14C2B","#DA2828","#BE2222","#A31D1D"), show_row_names = T, show_column_names = F, cluster_columns = T, cluster_rows = F, name = "AF", top_annotation = ha, border = T, width = unit(3.5, "inch"), height = unit(3.5, "inch"), heatmap_legend_param = list(border = "black", at = c(0,25,50,75,100)), use_raster = T)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.