knitr::opts_chunk$set(echo = TRUE, dev = "CairoPNG")

Reproduction of cell mixture results using 10x data

This is a reproduction using the MAESTER script 3.3_TenX_CellLineMix_variants.R

Available here: https://github.com/petervangalen/MAESTER-2021/blob/main/3_Cell_line_mixes_variants/3.3_TenX_CellLineMix_variants.R

Loading necessary packages.

We load packages necessary for the MAESTER script.

options(stringsAsFactors = FALSE)
options(scipen = 999)
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(SummarizedExperiment))
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(Matrix))
suppressPackageStartupMessages(library(ComplexHeatmap))
suppressPackageStartupMessages(library(ggrastr))
suppressPackageStartupMessages(library(readxl))

# Custom Functions
cutf <- function(x, f=1, d="/") sapply(strsplit(x, d), function(i) paste(i[f], collapse=d))
computeAFMutMatrix <- function(SE){
  cov <- assays(SE)[["coverage"]]+ 0.000001
  ref_allele <- as.character(rowRanges(SE)$refAllele)
  getMutMatrix <- function(letter){
    mat <- (assays(SE)[[paste0(letter, "_counts_fw")]] + assays(SE)[[paste0(letter, "_counts_rev")]]) / cov
    rownames(mat) <- paste0(as.character(1:dim(mat)[1]), "_", toupper(ref_allele), ">", letter)
    return(mat[toupper(ref_allele) != letter,])
  }
  rbind(getMutMatrix("A"), getMutMatrix("C"), getMutMatrix("G"), getMutMatrix("T"))
}
popcol.tib <- read_excel("MAESTER_colors.xlsx")
mycol.ch <- popcol.tib$hex
names(mycol.ch) <- popcol.tib$name

Loading and preparing the data.

# Import data for 10X 3' v3 Cell Line Mix (available at https://vangalenlab.bwh.harvard.edu/maester-2021/)
seu <- readRDS("TenX_CellLineMix_Seurat_Keep.rds")
maegatk_all.rse <- readRDS("TenX_CellLineMix_All_mr3_maegatk.rds")

# Combine information
common.cells <- intersect(colnames(seu), colnames(maegatk_all.rse))

# Put all objects in the same order
seu <- seu[,common.cells]
maegatk.rse <- maegatk_all.rse[,common.cells]

# Prepare allele frequency matrix.
af.dm <- data.matrix(computeAFMutMatrix(maegatk.rse))*100

# Collect cell information
cells.tib <- tibble(cell = common.cells,
                    orig.ident = seu$orig.ident,
                    CellType_RNA = seu$CellType,
                    UMAP_1 = seu$UMAP_1,
                    UMAP_2 = seu$UMAP_2,
                    Mean_Cov = maegatk.rse$depth)

# Make groups of cell ids
CellSubsets.ls <- list(unionCells = cells.tib$cell,
                       K562 = filter(cells.tib, CellType_RNA == "K562")$cell,
                       BT142 = filter(cells.tib, CellType_RNA == "BT142")$cell)

# Get the mean allele frequency and coverage for every cell subset.
mean_af.ls <- lapply(CellSubsets.ls, function(x) rowMeans(af.dm[,x]))
mean_cov.ls <- lapply(CellSubsets.ls, function(x) rowMeans(assays(maegatk.rse)[["coverage"]][,x])[as.numeric(cutf(rownames(af.dm), d = "_"))])
names(mean_af.ls) <- paste0("mean_af.", names(mean_af.ls))
names(mean_cov.ls) <- paste0("mean_cov.", names(mean_cov.ls))

# Get the quantiles of the VAFs of each variant in each cell subset
quantiles <- c("q01" = 0.01, "q10" = 0.1, "q50" = 0.5, "q90" = 0.9, "q99" = 0.99)
quantiles.ls <- lapply(quantiles, function(x) lapply(CellSubsets.ls, function(y) apply(af.dm[,y], 1, quantile, x) ))

# Get the mean quality for each variant.
assays.ls <- lapply(maegatk.rse@assays$data, function(x) as.matrix(x))
qual.num <- sapply(rownames(af.dm), function(x) {
    #x <- "2141_T>C"
    pos <- as.numeric( cutf(x, d = "_") )
    mut <- cutf(x, d = ">", f = 2)
    # Get the mean quality of reads for this call (only use cells in which the base was sequenced) - forward
    covered_fw <- assays.ls[[str_c(mut, "_counts_fw")]][pos,] > 0
    qual_fw <- assays.ls[[str_c(mut, "_qual_fw")]][pos, covered_fw]
    # Same for reverse
    covered_rev <- assays.ls[[str_c(mut, "_counts_rev")]][pos,] > 0
    qual_rev <- assays.ls[[str_c(mut, "_qual_rev")]][pos, covered_rev]
    qual <- mean(c(qual_fw, qual_rev))
    return(qual)
})

# Collect all information in a tibble
vars.tib <- as_tibble(do.call(cbind, c(mean_af.ls, mean_cov.ls, unlist(quantiles.ls, recursive = F))), rownames = "var")
vars.tib <- add_column(vars.tib, quality = qual.num, .before = 2)

# Save for fast loading next time
write_tsv(vars.tib, "TenX_CellLineMix_Variants1.txt")

Selecting the Variants of Interest

Now, we select the variants of interest.

# Variants 20-fold coverage, VAF of <10% in the bottom 10% of cells and VAF of >90% in top 10% of cells.
voi.ch <- vars.tib %>% filter(mean_cov.unionCells > 20,
                              quality > 30,
                              q10.unionCells < 10,
                              q90.unionCells > 90) %>% .$var

Visualisation

# Generate matrices with coverage, allele frequency and reference / variant reads
cov_voi.mat <- assays(maegatk.rse)[["coverage"]][as.numeric(cutf(voi.ch, d = "_")),]
af_voi.mat <- af.dm[voi.ch,]

# Add coverage and allele frequency info from variants of interest to cells.tib
for (voi in voi.ch) {
  cells.tib <- cells.tib %>%
    left_join(as_tibble(assays(maegatk.rse)[["coverage"]][as.numeric(cutf(voi, d = "_")),], rownames = "cell"), by = "cell") %>%
    left_join(as_tibble(af.dm[voi,], rownames = "cell"), by = "cell") %>%
    rename(value.x = str_c("cov_", str_replace(voi, ">", ".")), value.y = str_c("af_", str_replace(voi, ">", ".")))
}

# Function to sum up the reads supporting either reference or mutant allele for a particulal variant of interest.
supporting_calls <- function(voi, maegatk) {
    #voi <- voi.ch[1]
    #maegatk <- maegatk.rse
    # From the variant, determine position, reference and mutant allele
    pos <- as.numeric(cutf(voi, d = "_"))
    ref <- cutf(voi, d = "_|>|\\.", f = 2)
    mut <- cutf(voi, d = ">|\\.", f = 2)

    # Put all the counts for this variant from the maegatk object in a data frame
    counts.df <- do.call(cbind, lapply(assays(maegatk)[grepl("counts", names(assays(maegatk)))], function(x) x[pos,]))

    # Add up reads supporting reference or mutated allele
    support.df <- data.frame(ref = rowSums(counts.df[,grepl(paste0("^", ref), colnames(counts.df))]),
                             mut = rowSums(counts.df[,grepl(paste0("^", mut), colnames(counts.df))]))

    support.df
}

# Calculate number of supporting reads for each variant; then combine
supporting_calls.ls <- lapply(voi.ch, function(x) supporting_calls(x, maegatk.rse))
names(supporting_calls.ls) <- voi.ch
combined_supporting_calls.df <- do.call(cbind, supporting_calls.ls)

# Add up supporting reads: is it a K562 or BT142 cell?
supporting_calls.tib <- tibble(cell = rownames(combined_supporting_calls.df),
    K562_supporting_calls = rowSums(combined_supporting_calls.df[,c("709_G>A.mut", "1888_G>A.mut", "6249_G>A.mut", "8697_G>A.mut", "11719_G>A.mut", "15452_C>A.mut", "1420_T>C.mut", "2141_T>C.mut", "4216_T>C.mut", "6524_T>C.mut", "9117_T>C.mut", "11251_A>G.mut", "11764_A>G.mut", "11812_A>G.mut", "15607_A>G.mut", "7990_C>T.ref", "12741_C>T.mut")]),
    BT142_supporting_calls = rowSums(combined_supporting_calls.df[,c("709_G>A.ref", "1888_G>A.ref", "6249_G>A.ref", "8697_G>A.ref", "11719_G>A.ref", "15452_C>A.ref", "1420_T>C.ref", "2141_T>C.ref", "4216_T>C.ref", "6524_T>C.ref", "9117_T>C.ref", "11251_A>G.ref", "11764_A>G.ref", "11812_A>G.ref", "15607_A>G.ref", "7990_C>T.mut", "12741_C>T.ref")]))

# Combine information
cells.tib <- left_join(cells.tib, supporting_calls.tib, by = "cell")

# Label cells as K562, BT142, contaminated or NoCoverage based on the number of supporting reads
cells.tib <- cells.tib %>% mutate(CellType_MT = ifelse(K562_supporting_calls + BT142_supporting_calls < 30, yes = "NoCoverage", no =
    ifelse(K562_supporting_calls > 3 & K562_supporting_calls > 10*BT142_supporting_calls, yes = "K562", no =
    ifelse(BT142_supporting_calls > 3 & BT142_supporting_calls > 2*K562_supporting_calls, yes = "BT142", no = "Contaminated")))) %>%
    mutate(CellType_MT = factor(CellType_MT, levels = c("Contaminated", "BT142", "K562", "NoCoverage")))

cells.ch <- cells.tib %>% filter(CellType_MT %in% c("K562", "BT142")) %>%
    select(cell, starts_with("cov_")) %>%
    filter(apply(.[,-1], 1, function(x) all(x > 3))) %>%
    .$cell

# Create heatmap annotation (RNA-seq clusters)
ha <- HeatmapAnnotation(CellType_RNA = gsub("BT142_Cycling", "BT142", cells.tib$CellType_RNA[cells.ch]),
                        col = list(CellType_RNA = c("K562" = "#7BF581", "BT142" = "#8E87F5")))

# Visualize: clustered VAFs
Heatmap(af_voi.mat[,cells.ch],
        col=c("#FCFCFC","#FFEDB0","#FFDF5F","#FEC510","#FA8E24","#F14C2B","#DA2828","#BE2222","#A31D1D"),
        show_row_names = T,
        show_column_names = F,
        cluster_columns = T,
        cluster_rows = F,
        name = "AF",
        top_annotation = ha,
        border = T,
        width = unit(3.5, "inch"),
        height = unit(3.5, "inch"),
        heatmap_legend_param = list(border = "black", at = c(0,25,50,75,100)),
        use_raster = T)


CostaLab/sigurd documentation built on Feb. 10, 2025, 11:08 p.m.