# Set seed for reproducibility set.seed(1454944673L) library(knitr) opts_chunk[["set"]]( audodep = TRUE, cache = FALSE, cache.lazy = FALSE, error = TRUE, fig.height = 10L, fig.width = 10L, message = FALSE, tidy = TRUE, warning = FALSE )
library(cowplot) library(ggplot2) library(ggridges) library(scales) library(tidyverse) library(SingleCellExperiment) library(Matrix) # Load bcbioSingleCell object se = readRDS(params$se_file) metrics = colData(se) %>% as.data.frame
plot_total_cells = function(m){ m %>% ggplot(aes(x=sample)) + geom_bar() + ggtitle("NCells") + theme(axis.text.x = element_text(angle=90, hjust = 1)) } plot_saturation = function(m){ plot_grid( m %>% ggplot(aes(saturation_rate, sample)) + geom_density_ridges() + ggtitle("Saturation rate"), m %>% ggplot(aes(dupReads, saturation_rate, color=sample)) + geom_point() + geom_smooth() + scale_x_log10() + geom_vline(xintercept = 15000) + theme(legend.position="none") + ggtitle("Saturation rate vs Total Reads"), m %>% ggplot(aes(dupMeanReads, saturation_rate, color=sample)) + geom_point() + geom_smooth() + geom_vline(xintercept = 1) + theme(legend.position="bottom") + ggtitle("nGenes vs Total Reads") ) } plot_metrics = function(m){ plot_grid( m %>% ggplot(aes(y=sample, x=nReads)) + geom_density_ridges() + scale_x_log10() + geom_vline(xintercept = 5000), m %>% ggplot(aes(y=sample, x=nUMI)) + geom_density_ridges() + scale_x_log10() + geom_vline(xintercept = 500), m %>% ggplot(aes(y=sample, x=nGenes)) + geom_density_ridges() + scale_x_log10() + geom_vline(xintercept = 500), m %>% ggplot(aes(y=sample, x=mitoRatio)) + geom_density_ridges() + geom_vline(xintercept = params$max_mito_ratio) ) } plot_correlation = function(m){ plot_grid( m %>% ggplot(aes(x=nUMI, y=nReads, color=mitoRatio)) + geom_point(alpha = 0.4) + scale_x_log10() + scale_y_log10() + geom_vline(xintercept = 800) + facet_wrap(~sample), m %>% ggplot(aes(x=nUMI, y=nGenes, color=mitoRatio)) + geom_point() + scale_x_log10() + scale_y_log10() + geom_vline(xintercept = 800)+ facet_wrap(~sample) ) } plot_novelty = function(m){ m %>% ggplot(aes(x=log10GenesPerUMI, y = sample)) + geom_density_ridges() }
plot_total_cells(metrics)
plot_saturation(metrics)
These four plots show:
counts of how many reads are assigned to a given cellular barcode. It is normal for single cell RNA-seq data to contain a large number of low complexity barcodes. The bcbio pipeline filters out most of these barcodes, and here we have applied a threshold cutoff of a minimum of 1000 reads and UMIs > 100 per cell.
unique molecular identifier (UMI)-deconvoluted counts per cell. In general, the distributions should be relatively uniform per sample.
genes with a non-zero count measurement per cell. Seeing gene detection in the range of 500
-5000
is normal for most single-cell experiments.
mitochondrial gene expression as a biomarker of cellular stress during sample preparation.
plot_metrics(metrics)
If we graph out the total number of UMI counts per cell vs. the genes detected per cell and as well vs. reads, we can assess whether there is a large population of low quality cells with low counts and/or gene detection.
plot_correlation(metrics)
Another way to QC the data is to look for less novelty, that is cells that have less genes detected per count than other cells. We can see the samples where we sequenced each cell less have a higher overall novelty, that is because we have not started saturated the sequencing for any given gene for these samples. Outlier cells in these samples might be cells that we have a less complex RNA species than other cells. Sometimes we can detect contamination with low complexity cell types like red blood cells via this metric.
plot_novelty(metrics)
Doing very low filtering:
Taking the top 3000 cells
This possible can be relaxed, but difficult to know at this stage. Normally you try some different cutoff and make the seurat analysis and check the clusters/genes to decide what make more sense.
iSEE is very good visualization tool for inspection of the data:
# same or sample-specific cutoffs cutoff = data.frame(sample=levels(metrics$sample), umi_limit = 800) # it could be one value for each cell keep = metrics %>% tibble::rownames_to_column("samples") %>% left_join(cutoff, by = "sample") %>% dplyr::filter(nUMI >= umi_limit, nGenes>=params$min_genes, mitoRatio<params$max_mito_ratio) %>% .[["samples"]] # take top 3000 cells to compare better between samples lapply(levels(metrics$sample), function(s){ metrics %>% tibble::rownames_to_column("samples") %>% dplyr::filter(sample == s) %>% arrange(desc(nUMI)) %>% head(3000) %>% .[["samples"]] }) %>% unlist -> keep3000 keep_genes = rowSums(assay(se)>0) > params$min_cells_per_gene se_3000 = se[keep_genes,keep3000] metrics_3000 = colData(se_3000) %>% as.data.frame() se_c = se[keep_genes,keep] metrics_clean = colData(se_c) %>% as.data.frame()
plot_total_cells(metrics_3000)
plot_saturation(metrics_3000)
plot_metrics(metrics_3000) plot_correlation(metrics_3000)
plot_novelty(metrics_3000)
plot_total_cells(metrics_clean)
plot_saturation(metrics_clean)
plot_metrics(metrics_clean) plot_correlation(metrics_clean)
plot_novelty(metrics_clean)
saveRDS(se_c, file = file.path(params$data_dir, "se_filtered.rds"))
library(Seurat) seurat <- CreateSeuratObject( raw.data = assay(se_c), meta.data = metrics_clean) saveRDS(seurat, file = file.path(params$data_dir, "seurat.rds"))
Files in data
folder are:
se = readRDS("se.rds")
, is the singleCellExperiment object with all
the cells with soft filtering from pipeline anlaysis : cells with reads > 1000 and umis > 100.se = readRDS("se_filtered.rds")
, is the singleCellExperiment object after filtering.seurat = readRDS("seurat.rds")
is the Seurat object after filtering.devtools::session_info()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.