# Set seed for reproducibility
set.seed(1454944673L)
library(knitr)
opts_chunk[["set"]](
    audodep = TRUE,
    cache = FALSE,
    cache.lazy = FALSE,
    error = TRUE,
    fig.height = 10L,
    fig.width = 10L,
    message = FALSE,
    tidy = TRUE,
    warning = FALSE
)
library(cowplot)
library(ggplot2)
library(ggridges)
library(scales)
library(tidyverse)
library(SingleCellExperiment)
library(Matrix)

# Load bcbioSingleCell object
se = readRDS(params$se_file)
metrics = colData(se) %>% as.data.frame

Overview

plot_total_cells = function(m){
    m %>% 
  ggplot(aes(x=sample)) + geom_bar() + ggtitle("NCells") +
    theme(axis.text.x = element_text(angle=90, hjust = 1))

}

plot_saturation = function(m){
    plot_grid(
    m %>% 
        ggplot(aes(saturation_rate, sample)) +
        geom_density_ridges() +
        ggtitle("Saturation rate"),
    m %>% 
        ggplot(aes(dupReads, saturation_rate, color=sample)) +
        geom_point() +
        geom_smooth() +
        scale_x_log10() +
        geom_vline(xintercept = 15000) +
        theme(legend.position="none") +
        ggtitle("Saturation rate vs Total Reads"),

    m %>% 
        ggplot(aes(dupMeanReads, saturation_rate, color=sample)) +
        geom_point() +
        geom_smooth() +
        geom_vline(xintercept = 1) +
        theme(legend.position="bottom") +
        ggtitle("nGenes vs Total Reads")
    )
}

plot_metrics = function(m){
    plot_grid(
        m %>% 
            ggplot(aes(y=sample,  x=nReads)) + geom_density_ridges() + scale_x_log10() + geom_vline(xintercept = 5000),
        m %>% 
            ggplot(aes(y=sample,  x=nUMI)) + geom_density_ridges() + scale_x_log10() + geom_vline(xintercept = 500),
        m %>% 
            ggplot(aes(y=sample,  x=nGenes)) + geom_density_ridges() + scale_x_log10() + geom_vline(xintercept = 500),
        m %>% 
            ggplot(aes(y=sample,  x=mitoRatio)) + geom_density_ridges() + geom_vline(xintercept = params$max_mito_ratio)
    )
}

plot_correlation =  function(m){
    plot_grid(
    m %>% 
        ggplot(aes(x=nUMI, y=nReads, color=mitoRatio)) + geom_point(alpha = 0.4) + scale_x_log10() + scale_y_log10() + geom_vline(xintercept = 800) +
        facet_wrap(~sample),
    m %>% 
        ggplot(aes(x=nUMI, y=nGenes, color=mitoRatio)) + geom_point() + scale_x_log10() + scale_y_log10() + geom_vline(xintercept = 800)+
        facet_wrap(~sample)
    )
}

plot_novelty = function(m){
    m %>%
    ggplot(aes(x=log10GenesPerUMI, y = sample)) +
    geom_density_ridges()
}

Quality control metrics

Cells/Genes per sample

plot_total_cells(metrics)

Saturation

plot_saturation(metrics)

QC plots

These four plots show:

plot_metrics(metrics)

UMIs vs. genes detected vs. reads

If we graph out the total number of UMI counts per cell vs. the genes detected per cell and as well vs. reads, we can assess whether there is a large population of low quality cells with low counts and/or gene detection.

plot_correlation(metrics)

Novelty

Another way to QC the data is to look for less novelty, that is cells that have less genes detected per count than other cells. We can see the samples where we sequenced each cell less have a higher overall novelty, that is because we have not started saturated the sequencing for any given gene for these samples. Outlier cells in these samples might be cells that we have a less complex RNA species than other cells. Sometimes we can detect contamination with low complexity cell types like red blood cells via this metric.

plot_novelty(metrics)

Filter cells

Doing very low filtering:

Taking the top 3000 cells

This possible can be relaxed, but difficult to know at this stage. Normally you try some different cutoff and make the seurat analysis and check the clusters/genes to decide what make more sense.

iSEE is very good visualization tool for inspection of the data:

# same or sample-specific cutoffs
cutoff = data.frame(sample=levels(metrics$sample),
                    umi_limit = 800) # it could be one value for each cell
keep = metrics %>% tibble::rownames_to_column("samples") %>% 
    left_join(cutoff, by = "sample") %>% 
    dplyr::filter(nUMI >= umi_limit,
                  nGenes>=params$min_genes,
                  mitoRatio<params$max_mito_ratio) %>%
    .[["samples"]]

# take top 3000 cells to compare better between samples
lapply(levels(metrics$sample), function(s){
    metrics %>% tibble::rownames_to_column("samples") %>% 
        dplyr::filter(sample == s) %>% 
        arrange(desc(nUMI)) %>% 
        head(3000) %>% 
        .[["samples"]]
}) %>% unlist -> keep3000

keep_genes = rowSums(assay(se)>0) > params$min_cells_per_gene

se_3000 = se[keep_genes,keep3000]
metrics_3000 = colData(se_3000) %>% as.data.frame()

se_c = se[keep_genes,keep]
metrics_clean = colData(se_c) %>% as.data.frame()

Clean with top 3000 cells

Cells/Genes per sample

plot_total_cells(metrics_3000)

Saturation

plot_saturation(metrics_3000)

QC plots after filtering

plot_metrics(metrics_3000)

plot_correlation(metrics_3000)
plot_novelty(metrics_3000)

Clean with custom nUMI cutoffs

Cells/Genes per sample

plot_total_cells(metrics_clean)

Saturation

plot_saturation(metrics_clean)

QC plots after filtering

plot_metrics(metrics_clean)

plot_correlation(metrics_clean)
plot_novelty(metrics_clean)
saveRDS(se_c, file = file.path(params$data_dir, "se_filtered.rds"))
library(Seurat)
seurat <- CreateSeuratObject(
  raw.data = assay(se_c), meta.data = metrics_clean)
saveRDS(seurat, file = file.path(params$data_dir, "seurat.rds"))

Files in data folder are:

session

devtools::session_info()


hbc/hbcABC documentation built on Sept. 5, 2019, 9:51 p.m.