# knitr options knitr::opts_chunk$set( echo = FALSE ) library(SingleCellExperiment) library(dplyr) library(ggplot2) # Set default ggplot theme theme_set(theme_bw() + theme(plot.margin = margin(rep(20, 4))))
# save some typing later library_id <- params$library unfiltered_sce <- params$unfiltered_sce filtered_sce <- params$filtered_sce processed_sce <- params$processed_sce has_filtered <- !is.null(filtered_sce) has_processed <- !is.null(processed_sce) # if there is no filtered sce, use the unfiltered for both if (!has_filtered) { filtered_sce <- unfiltered_sce } # grab sample id from filtered sce, if missing set sample id to NA if (is.null(metadata(filtered_sce)$sample_id)) { sample_id <- NA } else { sample_id <- metadata(filtered_sce)$sample_id } # add cell stats if missing if (is.null(unfiltered_sce$sum)) { unfiltered_sce <- scuttle::addPerCellQCMetrics(unfiltered_sce) } if (is.null(filtered_sce$sum)) { filtered_sce <- scuttle::addPerCellQCMetrics(filtered_sce) } if (is.null(filtered_sce$subsets_mito_percent)) { filtered_sce$subsets_mito_percent <- NA_real_ skip_miQC <- TRUE } else { skip_miQC <- FALSE } # try to add miQC if it is missing if (is.null(metadata(filtered_sce)$miQC_model) & !skip_miQC) { filtered_sce <- scpcaTools::add_miQC(filtered_sce) } ## Check for additional modalities modalities <- c("RNA-seq") has_cite <- "CITEseq" %in% altExpNames(filtered_sce) if (has_cite) { modalities <- c(modalities, "CITE-seq") } # check for cellhash to add to list of modalities has_cellhash <- "cellhash" %in% altExpNames(filtered_sce) if (has_cellhash) { modalities <- c(modalities, "Multiplex") } # check for umap, need to be sure that processed_sce exists first if (has_processed) { has_umap <- "UMAP" %in% reducedDimNames(processed_sce) } else { has_umap <- FALSE }
r library_id
# only print out multiplexed warning if more than one sample is present has_multiplex <- length(sample_id) > 1 if (has_multiplex) { # convert sample id to bullet separated list multiplex_samples <- paste0("<li>", paste(sample_id, collapse = "</li><li>", "</li>")) glue::glue(" <div class=\"alert alert-warning\"> This library is multiplexed and contains data from more than one sample. Data from the following samples are included in this library: {multiplex_samples} </div> ") }
# extract sce metadata containing processing information as table unfiltered_meta <- metadata(unfiltered_sce) sample_information <- tibble::tibble( "Library id" = library_id, "Sample id" = paste(sample_id, collapse = ", "), "Tech version" = format(unfiltered_meta$tech_version), # format to keep nulls "Data modalities" = paste(modalities, collapse = ", "), "Cells reported by alevin-fry" = format(unfiltered_meta$af_num_cells, big.mark = ",", scientific = FALSE), "Number of genes assayed" = format(nrow(unfiltered_sce), big.mark = ",", scientific = FALSE), "RNA-seq reads sequenced" = format(unfiltered_meta$total_reads, big.mark = ",", scientific = FALSE), "Percent RNA-seq reads mapped to transcripts" = paste0(round((unfiltered_meta$mapped_reads / unfiltered_meta$total_reads) * 100, 2), "%") ) if (has_cite) { cite_exp <- altExp(filtered_sce, "CITEseq") cite_meta <- metadata(cite_exp) sample_information <- sample_information |> mutate( "Number of ADTs assayed" = format(nrow(cite_exp), big.mark = ",", scientific = FALSE), "CITE-seq reads sequenced" = format(cite_meta$total_reads, big.mark = ",", scientific = FALSE), "Percent CITE-seq reads mapped to ADTs" = paste0(round(cite_meta$mapped_reads / cite_meta$total_reads * 100, digits = 2), "%") ) } if (has_cellhash) { multiplex_exp <- altExp(filtered_sce, "cellhash") multiplex_meta <- metadata(multiplex_exp) sample_information <- sample_information |> mutate( "Number of HTOs assayed" = format(nrow(multiplex_exp), big.mark = ",", scientific = FALSE), "Multiplex reads sequenced" = format(multiplex_meta$total_reads, big.mark = ",", scientific = FALSE), "Percent of multiplex reads mapped to HTOs" = paste0(round(multiplex_meta$mapped_reads / multiplex_meta$total_reads * 100, digits = 2), "%") ) } sample_information <- sample_information |> mutate(across(.fns = ~ ifelse(.x == "NULL", "N/A", .x))) |> # reformat nulls t() # make table with sample information knitr::kable(sample_information, align = "r") |> kableExtra::kable_styling( bootstrap_options = "striped", full_width = FALSE, position = "left" ) |> kableExtra::column_spec(2, monospace = TRUE)
# define transcript type transcript_type <- paste(unfiltered_meta$transcript_type, collapse = " ") processing_info <- tibble::tibble( "Salmon version" = format(unfiltered_meta$salmon_version), "Alevin-fry version" = format(unfiltered_meta$alevinfry_version), "Transcriptome index" = format(unfiltered_meta$reference_index), "Alevin-fry droplet detection" = format(unfiltered_meta$af_permit_type), "Resolution" = format(unfiltered_meta$af_resolution), "Transcripts included" = dplyr::case_when( transcript_type == "total spliced" ~ "Total and spliced only", transcript_type == "spliced" ~ "Spliced only", TRUE ~ transcript_type ) ) |> mutate(across(.fns = ~ ifelse(.x == "NULL", "N/A", .x))) |> t() # make table with processing information knitr::kable(processing_info, align = "r") |> kableExtra::kable_styling( bootstrap_options = "striped", full_width = FALSE, position = "left" ) |> kableExtra::column_spec(2, monospace = TRUE)
basic_statistics <- tibble::tibble( "Method used to filter empty droplets" = metadata(filtered_sce)$filtering_method, "Number of cells post filtering empty droplets" = format(ncol(filtered_sce), big.mark = ","), "Percent of reads in cells" = paste0(round((sum(filtered_sce$sum) / sum(unfiltered_sce$sum)) * 100, 2), "%"), "Median UMI count per cell" = format(median(filtered_sce$sum), big.mark = ","), "Median genes detected per cell" = format(median(filtered_sce$detected), big.mark = ","), "Median percent reads mitochondrial" = paste0(round(median(filtered_sce$subsets_mito_percent), 2), "%") ) # if processed sce exists add filtering and normalization table if (has_processed) { processed_meta <- metadata(processed_sce) basic_statistics <- basic_statistics |> mutate( "Method used to filter low quality cells" = format(processed_meta$scpca_filter_method), "Cells after filtering low quality cells" = format(dim(processed_sce)[2], big.mark = ",", scientific = FALSE), "Normalization method" = format(processed_meta$normalization), "Minimum genes per cell cutoff" = format(processed_meta$min_gene_cutoff) ) if (processed_meta$scpca_filter_method == "miQC") { basic_statistics <- basic_statistics |> mutate( "Probability of compromised cell cutoff" = format(processed_meta$prob_compromised_cutoff, big.mark = ",", scientific = FALSE) ) } } basic_statistics <- basic_statistics |> mutate(across(.fns = ~ ifelse(.x == "NULL", "N/A", .x))) |> # reformat nulls t() # make table with basic statistics knitr::kable(basic_statistics, align = "r") |> kableExtra::kable_styling( bootstrap_options = "striped", full_width = FALSE, position = "left" ) |> kableExtra::column_spec(2, monospace = TRUE)
if (has_filtered && (metadata(filtered_sce)$filtering_method == "UMI cutoff")) { glue::glue(" <div class=\"alert alert-warning\"> This library may contain a low number of cells and was unable to be filtered using `DropletUtils`. Droplets with a total UMI count ≥ {metadata(filtered_sce)$umi_cutoff} are included in the filtered `SingleCellExperiment` object. </div> ") }
# check for number of filtered cells min_filtered <- 100 if (has_filtered) { if (ncol(filtered_sce) < min_filtered) { glue::glue(" <div class=\"alert alert-warning\"> This library contains fewer than {min_filtered} cells in the filtered `SingleCellExperiment` object. This may affect the interpretation of results. </div> ") } } # check for number of cells post processing min_processed <- 50 if (has_processed) { if (ncol(processed_sce) < min_processed) { glue::glue(" <div class=\"alert alert-warning\"> This library contains fewer than {min_processed} cells in the processed `SingleCellExperiment` object after removal of low quality cells. UMAP is unable to be calculated and plots will not be shown. </div> ") } }
unfiltered_celldata <- data.frame(colData(unfiltered_sce)) |> mutate( rank = rank(-unfiltered_sce$sum, ties.method = "first"), # using full spec for clarity filter_pass = colnames(unfiltered_sce) %in% colnames(filtered_sce) ) |> select(sum, rank, filter_pass) |> filter(sum > 0) # remove zeros for plotting grouped_celldata <- unfiltered_celldata |> mutate(rank_group = floor(rank / 100)) |> group_by(rank_group) |> summarize( med_sum = median(sum), med_rank = median(rank), pct_passed = sum(filter_pass) / n() * 100 ) top_celldata <- unfiltered_celldata |> filter(rank <= 50) |> mutate(filter_pct = ifelse(filter_pass, 100, 0)) ggplot(grouped_celldata, aes(x = med_rank, y = med_sum, color = pct_passed)) + geom_point( mapping = aes(x = rank, y = sum, color = filter_pct), data = top_celldata, alpha = 0.5 ) + geom_line(size = 2, lineend = "round", linejoin = "round") + scale_x_log10(labels = scales::label_number(accuracy = 1)) + scale_y_log10(labels = scales::label_number(accuracy = 1)) + scale_color_gradient2( low = "grey70", mid = "forestgreen", high = "darkgreen", midpoint = 50 ) + labs( x = "Rank", y = "Total UMI count", color = "% passing\ncell filter" ) + theme( legend.position = c(0, 0), legend.justification = c(0, 0), legend.background = element_rect(color = "grey20", size = 0.25), legend.box.margin = margin(rep(5, 4)) )
The total UMI count of each droplet (barcode) plotted against the rank of that droplet allows visualization of the distribution of sequencing depth across droplets.
The droplets that are expected to contain cells were identified with DropletUtils::emptyDropsCellRanger()
, unless otherwise specified in the Cell Statistics
table, which uses both the total UMI counts and expressed gene content (adapted from Lun et al. 2019).
As the boundary between droplets passing and failing this filter is not solely dependent on total UMI count, some regions contain droplets in both categories.
The color in this plot indicates the percentage of droplets in a region passing the filter.
filtered_celldata <- data.frame(colData(filtered_sce)) ggplot( filtered_celldata, aes( x = sum, y = detected, color = subsets_mito_percent ) ) + geom_point(alpha = 0.3) + scale_color_viridis_c(limits = c(0, 100)) + scale_x_continuous(labels = scales::label_number(accuracy = 1)) + scale_y_continuous(labels = scales::label_number(accuracy = 1)) + labs( x = "Total UMI count", y = "Number of genes detected", color = "Percent reads\nmitochondrial" ) + theme( legend.position = c(0, 1), legend.justification = c(0, 1), legend.background = element_rect(color = "grey20", size = 0.25), legend.box.margin = margin(rep(5, 4)) )
The above plot of cell metrics includes only droplets which have passed the emptyDrops()
filter.
The plot will usually display a strong (but curved) relationship between the total UMI count and the number of genes detected.
Cells with low UMI counts and high mitochondrial percentages may require further filtering.
if (skip_miQC) { cat("miQC model not created, skipping miQC plot. Usually this is because mitochondrial gene data was not available.") } else { # remove prob_compromised if it exists, as this will cause errors with plotModel filtered_sce$prob_compromised <- NULL miQC_model <- metadata(filtered_sce)$miQC_model if (is.null(miQC_model) || length(miQC_model@components) < 2) { # model didn't fit, just plot metrics miQC_plot <- miQC::plotMetrics(filtered_sce) } else { miQC_plot <- miQC::plotModel(filtered_sce, model = miQC_model) } miQC_plot + coord_cartesian(ylim = c(0, 100)) + scale_x_continuous(labels = scales::label_number(accuracy = 1)) + labs( x = "Number of genes detected", y = "Percent reads mitochondrial" ) + theme( legend.position = c(1, 1), legend.justification = c(1, 1), legend.background = element_rect(color = "grey20", size = 0.25), legend.box.margin = margin(rep(5, 4)) ) }
We calculate the probability that a cell is compromised due to degradation or rupture using miQC
(Hippen et al. 2021).
This relies on fitting a mixture model using the number of genes expressed by a cell and the percentage of mitochondrial reads.
The expected plot will show a characteristic triangular shape and two model fit lines.
Cells with low numbers of genes expressed may have both low and high mitochondrial percentage, but cells with many genes tend to have a low mitochondrial percentage.
Compromised cells are likely to have a fewer genes detected and higher percentage of mitochondrial reads.
If the model has failed to fit properly, the pattern of cells may differ, and there may not be model fit lines. This can be the result of a low-quality library or may occur if there is no mitochondrial content, as in the case of a high-quality single-nucleus sample. In such situations, the calculated probability of compromise may not be valid (see miQC vignette for more details).
The below plot highlights cells that were removed prior to normalization and dimensionality reduction.
Cells that should be removed are those that are identified to be low quality cells, such as cells with high probability of being compromised, calculated by miQC
.
The method of filtering is indicated above the plot as either miQC
or Minimum gene cutoff
.
If miQC
, cells below the specified probability compromised cutoff and above the minimum number of unique genes identified are kept for downstream analyses.
If only a Minimum gene cutoff
is used, then miQC
is not used and only those cells that pass the minimum number of unique genes identified threshold are retained.
The dotted vertical line indicates the minimum gene cutoff used for filtering.
if (has_filtered & has_processed) { # grab cutoffs and filtering method from processed sce min_gene_cutoff <- processed_meta$min_gene_cutoff filter_method <- processed_meta$scpca_filter_method # add column to coldata labeling cells to keep/remove based on filtering method filtered_coldata_df <- colData(filtered_sce) |> as.data.frame() |> tibble::rownames_to_column("barcode") |> dplyr::mutate(scpca_filter = ifelse(barcode %in% colnames(processed_sce), "Keep", "Remove" )) ggplot(filtered_coldata_df, aes(x = detected, y = subsets_mito_percent, color = scpca_filter)) + geom_point(alpha = 0.5, size = 1) + geom_vline(xintercept = min_gene_cutoff, linetype = "dashed") + labs( x = "Number of genes detected", y = "Mitochondrial percentage", color = "Filter", title = stringr::str_replace(filter_method, "_", " ") ) + theme( plot.title = element_text(hjust = 0.5), legend.position = c(1, 1), legend.justification = c(1, 1), legend.background = element_rect(color = "grey20", size = 0.25), legend.title = element_text(hjust = 0.5) ) } else { glue::glue(" <div class=\"alert alert-warning\"> No filtering of low quality cells was performed on this library. </div> ") }
The raw counts from all cells that remain after filtering low quality cells are then normalized prior to selection of highly variable genes and dimensionality reduction.
R session information
if (requireNamespace("sessioninfo", quietly = TRUE)) { sessioninfo::session_info() } else { sessionInfo() }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.