In AlexsLemonade/scpcaTools: Single cell data tools for ScPCA

# knitr options
knitr::opts_chunk$set(
  echo = FALSE
)

library(SingleCellExperiment)
library(dplyr)
library(ggplot2)

# Set default ggplot theme
theme_set(theme_bw() +
  theme(plot.margin = margin(rep(20, 4))))

# save some typing later
library_id <- params$library
unfiltered_sce <- params$unfiltered_sce
filtered_sce <- params$filtered_sce
processed_sce <- params$processed_sce

has_filtered <- !is.null(filtered_sce)
has_processed <- !is.null(processed_sce)

# if there is no filtered sce, use the unfiltered for both
if (!has_filtered) {
  filtered_sce <- unfiltered_sce
}

# grab sample id from filtered sce, if missing set sample id to NA
if (is.null(metadata(filtered_sce)$sample_id)) {
  sample_id <- NA
} else {
  sample_id <- metadata(filtered_sce)$sample_id
}

# add cell stats if missing
if (is.null(unfiltered_sce$sum)) {
  unfiltered_sce <- scuttle::addPerCellQCMetrics(unfiltered_sce)
}
if (is.null(filtered_sce$sum)) {
  filtered_sce <- scuttle::addPerCellQCMetrics(filtered_sce)
}
if (is.null(filtered_sce$subsets_mito_percent)) {
  filtered_sce$subsets_mito_percent <- NA_real_
  skip_miQC <- TRUE
} else {
  skip_miQC <- FALSE
}

# try to add miQC if it is missing
if (is.null(metadata(filtered_sce)$miQC_model) & !skip_miQC) {
  filtered_sce <- scpcaTools::add_miQC(filtered_sce)
}

## Check for additional modalities
modalities <- c("RNA-seq")

has_cite <- "CITEseq" %in% altExpNames(filtered_sce)
if (has_cite) {
  modalities <- c(modalities, "CITE-seq")
}

# check for cellhash to add to list of modalities
has_cellhash <- "cellhash" %in% altExpNames(filtered_sce)
if (has_cellhash) {
  modalities <- c(modalities, "Multiplex")
}

# check for umap, need to be sure that processed_sce exists first
if (has_processed) {
  has_umap <- "UMAP" %in% reducedDimNames(processed_sce)
} else {
  has_umap <- FALSE
}

Processing Information for `r library_id`

# only print out multiplexed warning if more than one sample is present
has_multiplex <- length(sample_id) > 1

if (has_multiplex) {
  # convert sample id to bullet separated list
  multiplex_samples <- paste0("<li>", paste(sample_id, collapse = "</li><li>", "</li>"))
  glue::glue("
    <div class=\"alert alert-warning\">

    This library is multiplexed and contains data from more than one sample.
    Data from the following samples are included in this library:

    {multiplex_samples}

    </div>
  ")
}

Raw Sample Metrics

# extract sce metadata containing processing information as table
unfiltered_meta <- metadata(unfiltered_sce)

sample_information <- tibble::tibble(
  "Library id" = library_id,
  "Sample id" = paste(sample_id, collapse = ", "),
  "Tech version" = format(unfiltered_meta$tech_version), # format to keep nulls
  "Data modalities" = paste(modalities, collapse = ", "),
  "Cells reported by alevin-fry" =
    format(unfiltered_meta$af_num_cells, big.mark = ",", scientific = FALSE),
  "Number of genes assayed" =
    format(nrow(unfiltered_sce), big.mark = ",", scientific = FALSE),
  "RNA-seq reads sequenced" =
    format(unfiltered_meta$total_reads, big.mark = ",", scientific = FALSE),
  "Percent RNA-seq reads mapped to transcripts" =
    paste0(round((unfiltered_meta$mapped_reads / unfiltered_meta$total_reads) * 100, 2), "%")
)
if (has_cite) {
  cite_exp <- altExp(filtered_sce, "CITEseq")
  cite_meta <- metadata(cite_exp)

  sample_information <- sample_information |>
    mutate(
      "Number of ADTs assayed" =
        format(nrow(cite_exp), big.mark = ",", scientific = FALSE),
      "CITE-seq reads sequenced" =
        format(cite_meta$total_reads, big.mark = ",", scientific = FALSE),
      "Percent CITE-seq reads mapped to ADTs" =
        paste0(round(cite_meta$mapped_reads / cite_meta$total_reads * 100, digits = 2), "%")
    )
}
if (has_cellhash) {
  multiplex_exp <- altExp(filtered_sce, "cellhash")
  multiplex_meta <- metadata(multiplex_exp)

  sample_information <- sample_information |>
    mutate(
      "Number of HTOs assayed" =
        format(nrow(multiplex_exp), big.mark = ",", scientific = FALSE),
      "Multiplex reads sequenced" =
        format(multiplex_meta$total_reads, big.mark = ",", scientific = FALSE),
      "Percent of multiplex reads mapped to HTOs" =
        paste0(round(multiplex_meta$mapped_reads / multiplex_meta$total_reads * 100, digits = 2), "%")
    )
}


sample_information <- sample_information |>
  mutate(across(.fns = ~ ifelse(.x == "NULL", "N/A", .x))) |> # reformat nulls
  t()

# make table with sample information
knitr::kable(sample_information, align = "r") |>
  kableExtra::kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE,
    position = "left"
  ) |>
  kableExtra::column_spec(2, monospace = TRUE)

Pre-Processing Information

# define transcript type
transcript_type <- paste(unfiltered_meta$transcript_type, collapse = " ")

processing_info <- tibble::tibble(
  "Salmon version" = format(unfiltered_meta$salmon_version),
  "Alevin-fry version" = format(unfiltered_meta$alevinfry_version),
  "Transcriptome index" = format(unfiltered_meta$reference_index),
  "Alevin-fry droplet detection" = format(unfiltered_meta$af_permit_type),
  "Resolution" = format(unfiltered_meta$af_resolution),
  "Transcripts included" = dplyr::case_when(
    transcript_type == "total spliced" ~ "Total and spliced only",
    transcript_type == "spliced" ~ "Spliced only",
    TRUE ~ transcript_type
  )
) |>
  mutate(across(.fns = ~ ifelse(.x == "NULL", "N/A", .x))) |>
  t()


# make table with processing information
knitr::kable(processing_info, align = "r") |>
  kableExtra::kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE,
    position = "left"
  ) |>
  kableExtra::column_spec(2, monospace = TRUE)

RNA-seq Experiment Summary

Cell Statistics

basic_statistics <- tibble::tibble(
  "Method used to filter empty droplets"          = metadata(filtered_sce)$filtering_method,
  "Number of cells post filtering empty droplets" = format(ncol(filtered_sce), big.mark = ","),
  "Percent of reads in cells"                     = paste0(round((sum(filtered_sce$sum) / sum(unfiltered_sce$sum)) * 100, 2), "%"),
  "Median UMI count per cell"                     = format(median(filtered_sce$sum), big.mark = ","),
  "Median genes detected per cell"                = format(median(filtered_sce$detected), big.mark = ","),
  "Median percent reads mitochondrial"            = paste0(round(median(filtered_sce$subsets_mito_percent), 2), "%")
)

# if processed sce exists add filtering and normalization table
if (has_processed) {
  processed_meta <- metadata(processed_sce)

  basic_statistics <- basic_statistics |>
    mutate(
      "Method used to filter low quality cells" = format(processed_meta$scpca_filter_method),
      "Cells after filtering low quality cells" = format(dim(processed_sce)[2], big.mark = ",", scientific = FALSE),
      "Normalization method"                    = format(processed_meta$normalization),
      "Minimum genes per cell cutoff"           = format(processed_meta$min_gene_cutoff)
    )
  if (processed_meta$scpca_filter_method == "miQC") {
    basic_statistics <- basic_statistics |>
      mutate(
        "Probability of compromised cell cutoff" = format(processed_meta$prob_compromised_cutoff, big.mark = ",", scientific = FALSE)
      )
  }
}


basic_statistics <- basic_statistics |>
  mutate(across(.fns = ~ ifelse(.x == "NULL", "N/A", .x))) |> # reformat nulls
  t()

# make table with basic statistics
knitr::kable(basic_statistics, align = "r") |>
  kableExtra::kable_styling(
    bootstrap_options = "striped",
    full_width = FALSE,
    position = "left"
  ) |>
  kableExtra::column_spec(2, monospace = TRUE)

if (has_filtered &&
  (metadata(filtered_sce)$filtering_method == "UMI cutoff")) {
  glue::glue("
    <div class=\"alert alert-warning\">

    This library may contain a low number of cells and was unable to be filtered using `DropletUtils`.
    Droplets with a total UMI count ≥ {metadata(filtered_sce)$umi_cutoff} are included in the filtered `SingleCellExperiment` object.

    </div>
  ")
}

# check for number of filtered cells
min_filtered <- 100
if (has_filtered) {
  if (ncol(filtered_sce) < min_filtered) {
    glue::glue("
      <div class=\"alert alert-warning\">

      This library contains fewer than {min_filtered} cells in the filtered `SingleCellExperiment` object.
      This may affect the interpretation of results.

      </div>
    ")
  }
}

# check for number of cells post processing
min_processed <- 50
if (has_processed) {
  if (ncol(processed_sce) < min_processed) {
    glue::glue("
      <div class=\"alert alert-warning\">

      This library contains fewer than {min_processed} cells in the processed `SingleCellExperiment` object after removal of low quality cells.
      UMAP is unable to be calculated and plots will not be shown.

      </div>
    ")
  }
}

Knee Plot

unfiltered_celldata <- data.frame(colData(unfiltered_sce)) |>
  mutate(
    rank = rank(-unfiltered_sce$sum, ties.method = "first"), # using full spec for clarity
    filter_pass = colnames(unfiltered_sce) %in% colnames(filtered_sce)
  ) |>
  select(sum, rank, filter_pass) |>
  filter(sum > 0) # remove zeros for plotting


grouped_celldata <- unfiltered_celldata |>
  mutate(rank_group = floor(rank / 100)) |>
  group_by(rank_group) |>
  summarize(
    med_sum = median(sum),
    med_rank = median(rank),
    pct_passed = sum(filter_pass) / n() * 100
  )

top_celldata <- unfiltered_celldata |>
  filter(rank <= 50) |>
  mutate(filter_pct = ifelse(filter_pass, 100, 0))

ggplot(grouped_celldata, aes(x = med_rank, y = med_sum, color = pct_passed)) +
  geom_point(
    mapping = aes(x = rank, y = sum, color = filter_pct),
    data = top_celldata,
    alpha = 0.5
  ) +
  geom_line(size = 2, lineend = "round", linejoin = "round") +
  scale_x_log10(labels = scales::label_number(accuracy = 1)) +
  scale_y_log10(labels = scales::label_number(accuracy = 1)) +
  scale_color_gradient2(
    low = "grey70",
    mid = "forestgreen",
    high = "darkgreen",
    midpoint = 50
  ) +
  labs(
    x = "Rank",
    y = "Total UMI count",
    color = "% passing\ncell filter"
  ) +
  theme(
    legend.position = c(0, 0),
    legend.justification = c(0, 0),
    legend.background = element_rect(color = "grey20", size = 0.25),
    legend.box.margin = margin(rep(5, 4))
  )

The total UMI count of each droplet (barcode) plotted against the rank of that droplet allows visualization of the distribution of sequencing depth across droplets. The droplets that are expected to contain cells were identified with DropletUtils::emptyDropsCellRanger(), unless otherwise specified in the Cell Statistics table, which uses both the total UMI counts and expressed gene content (adapted from Lun et al. 2019). As the boundary between droplets passing and failing this filter is not solely dependent on total UMI count, some regions contain droplets in both categories. The color in this plot indicates the percentage of droplets in a region passing the filter.

Cell Read Metrics

filtered_celldata <- data.frame(colData(filtered_sce))

ggplot(
  filtered_celldata,
  aes(
    x = sum,
    y = detected,
    color = subsets_mito_percent
  )
) +
  geom_point(alpha = 0.3) +
  scale_color_viridis_c(limits = c(0, 100)) +
  scale_x_continuous(labels = scales::label_number(accuracy = 1)) +
  scale_y_continuous(labels = scales::label_number(accuracy = 1)) +
  labs(
    x = "Total UMI count",
    y = "Number of genes detected",
    color = "Percent reads\nmitochondrial"
  ) +
  theme(
    legend.position = c(0, 1),
    legend.justification = c(0, 1),
    legend.background = element_rect(color = "grey20", size = 0.25),
    legend.box.margin = margin(rep(5, 4))
  )

The above plot of cell metrics includes only droplets which have passed the emptyDrops() filter. The plot will usually display a strong (but curved) relationship between the total UMI count and the number of genes detected. Cells with low UMI counts and high mitochondrial percentages may require further filtering.

miQC Model Diagnostics

if (skip_miQC) {
  cat("miQC model not created, skipping miQC plot. Usually this is because mitochondrial gene data was not available.")
} else {
  # remove prob_compromised if it exists, as this will cause errors with plotModel
  filtered_sce$prob_compromised <- NULL
  miQC_model <- metadata(filtered_sce)$miQC_model

  if (is.null(miQC_model) || length(miQC_model@components) < 2) {
    # model didn't fit, just plot metrics
    miQC_plot <- miQC::plotMetrics(filtered_sce)
  } else {
    miQC_plot <- miQC::plotModel(filtered_sce, model = miQC_model)
  }

  miQC_plot +
    coord_cartesian(ylim = c(0, 100)) +
    scale_x_continuous(labels = scales::label_number(accuracy = 1)) +
    labs(
      x = "Number of genes detected",
      y = "Percent reads mitochondrial"
    ) +
    theme(
      legend.position = c(1, 1),
      legend.justification = c(1, 1),
      legend.background = element_rect(color = "grey20", size = 0.25),
      legend.box.margin = margin(rep(5, 4))
    )
}

We calculate the probability that a cell is compromised due to degradation or rupture using miQC (Hippen et al. 2021). This relies on fitting a mixture model using the number of genes expressed by a cell and the percentage of mitochondrial reads. The expected plot will show a characteristic triangular shape and two model fit lines. Cells with low numbers of genes expressed may have both low and high mitochondrial percentage, but cells with many genes tend to have a low mitochondrial percentage. Compromised cells are likely to have a fewer genes detected and higher percentage of mitochondrial reads.

If the model has failed to fit properly, the pattern of cells may differ, and there may not be model fit lines. This can be the result of a low-quality library or may occur if there is no mitochondrial content, as in the case of a high-quality single-nucleus sample. In such situations, the calculated probability of compromise may not be valid (see miQC vignette for more details).

Removing low quality cells

The below plot highlights cells that were removed prior to normalization and dimensionality reduction. Cells that should be removed are those that are identified to be low quality cells, such as cells with high probability of being compromised, calculated by miQC. The method of filtering is indicated above the plot as either miQC or Minimum gene cutoff. If miQC, cells below the specified probability compromised cutoff and above the minimum number of unique genes identified are kept for downstream analyses. If only a Minimum gene cutoff is used, then miQC is not used and only those cells that pass the minimum number of unique genes identified threshold are retained. The dotted vertical line indicates the minimum gene cutoff used for filtering.

if (has_filtered & has_processed) {
  # grab cutoffs and filtering method from processed sce
  min_gene_cutoff <- processed_meta$min_gene_cutoff
  filter_method <- processed_meta$scpca_filter_method

  # add column to coldata labeling cells to keep/remove based on filtering method
  filtered_coldata_df <- colData(filtered_sce) |>
    as.data.frame() |>
    tibble::rownames_to_column("barcode") |>
    dplyr::mutate(scpca_filter = ifelse(barcode %in% colnames(processed_sce),
      "Keep",
      "Remove"
    ))

  ggplot(filtered_coldata_df, aes(x = detected, y = subsets_mito_percent, color = scpca_filter)) +
    geom_point(alpha = 0.5, size = 1) +
    geom_vline(xintercept = min_gene_cutoff, linetype = "dashed") +
    labs(
      x = "Number of genes detected",
      y = "Mitochondrial percentage",
      color = "Filter",
      title = stringr::str_replace(filter_method, "_", " ")
    ) +
    theme(
      plot.title = element_text(hjust = 0.5),
      legend.position = c(1, 1),
      legend.justification = c(1, 1),
      legend.background = element_rect(color = "grey20", size = 0.25),
      legend.title = element_text(hjust = 0.5)
    )
} else {
  glue::glue("
    <div class=\"alert alert-warning\">

    No filtering of low quality cells was performed on this library.

    </div>
  ")
}

The raw counts from all cells that remain after filtering low quality cells are then normalized prior to selection of highly variable genes and dimensionality reduction.

Session Info

R session information

if (requireNamespace("sessioninfo", quietly = TRUE)) {
  sessioninfo::session_info()
} else {
  sessionInfo()
}

AlexsLemonade/scpcaTools documentation built on July 12, 2024, 8:34 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

AlexsLemonade/scpcaTools
Single cell data tools for ScPCA

In AlexsLemonade/scpcaTools: Single cell data tools for ScPCA

Processing Information for `r library_id`

Raw Sample Metrics

Pre-Processing Information

RNA-seq Experiment Summary

Cell Statistics

Knee Plot

Cell Read Metrics

miQC Model Diagnostics

Removing low quality cells

Session Info

R Package Documentation

Browse R Packages

We want your feedback!

AlexsLemonade/scpcaTools Single cell data tools for ScPCA

In AlexsLemonade/scpcaTools: Single cell data tools for ScPCA

Processing Information for r library_id

Raw Sample Metrics

Pre-Processing Information

RNA-seq Experiment Summary

Cell Statistics

Knee Plot

Cell Read Metrics

miQC Model Diagnostics

Removing low quality cells

Session Info

R Package Documentation

Browse R Packages

We want your feedback!

AlexsLemonade/scpcaTools
Single cell data tools for ScPCA

Processing Information for `r library_id`