In uzh/ezRun: An R meta-package for the analysis of Next Generation Sequencing Data

Started on r format(Sys.time(), "%Y-%m-%d %H:%M:%S")

library(SummarizedExperiment)
library(plotly)
library(scater)
library(ezRun)
library(DT)
library(htmltools)
library(Matrix)
library(Seurat)
library(pheatmap)
library(DropletUtils)
library(cowplot)
library(tidyverse)
## ----------------------------------------------
## debug
# title:  "`r metadata(sce)$param$name`"
# sce <- readRDS("/srv/gstore/projects/p3271/SCCountQC_41088_2019-11-12--14-43-47/1834_SCCountQC/sce.rds")

debug <- FALSE

param <- metadata(sce)$param
## ----------------------------------------------
## picard metrics
evalPicard = switch (toupper(param$scProtocol),
  "SMART-SEQ2" = TRUE,
  "10X"=FALSE,
  NA
)

param$minReadsPerGene <- switch(toupper(param$scProtocol),
                                "SMART-SEQ2"=3,
                                "10X"=1)

isMito <- toupper(as.character(seqnames(rowRanges(sce)))) %in% 
  toupper(c("chrM", "MT", "M"))

## sceFixName for section "Reads on top 50 genes"
sceFixName = sce
rownames(sceFixName) <- uniquifyFeatureNames(ID=rowData(sceFixName)$gene_id,
                                             names=rowData(sceFixName)$gene_name)

sceFixName <- addPerCellQC(sceFixName, subsets=list(Mito=isMito),
                           detection_limit=max(param$minReadsPerGene-1, 0))
sceFixName <- addPerFeatureQC(sceFixName,
                              detection_limit=max(param$minReadsPerGene-1, 0))

## datasetAllCells for section "Heatmap of reads per cell on the plate"
if(toupper(param$scProtocol) == "SMART-SEQ2"){
  datasetAllCells <- data.frame(colData(sce),
                                check.names = FALSE, stringsAsFactors=FALSE)
}

CountQC_Result {.tabset}

Overview

Total Number of Cells: r ncol(sce)
Median Genes per Cell = r median(sceFixName$detected)
Median Reads per Cell = r format(median(sceFixName$sum), scientific=FALSE)
Number of input reads (Smart-Seq2 only) = r ifelse(is.null(metadata(sce)$mlog), NA, metadata(sce)$mlog["Number of input reads",2])
% of reads mapped to multiple loci (Smart-Seq2 only) = r ifelse(is.null(metadata(sce)$mlog), NA, metadata(sce)$mlog["% of reads mapped to multiple loci",2])

Saturation

This plot shows the Median Genes per Cell as a function of downsampled sequencing depth in mean reads per cell, up to the observed sequencing depth. The slope of the curve near the endpoint can be interpreted as an upper bound to the benefit to be gained from increasing the sequencing depth beyond this point.

proportions <- (1:10)/10
medianNrGenesPerCell <- numeric(length(proportions))
for(i in 1:length(proportions)){
  set.seed(100)
  new.counts <- downsampleMatrix(assay(sce, "counts"), prop=proportions[i])
  medianNrGenesPerCell[i] <- median(Matrix::colSums(new.counts >= param$minReadsPerGene))
}
toPlot <- tibble(x=proportions, y=medianNrGenesPerCell)
xax <- list(
  title = "Proportion of current library size",
  zeroline = FALSE
)
yax <- list(
  title = "Median Genes per Cell",
  zeroline = FALSE
)

plot_ly(data=toPlot, x=~x, y=~y, type = 'scatter', mode = 'lines') %>%
  layout(title = "Median Genes per Cell",
         xaxis = xax, yaxis = yax)

Diagnostic plots of cells

Interactive plot showing the read counts vs the number of expressed genes and mitochondial content for all cells. Each dot is a cell.

p1 <- plotColData(sceFixName, x = "sum", y="detected", colour_by="Batch") +
  xlab("Read count") + ylab("Number of expressed genes") +
  theme_minimal_grid()
p1 <- ggplotly(p1)

p2 <- plotColData(sceFixName, x = "sum", y="subsets_Mito_percent", 
                 colour_by="Batch") +
  xlab("Read count") + ylab("Mitochondrial percent") +
  theme_minimal_grid()
p2 <- ggplotly(p2)

subplot(p1, p2, titleX=TRUE, titleY=TRUE, nrows=1) %>%
  layout(showlegend = FALSE)

QC metrics on the cells

Histograms of read counts, number of expressed genes and proportion of reads assigned to mitochondrial genes for all cells

yax <- list(
  title = "Number of cells"
)
p1 <- plot_ly(data=data.frame(colData(sceFixName), check.names = FALSE),
              x=~sum) %>%
  layout(xaxis = list(title="Read counts (thousands)"), yaxis = yax)
p2 <- plot_ly(data=data.frame(colData(sceFixName), check.names = FALSE),
              x=~detected) %>%
  layout(xaxis = list(title="Number of expressed genes"), yaxis = yax)
p3 <- plot_ly(data=data.frame(colData(sceFixName), check.names = FALSE),
              x=~subsets_Mito_percent) %>%
  layout(xaxis = list(title="Mitochondrial proportion (%)"), yaxis = yax)
subplot(p1, p2, p3, titleX=TRUE, titleY=TRUE, nrows=1) %>%
  layout(showlegend = FALSE)

p1 <- plotColData(sceFixName, y="sum", x="Batch") +
  ylab("Library size") + theme_minimal_grid()
p2 <- plotColData(sceFixName, y="detected", x="Batch") +
  ylab("Number of expressed genes") + theme_minimal_grid()
p3 <- plotColData(sceFixName, y="subsets_Mito_percent", x="Batch") +
   ylab("Mitochondrial percent") + theme_minimal_grid()
p <- plot_grid(p1, p2, p3, ncol=3)
print(p)

Highly expressed features

Percentage of total counts assigned to the top 50 (by default) most-expressed features. Each row in the plot below corresponds to a gene, and each bar corresponds to the expression of a gene in a single cell. The circle indicates the median expression of each gene, with which genes are sorted.

We expect to see the "usual suspects", i.e., mitochondrial genes, actin, ribosomal protein, MALAT1. A few spike-in transcripts may also be present here, though if all of the spike-ins are in the top 50, it suggests that too much spike-in RNA was added. A large number of pseudo-genes or predicted genes may indicate problems with alignment.

The feature control is mitochondrial gene.

plotHighestExprs(sceFixName, n=50, exprs_values = "counts")

Heatmap of reads per cell on the plate

Sequenced reads on the plate layout.The heatmap values are in $log10$ scale. The colorScale ranges from half of median value to twice of median value.

This is only available for plate based single cell protocol.

plateMatrix <- plateStatistics(datasetAllCells)

if(!is.na(plateMatrix)){
  plateMatrix$`user ready made`$`LibConc_100_800bp [Characteristic]`=NULL
  l <- htmltools::tagList()
  ## This is the way of plotting plotly figures within a loop in knitr code chunk
  for(plateName in names(plateMatrix)){
    for(colname in names(plateMatrix[[plateName]])){
      p <- heatmapPlate(plateMatrix[[plateName]][[colname]],
                        title=paste(plateName, colname, sep=": "),
                        center=TRUE, log10=TRUE,
                        width = 500*(1 + sqrt(5))/2, height = 500)
      ## tagList ignores chunk optiosn for figrue size.
      ## control the size in plotly.
      l[[paste(plateName, colname, sep=": ")]] <- as_widget(p)
    }
  }
  l
}

Gene Detection Rates

scePC = sce
scePC = scePC[rowData(scePC)$biotypes == "protein_coding", ]
widthOffset = 200
logWidthBreaks = c(9.5, 10.5, 11.5)
gcBreaks = c(0.42, 0.48, 0.53, 0.57, 0.62)
refWidthBin = "(9.5 - 10.5]" 
refGcBin = "(0.53 - 0.57]"
  ## define the gene strata
logWidth = log2(rowData(scePC)$featWidth + widthOffset)
lwClasses = ezCut(logWidth, breaks = logWidthBreaks)
gcClasses = ezCut(rowData(scePC)$gc, gcBreaks)
geneBins = tapply((1:nrow(scePC)), list(logWidth=lwClasses, gc=gcClasses), identity)

fracPresentByGene = Matrix::rowMeans(assays(scePC)$counts > param$minReadsPerGene)
fracPresentBinned = ezMatrix(0, rows=rownames(geneBins), cols=colnames(geneBins))
for (i in 1:length(geneBins)){
  fracPresentBinned[i] = mean(fracPresentByGene[geneBins[[i]]])
}

ezHeatmap(fracPresentBinned[nrow(fracPresentBinned):1, ], lim=c(0, 0.2),
          Rowv = FALSE, Colv = FALSE, main="", colors=gray((1:256)/256),
          cexRow=1, cexCol=1, key.xlab="detection rate", key.title="",
          xlab="GC", ylab="log2 gene length", margins=c(8,8))

ezHeatmap(log2((fracPresentBinned[nrow(fracPresentBinned):1, ] + 1e-3) / 
                 fracPresentBinned[refWidthBin, refGcBin]), lim=c(-2, 2),
          Rowv = FALSE, Colv = FALSE, main="",
          colors=getBlueRedScale(),
          cexRow=1, cexCol=1, key.xlab="detection rate", key.title="",
          xlab="GC", ylab="log2 gene length", margins=c(8,8))

binCombs = expand.grid(logWidth=rownames(geneBins), gc=colnames(geneBins))
gcWidthScores = ezMatrix(NA, rows=paste(binCombs$logWidth, binCombs$gc), 
                         cols=colnames(scePC))
for (i in 1:nrow(binCombs)){
  idx = geneBins[[binCombs$logWidth[i], binCombs$gc[i]]]
  gcWidthScores[paste(binCombs$logWidth[i], binCombs$gc[i]), ] = Matrix::colMeans(assays(scePC)$counts[idx, , drop=FALSE] > param$minReadsPerGene)
}
gcWidthScoresRel = log2((gcWidthScores + 0.01) / Matrix::rowMeans(gcWidthScores+ 0.01))

gcWidthScoresRanged <- shrinkToRange(gcWidthScores, c(0, 0.2))
pheatmap(gcWidthScoresRanged[ ,order(Matrix::colSums(assays(scePC)$counts > 3))],
         scale="none",
         legend_breaks = c(0, 0.05, 0.1, 0.15, 0.2),
         legend_labels = c("0", "0.05", "0.1", "0.15", "detection bias\n0.2"),
         color = colorRampPalette(c("black", "white"))(100),
         breaks=seq(from=0, to=0.2, length.out=101),
         cluster_rows=FALSE, cluster_cols=FALSE,
         show_rownames=TRUE, show_colnames = FALSE,
         main="Detection bias of cells over width and gc"
         )

gcWidthScoresRelRanged <- shrinkToRange(gcWidthScoresRel, c(-2, 2))
pheatmap(gcWidthScoresRelRanged[ ,order(Matrix::colSums(assays(scePC)$counts > 3))],
         scale="none",
         legend_breaks = c(-2, -1, 0, 1, 2),
         legend_labels = c("-2", "-1", "0", "1", "detection bias\n2"),
         color = colorRampPalette(c("blue", "white", "red"))(100),
         breaks=seq(from=-2, to=2, length.out=101),
         cluster_rows=TRUE, cluster_cols=FALSE,
         show_rownames=TRUE, show_colnames = FALSE,
         main="Relative detection bias of cells over width and gc"
         )

Picard Metrics

A overview of scRNA-Seq QC metrics from Picard over the plate. For the detais of the metrics, please refer to https://broadinstitute.github.io/picard/picard-metric-definitions.html

This is only available for plate based single cell protocol.

cd <- data.frame(colData(sce), check.names = FALSE)
cd$nGenesDetected <- Matrix::colSums(assays(sce)$counts > 3)

plateMatrix <- plateStatistics(cd,
                               colname=c("PF_READS", "nGenesDetected",
                                         "PCT_RIBOSOMAL_BASES", "PF_READS_ALIGNED",
                                         "PF_MISMATCH_RATE", "PCT_ADAPTER",
                                         "PCT_MRNA_BASES", "MEDIAN_CV_COVERAGE",
                                         "MEDIAN_5PRIME_BIAS", "MEDIAN_3PRIME_BIAS",
                                         "PERCENT_DUPLICATION"))
metricsNonLog <- c("PF_MISMATCH_RATE", "PCT_ADAPTER", "MEDIAN_5PRIME_BIAS",
                   "MEDIAN_3PRIME_BIAS", "PERCENT_DUPLICATION")
plateName <- names(plateMatrix)[1]
l <- htmltools::tagList()
for(plateName in names(plateMatrix)){
  for(colname in setdiff(names(plateMatrix[[plateName]]), 
                         c("PF_READS", "nGenesDetected"))){
    matrix2PlotOrigin <- matrix2Plot <- plateMatrix[[plateName]][[colname]]
    if(colname %in% metricsNonLog){
      ## non-log scale
      med_matrix2Plot <- median(matrix2Plot, na.rm=TRUE)
      range_colour <- sort(c(0, 2*med_matrix2Plot)) ## in case for negative values
      p1 <- heatmapPlate(matrix2PlotOrigin, log10 = FALSE, center = TRUE,
                         title=paste(plateName, colname, sep=": "),
                         colors="RdYlGn")
    }else{
      ## log-scale
      stopifnot(all(matrix2Plot >= 0, na.rm = TRUE)) ## It has to be positive to log
      matrix2Plot[matrix2Plot==0] <- min(0.25 * matrix2Plot[matrix2Plot >0],
                                         na.rm = TRUE)
      matrix2Plot <- log10(matrix2Plot)
      med_matrix2Plot <- median(matrix2Plot, na.rm=TRUE)
      range_colour <- c(med_matrix2Plot-log10(2), med_matrix2Plot+log10(2))
      p1 <- heatmapPlate(matrix2PlotOrigin, log10=TRUE, center = TRUE,
                       title=paste(plateName, colname, sep=": "), 
                       colors="RdYlGn")
    }

    ## To make the scatter plot have the same colourscale as heatmap
    ## There are two options:
    ## 1. fake the data into cd2 with minimal and maximal colourrange.
    ##    Certainly it's not elegent.
    # cd2 <- data.frame(PF_READS=c(-1, -1, c(plateMatrix[[plateName]]$PF_READS)),
    #                   nGenesDetected=c(-1,-1, c(plateMatrix[[plateName]]$nGenesDetected)),
    #                   metric=c(range_colour, c(matrix2Plot)),
    #                   metricOriginal=c(range_colour, c(plateMatrix[[plateName]][[colname]]))
    #                   )
    ## color by metric, plot value by metricOriginal
    ## 2. use the cmin and cmax in scatter plot. The colour of points is right,
    ##   but the color bar is wrong.
    ##   Anyway we only use the color bar from heatmap.
    ## I use option 2.
    cd2 <- data.frame(PF_READS=c(plateMatrix[[plateName]]$PF_READS),
                      nGenesDetected=c(plateMatrix[[plateName]]$nGenesDetected),
                      matrix2Plot=c(matrix2Plot),
                      matrix2PlotOrigin=c(matrix2PlotOrigin),
                      position=apply(expand.grid(rownames(matrix2PlotOrigin),
                                                 colnames(matrix2PlotOrigin)), 
                                     1, paste, collapse=": ")
                      )
    if(colname == "PF_READS_ALIGNED"){
      cd2 <- transform(cd2, matrix2PlotOrigin=matrix2PlotOrigin/PF_READS)
    }
    p2 <- plot_ly(data = cd2, x = ~PF_READS, y = ~nGenesDetected,
                    color = ~matrix2Plot, text=~position,
                    type="scatter", mode = 'markers', colors="RdYlGn",
                    marker=list(cmin=range_colour[1], cmax=range_colour[2])) %>%
            hide_colorbar() 
    p3 <- plot_ly(data = cd2, x = ~PF_READS, y = ~matrix2PlotOrigin,
                  color = ~matrix2Plot, text=~position,
                  type="scatter", mode = 'markers', colors="RdYlGn",
                  marker=list(cmin=range_colour[1], cmax=range_colour[2])) %>%
            hide_colorbar()
    p2 <- p2 %>%
              layout(xaxis=list(type="log"),
                     yaxis=list(type="log", title="# of Genes Detected"))
    if(colname %in% metricsNonLog){
      p3 <- p3 %>%
              layout(xaxis=list(type="log"), yaxis=list(title=colname))
    }else{
      p3 <- p3 %>%
            layout(xaxis=list(type="log"),
                   yaxis=list(type="log", title=colname))
    }
    l[[paste(plateName, colname, sep=": ")]] <- 
      as_widget(subplot(p1, p2, p3, nrows=1, titleX=TRUE, titleY=TRUE) %>%
                    layout(showlegend = FALSE))
  }
}
l

Picard Metrics Summary

This is only available for plate based single cell protocol.

metricsDT <- colData(sce)[, (tail(grep("\\[File\\]", colnames(colData(sce))), 1)+1):ncol(colData(sce))]
datatable(data.frame(metricsDT, check.names = FALSE), filter = 'top',
          caption="scRNASeq QC Metrics")

x <- list(
  title = "Prime bias"
)

p <- plot_ly(alpha = 0.6) %>%
  add_histogram(x = ~colData(sce)$`bias5`, name="5 prime bias") %>%
  add_histogram(x = ~colData(sce)$`bias3`, name="3 prime bias") %>%
  layout(barmode = "overlay", title="Mean prime bias",
         xaxis=x)

p

p2 <- plot_ly(alpha = 0.6) %>%
  add_histogram(x = ~colData(sce)$`MEDIAN_5PRIME_BIAS`, name="5 prime bias") %>%
  add_histogram(x = ~colData(sce)$`MEDIAN_3PRIME_BIAS`, name="3 prime bias") %>%
  layout(barmode = "overlay", title="Median prime bias: Picard",
         xaxis=x)
p2

Data availability

Expression matrix

The raw count matrix and tpm are available here:

# Raw counts
rawCount <- as.matrix(assays(sce)$counts)
rawCount <- as_tibble(rawCount, rownames=paste0(param$featureLevel, "_id"))
rawCount <- bind_cols(gene_name=rowData(sce)$gene_name, rawCount)
write_tsv(rawCount, file="rawCount.txt")
zipped = zipFile("rawCount.txt")
cat("\n")
cat(paste0("[", zipped, "](", zipped, ")"))
cat("\n")
invisible(file.remove(c("rawCount.txt")))

# TPM
tpm <- as.matrix(getTpm(sce))
tpm <- as_tibble(tpm, rownames=paste0(param$featureLevel, "_id"))
tpm <- bind_cols(gene_name=rowData(sce)$gene_name, tpm)
write_tsv(tpm, file="tpm.txt")
zipped = zipFile("tpm.txt")
cat("\n")
cat(paste0("[", zipped, "](", zipped, ")"))
cat("\n")
invisible(file.remove(c("tpm.txt")))

# CPM
cpm <- as.matrix(getCpm(sce))
cpm <- as_tibble(cpm, rownames=paste0(param$featureLevel, "_id"))
cpm <- bind_cols(gene_name=rowData(sce)$gene_name, cpm)
write_tsv(cpm, file="cpm.txt")
zipped = zipFile("cpm.txt")
cat("\n")
cat(paste0("[", zipped, "](", zipped, ")"))
cat("\n")
invisible(file.remove(c("cpm.txt")))

SessionInfo

ezSessionInfo()

uzh/ezRun documentation built on April 14, 2024, 5:09 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

uzh/ezRun
An R meta-package for the analysis of Next Generation Sequencing Data

In uzh/ezRun: An R meta-package for the analysis of Next Generation Sequencing Data

CountQC_Result {.tabset}

Overview

Saturation

Diagnostic plots of cells

QC metrics on the cells

Highly expressed features

Heatmap of reads per cell on the plate

Gene Detection Rates

Picard Metrics

Picard Metrics Summary

Data availability

SessionInfo

R Package Documentation

Browse R Packages

We want your feedback!

uzh/ezRun An R meta-package for the analysis of Next Generation Sequencing Data

In uzh/ezRun: An R meta-package for the analysis of Next Generation Sequencing Data

CountQC_Result {.tabset}

Overview

Saturation

Diagnostic plots of cells

QC metrics on the cells

Highly expressed features

Heatmap of reads per cell on the plate

Gene Detection Rates

Picard Metrics

Picard Metrics Summary

Data availability

SessionInfo

R Package Documentation

Browse R Packages

We want your feedback!

uzh/ezRun
An R meta-package for the analysis of Next Generation Sequencing Data