Started on r format(Sys.time(), "%Y-%m-%d %H:%M:%S")
library(SummarizedExperiment) library(plotly) library(scater) library(ezRun) library(DT) library(htmltools) library(Matrix) library(Seurat) library(pheatmap) library(DropletUtils) library(cowplot) library(tidyverse) ## ---------------------------------------------- ## debug # title: "`r metadata(sce)$param$name`" # sce <- readRDS("/srv/gstore/projects/p3271/SCCountQC_41088_2019-11-12--14-43-47/1834_SCCountQC/sce.rds") debug <- FALSE param <- metadata(sce)$param ## ---------------------------------------------- ## picard metrics evalPicard = switch (toupper(param$scProtocol), "SMART-SEQ2" = TRUE, "10X"=FALSE, NA ) param$minReadsPerGene <- switch(toupper(param$scProtocol), "SMART-SEQ2"=3, "10X"=1) isMito <- toupper(as.character(seqnames(rowRanges(sce)))) %in% toupper(c("chrM", "MT", "M")) ## sceFixName for section "Reads on top 50 genes" sceFixName = sce rownames(sceFixName) <- uniquifyFeatureNames(ID=rowData(sceFixName)$gene_id, names=rowData(sceFixName)$gene_name) sceFixName <- addPerCellQC(sceFixName, subsets=list(Mito=isMito), detection_limit=max(param$minReadsPerGene-1, 0)) sceFixName <- addPerFeatureQC(sceFixName, detection_limit=max(param$minReadsPerGene-1, 0)) ## datasetAllCells for section "Heatmap of reads per cell on the plate" if(toupper(param$scProtocol) == "SMART-SEQ2"){ datasetAllCells <- data.frame(colData(sce), check.names = FALSE, stringsAsFactors=FALSE) }
r ncol(sce)
r median(sceFixName$detected)
r format(median(sceFixName$sum), scientific=FALSE)
r ifelse(is.null(metadata(sce)$mlog), NA, metadata(sce)$mlog["Number of input reads",2])
r ifelse(is.null(metadata(sce)$mlog), NA, metadata(sce)$mlog["% of reads mapped to multiple loci",2])
This plot shows the Median Genes per Cell as a function of downsampled sequencing depth in mean reads per cell, up to the observed sequencing depth. The slope of the curve near the endpoint can be interpreted as an upper bound to the benefit to be gained from increasing the sequencing depth beyond this point.
proportions <- (1:10)/10 medianNrGenesPerCell <- numeric(length(proportions)) for(i in 1:length(proportions)){ set.seed(100) new.counts <- downsampleMatrix(assay(sce, "counts"), prop=proportions[i]) medianNrGenesPerCell[i] <- median(Matrix::colSums(new.counts >= param$minReadsPerGene)) } toPlot <- tibble(x=proportions, y=medianNrGenesPerCell) xax <- list( title = "Proportion of current library size", zeroline = FALSE ) yax <- list( title = "Median Genes per Cell", zeroline = FALSE ) plot_ly(data=toPlot, x=~x, y=~y, type = 'scatter', mode = 'lines') %>% layout(title = "Median Genes per Cell", xaxis = xax, yaxis = yax)
Interactive plot showing the read counts vs the number of expressed genes and mitochondial content for all cells. Each dot is a cell.
p1 <- plotColData(sceFixName, x = "sum", y="detected", colour_by="Batch") + xlab("Read count") + ylab("Number of expressed genes") + theme_minimal_grid() p1 <- ggplotly(p1) p2 <- plotColData(sceFixName, x = "sum", y="subsets_Mito_percent", colour_by="Batch") + xlab("Read count") + ylab("Mitochondrial percent") + theme_minimal_grid() p2 <- ggplotly(p2) subplot(p1, p2, titleX=TRUE, titleY=TRUE, nrows=1) %>% layout(showlegend = FALSE)
Histograms of read counts, number of expressed genes and proportion of reads assigned to mitochondrial genes for all cells
yax <- list( title = "Number of cells" ) p1 <- plot_ly(data=data.frame(colData(sceFixName), check.names = FALSE), x=~sum) %>% layout(xaxis = list(title="Read counts (thousands)"), yaxis = yax) p2 <- plot_ly(data=data.frame(colData(sceFixName), check.names = FALSE), x=~detected) %>% layout(xaxis = list(title="Number of expressed genes"), yaxis = yax) p3 <- plot_ly(data=data.frame(colData(sceFixName), check.names = FALSE), x=~subsets_Mito_percent) %>% layout(xaxis = list(title="Mitochondrial proportion (%)"), yaxis = yax) subplot(p1, p2, p3, titleX=TRUE, titleY=TRUE, nrows=1) %>% layout(showlegend = FALSE) p1 <- plotColData(sceFixName, y="sum", x="Batch") + ylab("Library size") + theme_minimal_grid() p2 <- plotColData(sceFixName, y="detected", x="Batch") + ylab("Number of expressed genes") + theme_minimal_grid() p3 <- plotColData(sceFixName, y="subsets_Mito_percent", x="Batch") + ylab("Mitochondrial percent") + theme_minimal_grid() p <- plot_grid(p1, p2, p3, ncol=3) print(p)
Percentage of total counts assigned to the top 50 (by default) most-expressed features. Each row in the plot below corresponds to a gene, and each bar corresponds to the expression of a gene in a single cell. The circle indicates the median expression of each gene, with which genes are sorted.
We expect to see the "usual suspects", i.e., mitochondrial genes, actin, ribosomal protein, MALAT1. A few spike-in transcripts may also be present here, though if all of the spike-ins are in the top 50, it suggests that too much spike-in RNA was added. A large number of pseudo-genes or predicted genes may indicate problems with alignment.
The feature control is mitochondrial gene.
plotHighestExprs(sceFixName, n=50, exprs_values = "counts")
Sequenced reads on the plate layout.The heatmap values are in $log10$ scale. The colorScale ranges from half of median value to twice of median value.
This is only available for plate based single cell protocol.
plateMatrix <- plateStatistics(datasetAllCells) if(!is.na(plateMatrix)){ plateMatrix$`user ready made`$`LibConc_100_800bp [Characteristic]`=NULL l <- htmltools::tagList() ## This is the way of plotting plotly figures within a loop in knitr code chunk for(plateName in names(plateMatrix)){ for(colname in names(plateMatrix[[plateName]])){ p <- heatmapPlate(plateMatrix[[plateName]][[colname]], title=paste(plateName, colname, sep=": "), center=TRUE, log10=TRUE, width = 500*(1 + sqrt(5))/2, height = 500) ## tagList ignores chunk optiosn for figrue size. ## control the size in plotly. l[[paste(plateName, colname, sep=": ")]] <- as_widget(p) } } l }
scePC = sce scePC = scePC[rowData(scePC)$biotypes == "protein_coding", ] widthOffset = 200 logWidthBreaks = c(9.5, 10.5, 11.5) gcBreaks = c(0.42, 0.48, 0.53, 0.57, 0.62) refWidthBin = "(9.5 - 10.5]" refGcBin = "(0.53 - 0.57]" ## define the gene strata logWidth = log2(rowData(scePC)$featWidth + widthOffset) lwClasses = ezCut(logWidth, breaks = logWidthBreaks) gcClasses = ezCut(rowData(scePC)$gc, gcBreaks) geneBins = tapply((1:nrow(scePC)), list(logWidth=lwClasses, gc=gcClasses), identity) fracPresentByGene = Matrix::rowMeans(assays(scePC)$counts > param$minReadsPerGene) fracPresentBinned = ezMatrix(0, rows=rownames(geneBins), cols=colnames(geneBins)) for (i in 1:length(geneBins)){ fracPresentBinned[i] = mean(fracPresentByGene[geneBins[[i]]]) } ezHeatmap(fracPresentBinned[nrow(fracPresentBinned):1, ], lim=c(0, 0.2), Rowv = FALSE, Colv = FALSE, main="", colors=gray((1:256)/256), cexRow=1, cexCol=1, key.xlab="detection rate", key.title="", xlab="GC", ylab="log2 gene length", margins=c(8,8)) ezHeatmap(log2((fracPresentBinned[nrow(fracPresentBinned):1, ] + 1e-3) / fracPresentBinned[refWidthBin, refGcBin]), lim=c(-2, 2), Rowv = FALSE, Colv = FALSE, main="", colors=getBlueRedScale(), cexRow=1, cexCol=1, key.xlab="detection rate", key.title="", xlab="GC", ylab="log2 gene length", margins=c(8,8))
binCombs = expand.grid(logWidth=rownames(geneBins), gc=colnames(geneBins)) gcWidthScores = ezMatrix(NA, rows=paste(binCombs$logWidth, binCombs$gc), cols=colnames(scePC)) for (i in 1:nrow(binCombs)){ idx = geneBins[[binCombs$logWidth[i], binCombs$gc[i]]] gcWidthScores[paste(binCombs$logWidth[i], binCombs$gc[i]), ] = Matrix::colMeans(assays(scePC)$counts[idx, , drop=FALSE] > param$minReadsPerGene) } gcWidthScoresRel = log2((gcWidthScores + 0.01) / Matrix::rowMeans(gcWidthScores+ 0.01)) gcWidthScoresRanged <- shrinkToRange(gcWidthScores, c(0, 0.2)) pheatmap(gcWidthScoresRanged[ ,order(Matrix::colSums(assays(scePC)$counts > 3))], scale="none", legend_breaks = c(0, 0.05, 0.1, 0.15, 0.2), legend_labels = c("0", "0.05", "0.1", "0.15", "detection bias\n0.2"), color = colorRampPalette(c("black", "white"))(100), breaks=seq(from=0, to=0.2, length.out=101), cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=TRUE, show_colnames = FALSE, main="Detection bias of cells over width and gc" ) gcWidthScoresRelRanged <- shrinkToRange(gcWidthScoresRel, c(-2, 2)) pheatmap(gcWidthScoresRelRanged[ ,order(Matrix::colSums(assays(scePC)$counts > 3))], scale="none", legend_breaks = c(-2, -1, 0, 1, 2), legend_labels = c("-2", "-1", "0", "1", "detection bias\n2"), color = colorRampPalette(c("blue", "white", "red"))(100), breaks=seq(from=-2, to=2, length.out=101), cluster_rows=TRUE, cluster_cols=FALSE, show_rownames=TRUE, show_colnames = FALSE, main="Relative detection bias of cells over width and gc" )
A overview of scRNA-Seq QC metrics from Picard over the plate. For the detais of the metrics, please refer to https://broadinstitute.github.io/picard/picard-metric-definitions.html
This is only available for plate based single cell protocol.
cd <- data.frame(colData(sce), check.names = FALSE) cd$nGenesDetected <- Matrix::colSums(assays(sce)$counts > 3)
plateMatrix <- plateStatistics(cd, colname=c("PF_READS", "nGenesDetected", "PCT_RIBOSOMAL_BASES", "PF_READS_ALIGNED", "PF_MISMATCH_RATE", "PCT_ADAPTER", "PCT_MRNA_BASES", "MEDIAN_CV_COVERAGE", "MEDIAN_5PRIME_BIAS", "MEDIAN_3PRIME_BIAS", "PERCENT_DUPLICATION")) metricsNonLog <- c("PF_MISMATCH_RATE", "PCT_ADAPTER", "MEDIAN_5PRIME_BIAS", "MEDIAN_3PRIME_BIAS", "PERCENT_DUPLICATION") plateName <- names(plateMatrix)[1] l <- htmltools::tagList() for(plateName in names(plateMatrix)){ for(colname in setdiff(names(plateMatrix[[plateName]]), c("PF_READS", "nGenesDetected"))){ matrix2PlotOrigin <- matrix2Plot <- plateMatrix[[plateName]][[colname]] if(colname %in% metricsNonLog){ ## non-log scale med_matrix2Plot <- median(matrix2Plot, na.rm=TRUE) range_colour <- sort(c(0, 2*med_matrix2Plot)) ## in case for negative values p1 <- heatmapPlate(matrix2PlotOrigin, log10 = FALSE, center = TRUE, title=paste(plateName, colname, sep=": "), colors="RdYlGn") }else{ ## log-scale stopifnot(all(matrix2Plot >= 0, na.rm = TRUE)) ## It has to be positive to log matrix2Plot[matrix2Plot==0] <- min(0.25 * matrix2Plot[matrix2Plot >0], na.rm = TRUE) matrix2Plot <- log10(matrix2Plot) med_matrix2Plot <- median(matrix2Plot, na.rm=TRUE) range_colour <- c(med_matrix2Plot-log10(2), med_matrix2Plot+log10(2)) p1 <- heatmapPlate(matrix2PlotOrigin, log10=TRUE, center = TRUE, title=paste(plateName, colname, sep=": "), colors="RdYlGn") } ## To make the scatter plot have the same colourscale as heatmap ## There are two options: ## 1. fake the data into cd2 with minimal and maximal colourrange. ## Certainly it's not elegent. # cd2 <- data.frame(PF_READS=c(-1, -1, c(plateMatrix[[plateName]]$PF_READS)), # nGenesDetected=c(-1,-1, c(plateMatrix[[plateName]]$nGenesDetected)), # metric=c(range_colour, c(matrix2Plot)), # metricOriginal=c(range_colour, c(plateMatrix[[plateName]][[colname]])) # ) ## color by metric, plot value by metricOriginal ## 2. use the cmin and cmax in scatter plot. The colour of points is right, ## but the color bar is wrong. ## Anyway we only use the color bar from heatmap. ## I use option 2. cd2 <- data.frame(PF_READS=c(plateMatrix[[plateName]]$PF_READS), nGenesDetected=c(plateMatrix[[plateName]]$nGenesDetected), matrix2Plot=c(matrix2Plot), matrix2PlotOrigin=c(matrix2PlotOrigin), position=apply(expand.grid(rownames(matrix2PlotOrigin), colnames(matrix2PlotOrigin)), 1, paste, collapse=": ") ) if(colname == "PF_READS_ALIGNED"){ cd2 <- transform(cd2, matrix2PlotOrigin=matrix2PlotOrigin/PF_READS) } p2 <- plot_ly(data = cd2, x = ~PF_READS, y = ~nGenesDetected, color = ~matrix2Plot, text=~position, type="scatter", mode = 'markers', colors="RdYlGn", marker=list(cmin=range_colour[1], cmax=range_colour[2])) %>% hide_colorbar() p3 <- plot_ly(data = cd2, x = ~PF_READS, y = ~matrix2PlotOrigin, color = ~matrix2Plot, text=~position, type="scatter", mode = 'markers', colors="RdYlGn", marker=list(cmin=range_colour[1], cmax=range_colour[2])) %>% hide_colorbar() p2 <- p2 %>% layout(xaxis=list(type="log"), yaxis=list(type="log", title="# of Genes Detected")) if(colname %in% metricsNonLog){ p3 <- p3 %>% layout(xaxis=list(type="log"), yaxis=list(title=colname)) }else{ p3 <- p3 %>% layout(xaxis=list(type="log"), yaxis=list(type="log", title=colname)) } l[[paste(plateName, colname, sep=": ")]] <- as_widget(subplot(p1, p2, p3, nrows=1, titleX=TRUE, titleY=TRUE) %>% layout(showlegend = FALSE)) } } l
This is only available for plate based single cell protocol.
metricsDT <- colData(sce)[, (tail(grep("\\[File\\]", colnames(colData(sce))), 1)+1):ncol(colData(sce))] datatable(data.frame(metricsDT, check.names = FALSE), filter = 'top', caption="scRNASeq QC Metrics")
x <- list( title = "Prime bias" ) p <- plot_ly(alpha = 0.6) %>% add_histogram(x = ~colData(sce)$`bias5`, name="5 prime bias") %>% add_histogram(x = ~colData(sce)$`bias3`, name="3 prime bias") %>% layout(barmode = "overlay", title="Mean prime bias", xaxis=x) p p2 <- plot_ly(alpha = 0.6) %>% add_histogram(x = ~colData(sce)$`MEDIAN_5PRIME_BIAS`, name="5 prime bias") %>% add_histogram(x = ~colData(sce)$`MEDIAN_3PRIME_BIAS`, name="3 prime bias") %>% layout(barmode = "overlay", title="Median prime bias: Picard", xaxis=x) p2
Expression matrix
The raw count matrix and tpm are available here:
# Raw counts rawCount <- as.matrix(assays(sce)$counts) rawCount <- as_tibble(rawCount, rownames=paste0(param$featureLevel, "_id")) rawCount <- bind_cols(gene_name=rowData(sce)$gene_name, rawCount) write_tsv(rawCount, file="rawCount.txt") zipped = zipFile("rawCount.txt") cat("\n") cat(paste0("[", zipped, "](", zipped, ")")) cat("\n") invisible(file.remove(c("rawCount.txt"))) # TPM tpm <- as.matrix(getTpm(sce)) tpm <- as_tibble(tpm, rownames=paste0(param$featureLevel, "_id")) tpm <- bind_cols(gene_name=rowData(sce)$gene_name, tpm) write_tsv(tpm, file="tpm.txt") zipped = zipFile("tpm.txt") cat("\n") cat(paste0("[", zipped, "](", zipped, ")")) cat("\n") invisible(file.remove(c("tpm.txt"))) # CPM cpm <- as.matrix(getCpm(sce)) cpm <- as_tibble(cpm, rownames=paste0(param$featureLevel, "_id")) cpm <- bind_cols(gene_name=rowData(sce)$gene_name, cpm) write_tsv(cpm, file="cpm.txt") zipped = zipFile("cpm.txt") cat("\n") cat(paste0("[", zipped, "](", zipped, ")")) cat("\n") invisible(file.remove(c("cpm.txt")))
ezSessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.