#' Generate and output 10X read alignment data quality metrics
#'
#' Read BAM file generated by Cell Ranger pipeline and output QC metrics
#' including number of aligned reads and reads aligned to an gene.
#'
#' @param bam Paths to input BAM files generated by Cell Ranger pipeline. These
#' files are usually named \emph{"possorted_genome_bam.bam"} in the
#' \emph{"outs"} folder of the top-level project output folders, respectively.
#' @param experiment A character vector of experiment names. Represents the
#' group label for each BAM file, e.g. "patient1, patient2, ...". The
#' length of \code{experiment} equals the number of BAM files to be processed.
#' @param filter Paths to the filtered barcode files. Should be in the same
#' length and order of the input BAM files. These files are named
#' \emph{"barcodes.tsv"} located at
#' \emph{outs/filtered_gene_bc_matrices/<reference_genome>/barcodes.tsv}.
#' @param validCb Path to the cell barcode whitelist file. By default uses the
#' file "737K-august-2016.txt" which is compatible with the v2 chemistry
#' protocol. The file can be inspected by calling
#' \code{data(validCb, package = "scruff")}. If the library is generated
#' using the v1 chemistry protocol, the path to the v1 barcode whitelist file
#' ("737K-april-2014_rc.txt") needs to be provided. For library generated by
#' v3 chemistry protocol, path to "3M-february-2018.txt" is needed.
#' @param tags BAM tags used for collecting QC metrics. Contains
#' non-standard tags locally-defined by Cell Ranger pipeline. Should not be
#' changed in most cases.
#' @param yieldSize The number of records (alignments) to yield when drawing
#' successive subsets from a BAM file, providing the number of successive
#' records to be returned on each yield. This parameter is passed to the
#' \code{yieldSize} argument of the \code{BamFile} function in
#' \code{Rsamtools} package. Default is \strong{1e06}.
#' @param outDir Output directory. The location to write resulting QC table.
#' @param cores Number of cores used for parallelization. Default is
#' \code{max(1, parallelly::availableCores() - 2)}, i.e. the number of
#' available cores minus 2.
#' @return A \link[SingleCellExperiment]{SingleCellExperiment} object. The
#' \code{colData} contains the number of aligned reads
#' (reads_mapped_to_genome) and reads aligned
#' to genes (reads_mapped_to_genes).
#' @examples
#' # first 5000 records in the bam file downloaded from here:
#' # http://sra-download.ncbi.nlm.nih.gov/srapub_files/
#' # SRR5167880_E18_20160930_Neurons_Sample_01.bam
#' # see details here:
#' # https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP096558
#' # and here:
#' # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE93421
#' bamfile10x <- system.file("extdata",
#' "SRR5167880_E18_20160930_Neurons_Sample_01_5000.bam",
#' package = "scruff")
#'
#' # library(TENxBrainData)
#' # library(data.table)
#' # tenx <- TENxBrainData()
#' # # get filtered barcodes for sample 01
#' # filteredBcIndex <- tstrsplit(colData(tenx)[, "Barcode"], "-")[[2]] == 1
#' # filteredBc <- colData(tenx)[filteredBcIndex, ][["Barcode"]]
#'
#' filteredBc <- system.file("extdata",
#' "SRR5167880_E18_20160930_Neurons_Sample_01_filtered_barcode.tsv",
#' package = "scruff")
#' # QC results are saved to current working directory
#' sce <- tenxBamqc(bam = bamfile10x,
#' experiment = "Neurons_Sample_01",
#' filter = filteredBc)
#' sce
#' @import Rsamtools
#' @export
tenxBamqc <- function(bam,
experiment,
filter,
validCb = NA,
tags = c("NH", "GX", "CB", "MM"),
yieldSize = 1000000,
outDir = "./",
cores = max(1, parallelly::availableCores() - 2)) {
.checkCores(cores)
smc <- parallelly::supportsMulticore()
print(match.call(expand.dots = TRUE))
if (length(experiment) != length(bam)) {
stop("'bam' and 'experiment' have unequal lengths!")
}
if (length(experiment) != length(filter)) {
stop("'bam' and 'filter' have unequal lengths!")
}
if (is.na(validCb)) {
cbfile <- NA
utils::data(validCb, package = "scruff", envir = environment())
} else {
cbfile <- validCb
validCb <- data.table::fread(validCb, header = FALSE)
}
# cbtop10000 for checking cell barcodes
utils::data(cbtop10000, package = "scruff", envir = environment())
message(Sys.time(), " Start collecting QC metrics for BAM files ",
paste(bam, collapse = " "))
resDt <- data.table::data.table(cell_barcode = character(),
reads_mapped_to_genome = integer(),
reads_mapped_to_genes = integer(),
experiment = character())
for (i in seq_len(length(bam))) {
message(Sys.time(), " Processing file ", bam[i])
.tenxCheckCellBarcodes(bam[i], cbfile, validCb)
bamfl <- Rsamtools::BamFile(bam[i], yieldSize = yieldSize)
param <- Rsamtools::ScanBamParam(tag = tags)
# initialize valid cell barcodes
vcb <- data.table::as.data.table(validCb)
colnames(vcb)[1] <- "cell_barcode"
#vcb[, cell_barcode := paste0(cell_barcode, "-1")]
vcb[, reads_mapped_to_genome := 0]
vcb[, reads_mapped_to_genes := 0]
open(bamfl)
# Initialize iterator
.bamIter <- .bamIterator(bamfl, param)
if (!isTRUE(smc)) {
qcL <- BiocParallel::bpiterate(
ITER = .bamIter,
FUN = .tenxBamqcUnit,
vcb = vcb,
BPPARAM = BiocParallel::SnowParam(
workers = cores)
)
} else {
qcL <- BiocParallel::bpiterate(
ITER = .bamIter,
FUN = .tenxBamqcUnit,
vcb = vcb,
BPPARAM = BiocParallel::MulticoreParam(
workers = cores)
)
}
close(bamfl)
qcDt <- base::Reduce("+", qcL)
qcDt <- data.table::as.data.table(qcDt, keep.rownames = TRUE)
colnames(qcDt)[1] <- "cell_barcode"
qcDt <- qcDt[reads_mapped_to_genome != 0, ]
if (nrow(qcDt) == 0) {
stop("No reads are aligned to the genome in file",
bam[i])
}
filterDt <- data.table::fread(filter[i], header = FALSE)
colnames(filterDt)[1] <- "cb"
filterDt[, cb := data.table::tstrsplit(cb, "-")[[1]]]
qcDt <- qcDt[cell_barcode %in% filterDt[, cb], ]
qcDt[, experiment := experiment[i]]
resDt <- rbind(resDt, qcDt)
}
resDt[, number_of_cells := TRUE]
message(Sys.time(), " Finished collecting QC metrics")
message(Sys.time(), " Write QC results to output directory")
data.table::fwrite(resDt,
sep = "\t",
file = file.path(outDir, paste0(
format(Sys.time(), "%Y%m%d_%H%M%S"), "_",
"_10x_bamqc_filtered.tsv")))
scruffsce <- SingleCellExperiment::SingleCellExperiment(
matrix(ncol = nrow(resDt)))
summaryDF <- S4Vectors::DataFrame(resDt,
row.names = paste(resDt[, cell_barcode],
resDt[, experiment],
sep = "-"))
SummarizedExperiment::colData(scruffsce) <- summaryDF
message(Sys.time(),
" ... Save SingleCellExperiment object to ",
file.path(outDir, paste0(
format(Sys.time(),
"%Y%m%d_%H%M%S"), "_10X_QC_sce.rda"
))
)
save(scruffsce, file = file.path(outDir, paste0(
format(Sys.time(), "%Y%m%d_%H%M%S"), "_10X_QC_sce.rda"
)))
return(scruffsce)
}
.tenxBamqcUnit <- function(bamGA, vcb, ...) {
bamdt <- data.table::data.table(readname = base::names(bamGA),
chr = as.vector(GenomicAlignments::seqnames(bamGA)),
strand = S4Vectors::decode(BiocGenerics::strand(bamGA)),
readstart = BiocGenerics::start(bamGA),
readend = BiocGenerics::end(bamGA),
NH = S4Vectors::mcols(bamGA)$NH,
GX = S4Vectors::mcols(bamGA)$GX,
CB = S4Vectors::mcols(bamGA)$CB,
MM = S4Vectors::mcols(bamGA)$MM)
bamdt[, CB := data.table::tstrsplit(CB, "-")[[1]]]
genomeReadsDt <- bamdt[, .(readname, CB)]
# number of aligned reads, including multimappers
genomeReads <- base::table(genomeReadsDt[, CB])
# gene reads (uniquely mapped reads)
# use reads (MM = 1 | MAPQ = 255)
geneReadsDt <- bamdt[(NH == 1 & !is.na(GX) | MM == 1) & !is.na(CB),
.(readname, CB, GX)]
# if ";" is in GX then this read is not uniquely mapped
geneReadsDt <- geneReadsDt[!base::grepl(";", GX), ]
geneReads <- base::table(geneReadsDt[, CB])
qcdt <- data.table::copy(vcb)
# sanity check
if (nrow(qcdt[cell_barcode %in% genomeReadsDt[, CB], ]) !=
length(unique(genomeReadsDt[!is.na(CB), CB]))) {
stop("Some of the corrected cell barcodes in the BAM file do not exist",
" in barcode whitelist. Maybe you are using an older version of ",
"cell barcode whitelist with 14-base-long barcodes?")
}
qcdt[cell_barcode %in% genomeReadsDt[, CB],
reads_mapped_to_genome := as.numeric(genomeReads[cell_barcode])]
qcdt[cell_barcode %in% geneReadsDt[, CB],
reads_mapped_to_genes := as.numeric(geneReads[cell_barcode])]
resmatrix <- base::as.matrix(qcdt, rownames = 1)
rownames(resmatrix) <- qcdt[, cell_barcode]
return(resmatrix)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.