scruff: Single Cell RNA-Seq UMI Filtering Facilitator (scruff)

Documented in tenxBamqc

#' Generate and output 10X read alignment data quality metrics
#'
#' Read BAM file generated by Cell Ranger pipeline and output QC metrics
#'  including number of aligned reads and reads aligned to an gene.
#'
#' @param bam Paths to input BAM files generated by Cell Ranger pipeline. These
#'  files are usually named \emph{"possorted_genome_bam.bam"} in the
#'   \emph{"outs"} folder of the top-level project output folders, respectively.
#' @param experiment A character vector of experiment names. Represents the
#'  group label for each BAM file, e.g. "patient1, patient2, ...". The
#'  length of \code{experiment} equals the number of BAM files to be processed.
#' @param filter Paths to the filtered barcode files. Should be in the same
#'  length and order of the input BAM files. These files are named
#'  \emph{"barcodes.tsv"} located at
#'  \emph{outs/filtered_gene_bc_matrices/<reference_genome>/barcodes.tsv}.
#' @param validCb Path to the cell barcode whitelist file. By default uses the
#'  file "737K-august-2016.txt" which is compatible with the v2 chemistry
#'  protocol. The file can be inspected by calling
#'  \code{data(validCb, package = "scruff")}. If the library is generated
#'  using the v1 chemistry protocol, the path to the v1 barcode whitelist file
#'  ("737K-april-2014_rc.txt") needs to be provided. For library generated by
#'  v3 chemistry protocol, path to "3M-february-2018.txt" is needed.
#' @param tags BAM tags used for collecting QC metrics. Contains
#'  non-standard tags locally-defined by Cell Ranger pipeline. Should not be
#'  changed in most cases.
#' @param yieldSize The number of records (alignments) to yield when drawing
#'  successive subsets from a BAM file, providing the number of successive
#'  records to be returned on each yield. This parameter is passed to the
#'  \code{yieldSize} argument of the \code{BamFile} function in
#'  \code{Rsamtools} package. Default is \strong{1e06}.
#' @param outDir Output directory. The location to write resulting QC table.
#' @param cores Number of cores used for parallelization. Default is
#'  \code{max(1, parallelly::availableCores() - 2)}, i.e. the number of
#'  available cores minus 2.
#' @return A \link[SingleCellExperiment]{SingleCellExperiment} object. The
#'  \code{colData} contains the number of aligned reads
#'  (reads_mapped_to_genome) and reads aligned
#'  to genes (reads_mapped_to_genes).
#' @examples
#' # first 5000 records in the bam file downloaded from here:
#' # http://sra-download.ncbi.nlm.nih.gov/srapub_files/
#' # SRR5167880_E18_20160930_Neurons_Sample_01.bam
#' # see details here:
#' # https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP096558
#' # and here:
#' # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE93421
#' bamfile10x <- system.file("extdata",
#'     "SRR5167880_E18_20160930_Neurons_Sample_01_5000.bam",
#'     package = "scruff")
#'
#' # library(TENxBrainData)
#' # library(data.table)
#' # tenx <- TENxBrainData()
#' # # get filtered barcodes for sample 01
#' # filteredBcIndex <- tstrsplit(colData(tenx)[, "Barcode"], "-")[[2]] == 1
#' # filteredBc <- colData(tenx)[filteredBcIndex, ][["Barcode"]]
#'
#' filteredBc <- system.file("extdata",
#'     "SRR5167880_E18_20160930_Neurons_Sample_01_filtered_barcode.tsv",
#'     package = "scruff")
#' # QC results are saved to current working directory
#' sce <- tenxBamqc(bam = bamfile10x,
#'     experiment = "Neurons_Sample_01",
#'     filter = filteredBc)
#' sce
#' @import Rsamtools
#' @export
tenxBamqc <- function(bam,
    experiment,
    filter,
    validCb = NA,
    tags = c("NH", "GX", "CB", "MM"),
    yieldSize = 1000000,
    outDir = "./",
    cores = max(1, parallelly::availableCores() - 2)) {

    .checkCores(cores)

    smc <- parallelly::supportsMulticore()

    print(match.call(expand.dots = TRUE))

    if (length(experiment) != length(bam)) {
        stop("'bam' and 'experiment' have unequal lengths!")
    }

    if (length(experiment) != length(filter)) {
        stop("'bam' and 'filter' have unequal lengths!")
    }

    if (is.na(validCb)) {
        cbfile <- NA
        utils::data(validCb, package = "scruff", envir = environment())
    } else {
        cbfile <- validCb
        validCb <- data.table::fread(validCb, header = FALSE)
    }

    # cbtop10000 for checking cell barcodes
    utils::data(cbtop10000, package = "scruff", envir = environment())

    message(Sys.time(), " Start collecting QC metrics for BAM files ",
        paste(bam, collapse = " "))

    resDt <- data.table::data.table(cell_barcode = character(),
        reads_mapped_to_genome = integer(),
        reads_mapped_to_genes = integer(),
        experiment = character())

    for (i in seq_len(length(bam))) {

        message(Sys.time(), " Processing file ", bam[i])

        .tenxCheckCellBarcodes(bam[i], cbfile, validCb)

        bamfl <- Rsamtools::BamFile(bam[i], yieldSize = yieldSize)

        param <- Rsamtools::ScanBamParam(tag = tags)

        # initialize valid cell barcodes
        vcb <- data.table::as.data.table(validCb)

        colnames(vcb)[1] <- "cell_barcode"

        #vcb[, cell_barcode := paste0(cell_barcode, "-1")]
        vcb[, reads_mapped_to_genome := 0]
        vcb[, reads_mapped_to_genes := 0]

        open(bamfl)

        # Initialize iterator
        .bamIter <- .bamIterator(bamfl, param)

        if (!isTRUE(smc)) {
            qcL <- BiocParallel::bpiterate(
                ITER = .bamIter,
                FUN = .tenxBamqcUnit,
                vcb = vcb,
                BPPARAM = BiocParallel::SnowParam(
                    workers = cores)
            )
        } else {
            qcL <- BiocParallel::bpiterate(
                ITER = .bamIter,
                FUN = .tenxBamqcUnit,
                vcb = vcb,
                BPPARAM = BiocParallel::MulticoreParam(
                    workers = cores)
            )
        }

        close(bamfl)
        qcDt <- base::Reduce("+", qcL)
        qcDt <- data.table::as.data.table(qcDt, keep.rownames = TRUE)
        colnames(qcDt)[1] <- "cell_barcode"
        qcDt <- qcDt[reads_mapped_to_genome != 0, ]

        if (nrow(qcDt) == 0) {
            stop("No reads are aligned to the genome in file",
                bam[i])
        }

        filterDt <- data.table::fread(filter[i], header = FALSE)
        colnames(filterDt)[1] <- "cb"
        filterDt[, cb := data.table::tstrsplit(cb, "-")[[1]]]
        qcDt <- qcDt[cell_barcode %in% filterDt[, cb], ]
        qcDt[, experiment := experiment[i]]
        resDt <- rbind(resDt, qcDt)
    }

    resDt[, number_of_cells := TRUE]

    message(Sys.time(), " Finished collecting QC metrics")
    message(Sys.time(), " Write QC results to output directory")

    data.table::fwrite(resDt,
        sep = "\t",
        file = file.path(outDir, paste0(
            format(Sys.time(), "%Y%m%d_%H%M%S"), "_",
            "_10x_bamqc_filtered.tsv")))

    scruffsce <- SingleCellExperiment::SingleCellExperiment(
        matrix(ncol = nrow(resDt)))

    summaryDF <- S4Vectors::DataFrame(resDt,
        row.names = paste(resDt[, cell_barcode],
            resDt[, experiment],
            sep = "-"))

    SummarizedExperiment::colData(scruffsce) <- summaryDF

    message(Sys.time(),
        " ... Save SingleCellExperiment object to ",
        file.path(outDir, paste0(
            format(Sys.time(),
                "%Y%m%d_%H%M%S"), "_10X_QC_sce.rda"
        ))
    )

    save(scruffsce, file = file.path(outDir, paste0(
        format(Sys.time(), "%Y%m%d_%H%M%S"), "_10X_QC_sce.rda"
    )))

    return(scruffsce)
}


.tenxBamqcUnit <- function(bamGA, vcb, ...) {

    bamdt <- data.table::data.table(readname = base::names(bamGA),
        chr = as.vector(GenomicAlignments::seqnames(bamGA)),
        strand = S4Vectors::decode(BiocGenerics::strand(bamGA)),
        readstart = BiocGenerics::start(bamGA),
        readend = BiocGenerics::end(bamGA),
        NH = S4Vectors::mcols(bamGA)$NH,
        GX = S4Vectors::mcols(bamGA)$GX,
        CB = S4Vectors::mcols(bamGA)$CB,
        MM = S4Vectors::mcols(bamGA)$MM)

    bamdt[, CB := data.table::tstrsplit(CB, "-")[[1]]]

    genomeReadsDt <- bamdt[, .(readname, CB)]

    # number of aligned reads, including multimappers
    genomeReads <- base::table(genomeReadsDt[, CB])

    # gene reads (uniquely mapped reads)
    # use reads (MM = 1 | MAPQ = 255)
    geneReadsDt <- bamdt[(NH == 1 & !is.na(GX) | MM == 1) & !is.na(CB),
        .(readname, CB, GX)]

    # if ";" is in GX then this read is not uniquely mapped
    geneReadsDt <- geneReadsDt[!base::grepl(";", GX), ]

    geneReads <- base::table(geneReadsDt[, CB])

    qcdt <- data.table::copy(vcb)

    # sanity check
    if (nrow(qcdt[cell_barcode %in% genomeReadsDt[, CB], ]) !=
            length(unique(genomeReadsDt[!is.na(CB), CB]))) {
        stop("Some of the corrected cell barcodes in the BAM file do not exist",
            " in barcode whitelist. Maybe you are using an older version of ",
            "cell barcode whitelist with 14-base-long barcodes?")
    }

    qcdt[cell_barcode %in% genomeReadsDt[, CB],
        reads_mapped_to_genome := as.numeric(genomeReads[cell_barcode])]

    qcdt[cell_barcode %in% geneReadsDt[, CB],
        reads_mapped_to_genes := as.numeric(geneReads[cell_barcode])]

    resmatrix <- base::as.matrix(qcdt, rownames = 1)
    rownames(resmatrix) <- qcdt[, cell_barcode]
    return(resmatrix)
}