scruff: Single Cell RNA-Seq UMI Filtering Facilitator (scruff)

Documented in tenxBamqc

#' Generate and output 10X read alignment data quality metrics
#'
#' Read BAM file generated by Cell Ranger pipeline and output QC metrics
#'  including number of aligned reads and reads aligned to an gene.
#'
#' @param bam Paths to input BAM files generated by Cell Ranger pipeline. These
#'  files are usually named \emph{"possorted_genome_bam.bam"} in the
#'   \emph{"outs"} folder of the top-level project output folders, respectively.
#' @param experiment A character vector of experiment names. Represents the
#'  group label for each BAM file, e.g. "patient1, patient2, ...". The
#'  length of \code{experiment} equals the number of BAM files to be processed.
#' @param filter Paths to the filtered barcode files. Should be in the same
#'  length and order of the input BAM files. These files are named
#'  \emph{"barcodes.tsv"} located at
#'  \emph{outs/filtered_gene_bc_matrices/<reference_genome>/}.
#' @param validCb Path to the cell barcode whitelist file. By default uses the
#'  file "737K-august-2016.txt" which is compatible with the v2 chemistry
#'  protocol. The file can be inspected by calling
#'  \code{data(validCb, package = "scruff")}. If the library is generated
#'  using the v1 chemistry protocol, the path to the v1 barcode whitelist file
#'  ("737K-april-2014_rc.txt") needs to be provided.
#' @param tags BAM tags used for collecting QC metrics. Contains
#'  non-standard tags locally-defined by Cell Ranger pipeline. Should not be
#'  changed in most cases.
#' @param yieldSize The number of records (alignments) to yield when drawing
#'  successive subsets from a BAM file, providing the number of successive 
#'  records to be returned on each yield. This parameter is passed to the
#'  \code{yieldSize} argument of the \link[Rsamtools]{BamFile} function in
#'  \link[Rsamtools]{Rsamtools} package. Default is \strong{1e06}.
#' @param outDir Output directory. The location to write resulting QC table.
#' @param cores Number of cores used for parallelization. Default is
#'  \code{max(1, parallel::detectCores() - 2)}, i.e. the number of available
#'  cores minus 2.
#' @return ggplot object showing the number of aligned reads and reads aligned
#'  to an gene.
#' @examples
#' # first 5000 records in the bam file downloaded from here:
#' # http://sra-download.ncbi.nlm.nih.gov/srapub_files/
#' # SRR5167880_E18_20160930_Neurons_Sample_01.bam
#' # see details here:
#' # https://trace.ncbi.nlm.nih.gov/Traces/sra/?study=SRP096558
#' # and here:
#' # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE93421
#' bamfile10x <- system.file("extdata",
#'     "SRR5167880_E18_20160930_Neurons_Sample_01_5000.bam",
#'     package = "scruff")
#' 
#' # library(TENxBrainData)
#' # library(data.table)
#' # tenx <- TENxBrainData()
#' # # get filtered barcodes for sample 01
#' # filteredBcIndex <- tstrsplit(colData(tenx)[, "Barcode"], "-")[[2]] == 1
#' # filteredBc <- colData(tenx)[filteredBcIndex, ][["Barcode"]]
#' 
#' filteredBc <- system.file("extdata",
#'     "SRR5167880_E18_20160930_Neurons_Sample_01_filtered_barcode.tsv",
#'     package = "scruff")
#' # QC results are saved to current working directory
#' qcDt <- tenxBamqc(bam = bamfile10x,
#'     experiment = "Neurons_Sample_01",
#'     filter = filteredBc)
#' qcDt
#' @import Rsamtools
#' @export
tenxBamqc <- function(bam,
    experiment,
    filter,
    validCb = NA,
    tags = c("NH", "GX", "CB", "MM"),
    yieldSize = 1000000,
    outDir = "./",
    cores = max(1, parallel::detectCores() - 2)) {
    
    .checkCores(cores)
    
    isWindows <- .Platform$OS.type == "windows"
    
    print(match.call(expand.dots = TRUE))
    
    if (length(experiment) != length(bam)) {
        stop("'bam' and 'experiment' have unequal lengths!")
    }
    
    if (length(experiment) != length(filter)) {
        stop("'bam' and 'filter' have unequal lengths!")
    }
    
    message(Sys.time(), " Start collecting QC metrics for BAM files ",
        paste(bam, collapse = " "))
    
    if (is.na(validCb)) {
        utils::data(validCb, package = "scruff", envir = environment())
    } else {
        validCb <- data.table::fread(validCb, header = FALSE)
    }
    
    
    resDt <- data.table::data.table(cell_barcode = character(),
        reads_mapped_to_genome = integer(),
        reads_mapped_to_genes = integer(),
        experiment = character())
    
    for (i in seq_len(length(bam))) {
        
        message(Sys.time(), " Processing file ", bam[i])
        
        bamfl <- Rsamtools::BamFile(bam[i], yieldSize = yieldSize)
        
        param <- Rsamtools::ScanBamParam(tag = tags)
        
        # initialize valid cell barcodes
        vcb <- data.table::as.data.table(validCb)
        
        colnames(vcb)[1] <- "cell_barcode"
        
        #vcb[, cell_barcode := paste0(cell_barcode, "-1")]
        vcb[, reads_mapped_to_genome := 0]
        vcb[, reads_mapped_to_genes := 0]
        
        open(bamfl)
        
        # Initialize iterator
        .bamIter <- .bamIterator(bamfl, param)
        
        if (isWindows) {
            qcL <- BiocParallel::bpiterate(
                ITER = .bamIter,
                FUN = .tenxBamqcUnit,
                vcb = vcb,
                BPPARAM = BiocParallel::SnowParam(
                    workers = cores)
            )
        } else {
            qcL <- BiocParallel::bpiterate(
                ITER = .bamIter,
                FUN = .tenxBamqcUnit,
                vcb = vcb,
                BPPARAM = BiocParallel::MulticoreParam(
                    workers = cores)
            )
        }
        
        close(bamfl)
        qcDt <- base::Reduce("+", qcL)
        qcDt <- data.table::as.data.table(qcDt, keep.rownames = TRUE)
        colnames(qcDt)[1] <- "cell_barcode"
        qcDt <- qcDt[reads_mapped_to_genome != 0, ]
        
        if (nrow(qcDt) == 0) {
            stop("No reads are aligned to the genome in file",
                bam[i])
        }
        
        filterDt <- data.table::fread(filter[i], header = FALSE)
        colnames(filterDt)[1] <- "cb"
        filterDt[, cb := data.table::tstrsplit(cb, "-")[[1]]]
        qcDt <- qcDt[cell_barcode %in% filterDt[, cb], ]
        qcDt[, experiment := experiment[i]]
        resDt <- rbind(resDt, qcDt)
    }
    
    resDt[, number_of_cells := TRUE]
    
    message(Sys.time(), " Finished collecting QC metrics")
    message(Sys.time(), " Write QC results to output directory")
    
    data.table::fwrite(resDt,
        sep = "\t",
        file = file.path(outDir, paste0(
            format(Sys.time(), "%Y%m%d_%H%M%S"), "_",
            "_10x_bamqc_filtered.tsv")))
    
    scruffsce <- SingleCellExperiment::SingleCellExperiment(
        matrix(ncol = nrow(resDt)))
    
    summaryDF <- S4Vectors::DataFrame(resDt,
        row.names = paste(resDt[, cell_barcode],
            resDt[, experiment],
            sep = "-"))
    
    SummarizedExperiment::colData(scruffsce) <- summaryDF
    
    message(Sys.time(),
        " ... Save SingleCellExperiment object to ",
        file.path(outDir, paste0(
            format(Sys.time(),
                "%Y%m%d_%H%M%S"), "_10X_QC_sce.rda"
        ))
    )
    
    save(scruffsce, file = file.path(outDir, paste0(
        format(Sys.time(), "%Y%m%d_%H%M%S"), "_10X_QC_sce.rda"
    )))
    
    return (scruffsce)
}


.tenxBamqcUnit <- function(bamGA, vcb, ...) {
    
    bamdt <- data.table::data.table(readname = base::names(bamGA),
        chr = as.vector(GenomicAlignments::seqnames(bamGA)),
        strand = S4Vectors::decode(BiocGenerics::strand(bamGA)),
        readstart = BiocGenerics::start(bamGA),
        readend = BiocGenerics::end(bamGA),
        NH = S4Vectors::mcols(bamGA)$NH,
        GX = S4Vectors::mcols(bamGA)$GX,
        CB = S4Vectors::mcols(bamGA)$CB,
        MM = S4Vectors::mcols(bamGA)$MM)
    
    bamdt[, CB := data.table::tstrsplit(CB, "-")[[1]]]
    
    genomeReadsDt <- bamdt[, .(readname, CB)]
    
    # number of aligned reads, including multimappers
    genomeReads <- base::table(genomeReadsDt[, CB])
    
    # gene reads (uniquely mapped reads)
    # use reads (MM = 1 | MAPQ = 255)
    geneReadsDt <- bamdt[(NH == 1 & !is.na(GX) | MM == 1) & !is.na(CB),
        .(readname, CB, GX)]
    
    # if ";" is in GX then this read is not uniquely mapped
    geneReadsDt <- geneReadsDt[!base::grepl(";", GX), ]
    
    geneReads <- base::table(geneReadsDt[, CB])
    
    qcdt <- data.table::copy(vcb)
    
    # sanity check
    if (nrow(qcdt[cell_barcode %in% genomeReadsDt[, CB], ]) !=
            length(unique(genomeReadsDt[!is.na(CB), CB]))) {
        stop("Some of the corrected cell barcodes in the BAM file do not exist",
            " in barcode whitelist. Maybe you are using an older version of ",
            "cell barcode whitelist with 14-base-long barcodes?")
    }
    
    qcdt[cell_barcode %in% genomeReadsDt[, CB],
        reads_mapped_to_genome := as.numeric(genomeReads[cell_barcode])]
    
    qcdt[cell_barcode %in% geneReadsDt[, CB],
        reads_mapped_to_genes := as.numeric(geneReads[cell_barcode])]
    
    resmatrix <- base::as.matrix(qcdt, rownames = 1)
    rownames(resmatrix) <- qcdt[, cell_barcode]
    return (resmatrix)
}