scruff: Single Cell RNA-Seq UMI Filtering Facilitator (scruff)

Documented in countUMI

#' Count the number of UMIs for each gene and output count matrix
#'
#' Count unique \emph{UMI:gene} pairs for single cell RNA-sequencing alignment
#'  files. Write resulting count matrix to output directory. Columns are
#'  samples (cells) and rows are gene IDs. The input sequence alignment files
#'  must be generated using FASTQ files generated by the \code{demultiplex}
#'  function in scruff package. Return a SingleCellExperiment object containing
#'  the count matrix, cell and gene annotations, and all QC metrics.
#'
#' @param sce A \code{SingleCellExperiment} object of which the \code{colData}
#'  slot contains the \strong{alignment_path} column with paths to input
#'  cell-specific sequence alignment files (BAM or SAM format).
#' @param reference Path to the reference GTF file. The TxDb object of the GTF
#'  file will be generated and saved in the current working directory with
#'  ".sqlite" suffix.
#' @param umiEdit Maximally allowed Hamming distance for UMI correction. For
#'  read alignments in each gene, by comparing to a more abundant UMI with more
#'  reads, UMIs having fewer reads and with mismatches equal or fewer than
#'  \code{umiEdit} will be assigned a corrected UMI (the UMI with more reads).
#'  Default is 0, meaning no UMI correction is performed. Doing UMI correction
#'  will decrease the number of transcripts per gene.
#' @param format Format of input sequence alignment files. \strong{"BAM"} or
#'  \strong{"SAM"}. Default is \strong{"BAM"}.
#' @param outDir Output directory for UMI counting results. UMI corrected count
#'  matrix will be stored in this directory. Default is \code{"./Count"}.
#' @param cellPerWell Number of cells per well. Can be an integer (e.g. 1)
#'  indicating the number of cells in each well or an vector with length equal
#'  to the total number of cells in the input alignment files specifying the
#'  number of cells in each file. Default is 1.
#' @param cores Number of cores used for parallelization. Default is
#'  \code{max(1, parallelly::availableCores() - 2)}, i.e. the number of
#'  available cores minus 2.
#' @param outputPrefix Prefix for expression table filename. Default is
#'  \code{"countUMI"}.
#' @param verbose Print log messages. Useful for debugging. Default to
#'  \strong{FALSE}.
#' @param logfilePrefix Prefix for log file. Default is current date and time
#'  in the format of \code{format(Sys.time(), "\%Y\%m\%d_\%H\%M\%S")}.
#' @return A \strong{SingleCellExperiment} object.
#' @examples
#' \dontrun{
#' data(barcodeExample, package = "scruff")
#' # The SingleCellExperiment object returned by alignRsubread function and the
#' # alignment BAM files are required for running countUMI function
#' # First demultiplex example FASTQ files
#' fastqs <- list.files(system.file("extdata", package = "scruff"),
#'     pattern = "\\.fastq\\.gz", full.names = TRUE)
#'
#' de <- demultiplex(
#'     project = "example",
#'     experiment = c("1h1"),
#'     lane = c("L001"),
#'     read1Path = c(fastqs[1]),
#'     read2Path = c(fastqs[2]),
#'     barcodeExample,
#'     bcStart = 1,
#'     bcStop = 8,
#'     umiStart = 9,
#'     umiStop = 12,
#'     keep = 75,
#'     overwrite = TRUE)
#'
#' # Alignment
#' library(Rsubread)
#' # Create index files for GRCm38_MT.
#' fasta <- system.file("extdata", "GRCm38_MT.fa", package = "scruff")
#' # Specify the basename for Rsubread index
#' indexBase <- "GRCm38_MT"
#' buildindex(basename = indexBase, reference = fasta, indexSplit = FALSE)
#'
#' al <- alignRsubread(de, indexBase, overwrite = TRUE)
#'
#' # Counting
#' gtf <- system.file("extdata", "GRCm38_MT.gtf", package = "scruff")
#' sce = countUMI(al, gtf, cellPerWell=c(rep(1, 46), 0, 0))
#' }
#'
#' # or use the built-in SingleCellExperiment object generated using
#' # example dataset (see ?sceExample)
#' data(sceExample, package = "scruff")
#' @import data.table rtracklayer
#' @importFrom GenomicFeatures exonsBy
#' @importFrom txdbmaker makeTxDbFromGFF
#' @rawNamespace import(GenomicAlignments, except = c(second, last, first))
#' @export
countUMI <- function(sce,
    reference,
    umiEdit = 0,
    format = "BAM",
    outDir = "./Count",
    cellPerWell = 1,
    cores = max(1, parallelly::availableCores() - 2),
    outputPrefix = "countUMI",
    verbose = FALSE,
    logfilePrefix = format(Sys.time(), "%Y%m%d_%H%M%S")) {

    .checkCores(cores)
    geneAnnotation <- .getGeneAnnotation(reference)

    seqid <- "seqid"
    if (!seqid %in% colnames(geneAnnotation)) {
        seqid <- "seqnames"
    }

    gtfcolnames <- c("gene_id",
        "gene_name",
        "gene_biotype",
        seqid,
        "start",
        "end",
        "strand",
        "source")

    .checkGTF(geneAnnotation, gtfcolnames)

    smc <- parallelly::supportsMulticore()

    message(Sys.time(), " Start UMI counting ...")
    print(match.call(expand.dots = TRUE))

    if (ncol(sce) != length(cellPerWell) & length(cellPerWell) != 1) {
        stop("The length of cellPerWell is not equal to the column number",
            " of the SCE object.")
    }

    alignmentFilePaths <- SummarizedExperiment::colData(sce)$alignment_path

    if (!all(file.exists(alignmentFilePaths))) {
        stop("Partial or all alignment files nonexistent. ",
            "Please check paths are correct.\n",
            alignmentFilePaths)
    }

    logfile <- paste0(logfilePrefix, "_countUMI_log.txt")

    if (verbose) {
        .logMessages(Sys.time(),
            "... Start UMI counting",
            logfile = logfile,
            append = FALSE)
        .logMessages(Sys.time(),
            alignmentFilePaths,
            logfile = logfile,
            append = TRUE)
        message("... Input alignment files:")
        print(alignmentFilePaths)
    } else {
        .logMessages(Sys.time(),
            "... Start UMI counting",
            logfile = NULL,
            append = FALSE)
    }

    message(Sys.time(),
        " ... Creating output directory",
        outDir)
    dir.create(file.path(outDir),
        showWarnings = FALSE,
        recursive = TRUE)

    message(Sys.time(),
        " ... Loading TxDb file")
    features <- suppressPackageStartupMessages(.gtfReadDb(reference))

    tryCatch(
        GenomeInfoDb::seqlevelsStyle(features) <- "NCBI",
        error = function(e) {
            warning("found no sequence renaming map compatible with seqname",
                " style 'NCBI' for the input reference ", basename(reference))
        }
    )

    if (format == "SAM") {
        if (!isTRUE(smc)) {
            cl <- parallelly::makeClusterPSOCK(cores)
        } else {
            cl <- parallel::makeForkCluster(cores)
        }
        alignmentFilePaths <- parallel::parLapply(cl = cl,
            X = alignmentFilePaths,
            fun = .toBam,
            logfile, overwrite = FALSE, index = FALSE)
        parallel::stopCluster(cl)
        alignmentFilePaths <- unlist(alignmentFilePaths)
    }


    if (!isTRUE(smc)) {
        cl <- parallelly::makeClusterPSOCK(cores)
    } else {
        cl <- parallel::makeForkCluster(cores)
    }

    exprL <- suppressPackageStartupMessages(
        parallel::parLapply(cl = cl,
            X = alignmentFilePaths,
            fun = .countUmiUnit,
            features,
            umiEdit,
            logfile,
            verbose))

    parallel::stopCluster(cl)

    expr <- do.call(cbind, exprL)
    expr <- data.table::as.data.table(expr, keep.rownames = TRUE)

    colnames(expr)[1] <- "geneid"

    message(Sys.time(),
        " ... Write UMI filtered count matrix to ",
        file.path(outDir, paste0(
            format(Sys.time(),
                "%Y%m%d_%H%M%S"), "_",
            outputPrefix, ".tsv"
        ))
    )

    data.table::fwrite(expr, file.path(outDir, paste0(
        format(Sys.time(), "%Y%m%d_%H%M%S"), "_",
        outputPrefix, ".tsv"
    )), sep = "\t")

    message(Sys.time(),
        " ... Add count matrix and QC metrics to SCE object.")

    scruffsce <- SingleCellExperiment::SingleCellExperiment(
        assays = list(counts = as.matrix(
            expr[!geneid %in% c("reads_mapped_to_genome",
                "reads_mapped_to_genes",
                "median_reads_per_umi",
                "avg_reads_per_umi",
                "median_reads_per_corrected_umi",
                "avg_reads_per_corrected_umi"), -"geneid"],
            rownames = expr[!geneid %in% c("reads_mapped_to_genome",
                "reads_mapped_to_genes",
                "median_reads_per_umi",
                "avg_reads_per_umi",
                "median_reads_per_corrected_umi",
                "avg_reads_per_corrected_umi"),
                geneid])))

    readmapping <- t(expr[geneid %in% c("reads_mapped_to_genome",
        "reads_mapped_to_genes",
        "median_reads_per_umi",
        "avg_reads_per_umi",
        "median_reads_per_corrected_umi",
        "avg_reads_per_corrected_umi"), -"geneid"])
    colnames(readmapping) <- c("reads_mapped_to_genome",
        "reads_mapped_to_genes",
        "median_reads_per_umi",
        "avg_reads_per_umi",
        "median_reads_per_corrected_umi",
        "avg_reads_per_corrected_umi")

    SummarizedExperiment::rowData(scruffsce) <-
        S4Vectors::DataFrame(geneAnnotation[order(gene_id), ],
            row.names = geneAnnotation[order(gene_id), gene_id])

    #  SingleCellExperiment::isSpike(scruffsce,
    #      "ERCC") <- grepl("ERCC-",
    #          rownames(scruffsce))

    # SingleCellExperiment::isSpike(scruffsce,
    #     "ERCC") <- which(SummarizedExperiment::rowData(scruffsce)[,
    #         "source"] == "ERCC")

    message(Sys.time(),
        " ... Save gene annotation data to ",
        file.path(outDir, paste0(
            format(Sys.time(),
                "%Y%m%d_%H%M%S"), "_",
            outputPrefix, "_gene_annot.tsv"
        ))
    )

    data.table::fwrite(geneAnnotation,
        sep = "\t",
        file = file.path(outDir, paste0(
            format(Sys.time(), "%Y%m%d_%H%M%S"), "_",
            outputPrefix, "_gene_annot.tsv"
        )))

    # UMI filtered transcripts QC metrics
    # total counts exclude ERCC
    totalCounts <- base::colSums(as.data.frame(
        SummarizedExperiment::assay(scruffsce)
        [which(SummarizedExperiment::rowData(scruffsce)[,
            "source"] != "ERCC"), ]))

    # MT counts
    mtCounts <- base::colSums(as.data.frame(
        SummarizedExperiment::assay(scruffsce)
        [grep("MT", SummarizedExperiment::rowData(scruffsce)
            [, seqid], ignore.case = TRUE), ]))

    # gene number exclude ERCC
    cm <- SummarizedExperiment::assay(scruffsce)[
        which(SummarizedExperiment::rowData(scruffsce)[,
            "source"] != "ERCC"), , drop = FALSE]

    geneNumber <- vapply(colnames(cm), function(cells) {
        sum(cm[, cells] != 0)
    }, integer(1))

    # protein coding genes and counts
    if ("gene_biotype" %in% colnames(geneAnnotation)) {
        proteinCodingGene <- geneAnnotation[gene_biotype == "protein_coding",
            gene_id]
        proGene <- vapply(colnames(cm), function(cells) {
            sum(cm[proteinCodingGene, cells] != 0)
        }, integer(1))
        proCounts <- base::colSums(as.data.frame(cm[proteinCodingGene, ]))
    }

    qcdf <- cbind(SummarizedExperiment::colData(sce),
        readmapping,
        list(total_counts = totalCounts,
            mt_counts = mtCounts,
            genes = geneNumber,
            protein_coding_genes = proGene,
            protein_coding_counts = proCounts,
            number_of_cells = cellPerWell))

    message(Sys.time(),
        " ... Save cell-specific quality metrics to ",
        file.path(outDir, paste0(
            format(Sys.time(),
                "%Y%m%d_%H%M%S"), "_",
            outputPrefix, "_QC.tsv"
        ))
    )

    data.table::fwrite(as.data.frame(qcdf),
        sep = "\t",
        file = file.path(outDir, paste0(
            format(Sys.time(), "%Y%m%d_%H%M%S"), "_",
            outputPrefix, "_QC.tsv"
        )))

    SummarizedExperiment::colData(scruffsce) <- qcdf

    message(Sys.time(),
        " ... Save SingleCellExperiment object to ",
        file.path(outDir, paste0(
            format(Sys.time(),
                "%Y%m%d_%H%M%S"), "_",
            outputPrefix, "_sce.rda"
        ))
    )

    save(scruffsce, file = file.path(outDir, paste0(
        format(Sys.time(), "%Y%m%d_%H%M%S"), "_",
        outputPrefix, "_sce.rda"
    )))

    if (verbose) {
        .logMessages(Sys.time(),
                    "... UMI counting done!",
                    logfile = logfile,
                    append = FALSE)
        message(" ... UMI counting done!")
    } else {
        .logMessages(Sys.time(),
                    " ... UMI counting done!",
                    logfile = NULL,
                    append = FALSE)
    }

    return(scruffsce)
}


.countUmiUnit <- function(i, features, umiEdit, logfile, verbose) {
    if (verbose) {
        .logMessages(Sys.time(),
            "... UMI counting sample",
            i,
            logfile = logfile,
            append = TRUE)
    }

    # if sequence alignment file is empty
    if (file.size(i) == 0) {
        countUmiDt <- data.table::data.table(
            gene_id = c(sort(names(features)),
                "reads_mapped_to_genome",
                "reads_mapped_to_genes",
                "median_reads_per_umi",
                "avg_reads_per_umi",
                "median_reads_per_corrected_umi",
                "avg_reads_per_corrected_umi"))
        cell <- .removeLastExtension(i)
        countUmiDt[[cell]] <- 0

        return(data.frame(countUmiDt,
            row.names = 1,
            check.names = FALSE,
            fix.empty.names = FALSE))
    }

    bfl <- Rsamtools::BamFile(i)
    bamGA <- GenomicAlignments::readGAlignments(bfl, use.names = TRUE)

    tryCatch(
        GenomeInfoDb::seqlevelsStyle(bamGA) <- "NCBI",
        error = function(e) {
            warning("found no sequence renaming map compatible with the seqid.",
                " style 'NCBI' for the BAM file ", basename(i))
        }
    )

    genomeReads <- data.table::data.table(
        name = names(bamGA),
        seqnames = as.vector(GenomicAlignments::seqnames(bamGA)))

    if (length(unique(genomeReads[, name])) != nrow(genomeReads)) {
        stop("Corrupt BAM file ",
            i,
            ". Duplicate read names detected.",
            " Try rerunning demultiplexing and alignment functions",
            " with appropriate number of cores and set nBestLocations = 1.")
    }

    # reads mapped to genome (exclude ERCC spike-in)
    readsMappedToGenome <- nrow(
        genomeReads[!grepl("ERCC", genomeReads[, seqnames]), .(name)])

    # UMI filtering
    ol <- GenomicAlignments::findOverlaps(features, bamGA)

    oldt <- data.table::data.table(
        gene_id = base::names(features)[S4Vectors::queryHits(ol)],
        name = base::names(bamGA)[S4Vectors::subjectHits(ol)],
        pos = BiocGenerics::start(bamGA)[S4Vectors::subjectHits(ol)]
    )

    # remove ambiguous gene alignments (union mode filtering)
    if (nrow(oldt) != 0) {
        oldt <- oldt[!(
            base::duplicated(oldt, by = "name") |
                base::duplicated(oldt, by = "name", fromLast = TRUE)), ]
    }

    # if 0 count in the cell
    if (nrow(oldt) == 0) {
        # clean up
        countUmiDt <- data.table::data.table(
            gene_id = c(names(features),
                "reads_mapped_to_genome",
                "reads_mapped_to_genes",
                "median_reads_per_umi",
                "avg_reads_per_umi",
                "median_reads_per_corrected_umi",
                "avg_reads_per_corrected_umi"))
        cell <- .removeLastExtension(i)
        countUmiDt[[cell]] <- 0
        countUmiDt[gene_id == "reads_mapped_to_genome",
            cell] <- readsMappedToGenome

        # coerce to data frame to keep rownames for cbind combination
        countUmiDt <- data.frame(countUmiDt,
            row.names = 1,
            check.names = FALSE,
            fix.empty.names = FALSE)

    } else {
        # move UMI to a separate column
        oldt[, umi := data.table::last(data.table::tstrsplit(name, ":"))]
        oldt[, inferred_umi := umi]

        # reads mapped to genes
        readsMappedToGenes <- nrow(oldt[!grepl("ERCC", oldt[, gene_id]), ])

        # median number of reads per umi
        rpu <- table(oldt[, umi])
        medReadsUmi <- median(rpu)
        avgReadsUmi <- mean(rpu)

        medReadsUmic <- 0
        avgReadsUmic <- 0

        # UMI filtering and correction

        # strict way of doing UMI filtering:
        # reads with different pos are considered unique trancsript molecules
        # countUmi <- base::table(
        #     unique(oldt[, .(gene_id, umi, pos)])
        #     [, gene_id])

        # The way CEL-Seq pipeline does UMI filtering:
        # Reads with different UMI tags are
        # considered unique trancsript molecules
        # Read positions do not matter

        # UMI correction

        if (umiEdit != 0) {
            for (g in unique(oldt[, gene_id])) {

                umis <- sort(table(oldt[gene_id == g, inferred_umi]),
                    decreasing = TRUE)
                j <- 1

                while (j < length(umis)) {
                    u <- names(umis)[j]
                    sdm <- stringdist::stringdistmatrix(u, names(umis),
                        method = "hamming", nthread = 1)
                    sdm[which(sdm == 0)] <- NA
                    mindist <- min(sdm, na.rm = TRUE)

                    if (mindist <= umiEdit & mindist != 0) {
                        inds <- which(sdm == mindist)
                        oldt[umi %in% names(umis)[inds],
                            inferred_umi := names(umis)[j]]
                    }

                    umis <- sort(table(oldt[gene_id == g, inferred_umi]),
                        decreasing = TRUE)
                    j <- j + 1
                }
            }
            # after correction median numbre of reads per umi
            rpuc <- table(oldt[, inferred_umi])
            medReadsUmic <- median(rpuc)
            avgReadsUmic <- mean(rpuc)
        }

        countUmi <- base::table(unique(oldt[,
            .(gene_id, inferred_umi)])[, gene_id])

        # clean up
        countUmiDt <- data.table::data.table(
            gene_id = c(names(features),
                "reads_mapped_to_genome",
                "reads_mapped_to_genes",
                "median_reads_per_umi",
                "avg_reads_per_umi",
                "median_reads_per_corrected_umi",
                "avg_reads_per_corrected_umi"))
        cell <- .removeLastExtension(i)
        countUmiDt[[cell]] <- 0
        countUmiDt[gene_id == "reads_mapped_to_genome",
            cell] <- readsMappedToGenome
        countUmiDt[gene_id == "reads_mapped_to_genes",
            cell] <- readsMappedToGenes
        countUmiDt[gene_id == "median_reads_per_umi",
            cell] <- medReadsUmi
        countUmiDt[gene_id == "avg_reads_per_umi",
            cell] <- avgReadsUmi
        countUmiDt[gene_id == "median_reads_per_corrected_umi",
            cell] <- medReadsUmic
        countUmiDt[gene_id == "avg_reads_per_corrected_umi",
            cell] <- avgReadsUmic
        countUmiDt[gene_id %in% names(countUmi),
            eval(cell) := as.numeric(countUmi[gene_id])]

        # coerce to data frame to keep rownames for cbind combination
        countUmiDt <- data.frame(countUmiDt,
            row.names = 1,
            check.names = FALSE,
            fix.empty.names = FALSE)
    }
    return(countUmiDt)
}