bcbioSinglecell: Single-Cell RNA-Seq Utilities

Documented in bcbioSingleCell

setOldClass(Classes = c("grouped_df", "tbl_df", "tibble"))



#' @rdname bcbioSingleCell
#' @aliases NULL
#' @exportClass bcbioSingleCell
#' @usage NULL
bcbioSingleCell <- setClass(
    Class = "bcbioSingleCell",
    contains = "SingleCellExperiment"
)



# Constructors =================================================================
#' `bcbioSingleCell` Object and Constructor
#'
#' `bcbioSingleCell` is an S4 class that extends `SingleCellExperiment`, and is
#' designed to store a bcbio single-cell RNA-seq analysis. This class contains
#' read counts saved as a sparse matrix (`dgCMatrix`), sample metadata, and cell
#' quality control metrics.
#'
#' @section Remote Data:
#'
#' When working in RStudio, we recommend connecting to the bcbio-nextgen
#'   run directory as a remote connection over
#'   [sshfs](https://github.com/osxfuse/osxfuse/wiki/SSHFS).
#'
#' @note `bcbioSingleCell` extended `SummarizedExperiment` prior to v0.1.0,
#'   where we migrated to `SingleCellExperiment`.
#'
#' @rdname bcbioSingleCell
#' @aliases bcbioSingleCell-class
#' @family S4 Object
#' @docType class
#' @author Michael Steinbaugh, Rory Kirchner
#'
#' @inheritParams bcbioBase::prepareSummarizedExperiment
#' @inheritParams general
#' @param uploadDir Path to final upload directory. This path is set when
#'   running "`bcbio_nextgen -w template`".
#' @param organism Organism name. Use the full latin name (e.g.
#'   "Homo sapiens"), since this will be input downstream to
#'   AnnotationHub and ensembldb, unless `gffFile` is set.
#' @param sampleMetadataFile Sample barcode metadata file. Optional for runs
#'   with demultiplixed index barcodes (e.g. SureCell), but otherwise required
#'   for runs with multipliexed FASTQs containing multiple index barcodes (e.g.
#'   inDrop).
#' @param ensemblRelease *Optional.* Ensembl release version. If `NULL`,
#'   defaults to current release, and does not typically need to be
#'   user-defined. Passed to AnnotationHub for `EnsDb` annotation matching,
#'   unless `gffFile` is set.
#' @param genomeBuild *Optional.* Ensembl genome build name (e.g. "GRCh38").
#'   This will be passed to AnnotationHub for `EnsDb` annotation matching,
#'   unless `gffFile` is set.
#' @param gffFile *Optional, not recommended.* By default, we recommend leaving
#'   this `NULL` for genomes that are supported on Ensembl. In this case, the
#'   row annotations ([rowRanges()]) will be obtained automatically from Ensembl
#'   by passing the `organism`, `genomeBuild`, and `ensemblRelease` arguments to
#'   AnnotationHub and ensembldb. For a genome that is not supported on Ensembl
#'   and/or AnnotationHub, a GFF/GTF (General Feature Format) file is required.
#'   Generally, we recommend using a GTF (GFFv2) file here over a GFF3 file if
#'   possible, although all GFF formats are supported. The function will
#'   internally generate a `TxDb` containing transcript-to-gene mappings and
#'   construct a `GRanges` object containing the genomic ranges ([rowRanges()]).
#' @param ... Additional arguments, to be stashed in the [metadata()] slot.
#'
#' @return `bcbioSingleCell`.
#' @export
#'
#' @seealso
#' - [SingleCellExperiment::SingleCellExperiment()].
#' - `.S4methods(class = "bcbioSingleCell")`.
#'
#' @examples
#' uploadDir <- system.file("extdata/indrops", package = "bcbioSingleCell")
#' x <- bcbioSingleCell(
#'     uploadDir = uploadDir,
#'     organism = "Homo sapiens",
#'     sampleMetadataFile = file.path(uploadDir, "metadata.csv"),
#'     ensemblRelease = 87L
#' )
#' show(x)
#' validObject(x)
bcbioSingleCell <- function(
    uploadDir,
    organism = NULL,
    sampleMetadataFile = NULL,
    interestingGroups = "sampleName",
    ensemblRelease = NULL,
    genomeBuild = NULL,
    transgeneNames = NULL,
    spikeNames = NULL,
    gffFile = NULL,
    ...
) {
    dots <- list(...)
    pipeline <- "bcbio"

    # Legacy arguments =========================================================
    call <- match.call(expand.dots = TRUE)
    # annotable
    if ("annotable" %in% names(call)) {
        stop("Use `gffFile` instead of `annotable`")
    }
    # ensemblVersion
    if ("ensemblVersion" %in% names(call)) {
        warning("Use `ensemblRelease` instead of `ensemblVersion`")
        ensemblRelease <- call[["ensemblVersion"]]
        dots[["ensemblVersion"]] <- NULL
    }
    # gtfFile
    if ("gtfFile" %in% names(call)) {
        warning("Use `gffFile` instead of `gtfFile`")
        gffFile <- call[["gtfFile"]]
        dots[["gtfFile"]] <- NULL
    }
    # organism
    if (!"organism" %in% names(call)) {
        warning("`organism` is now recommended, to acquire gene annotations")
    }
    dots <- Filter(Negate(is.null), dots)

    # Assert checks ============================================================
    assert_is_a_string(uploadDir)
    assert_all_are_dirs(uploadDir)
    assertIsAStringOrNULL(sampleMetadataFile)
    assert_is_character(interestingGroups)
    assertIsAStringOrNULL(organism)
    assertIsAnImplicitIntegerOrNULL(ensemblRelease)
    assertIsAStringOrNULL(genomeBuild)
    assert_is_any_of(transgeneNames, c("character", "NULL"))
    assert_is_any_of(spikeNames, c("character", "NULL"))
    assertIsAStringOrNULL(gffFile)
    if (is_a_string(gffFile)) {
        assert_all_are_existing_files(gffFile)
    }

    # Directory paths ==========================================================
    uploadDir <- normalizePath(uploadDir, winslash = "/", mustWork = TRUE)
    projectDir <- dir(
        uploadDir,
        pattern = bcbioBase::projectDirPattern,
        full.names = FALSE,
        recursive = FALSE
    )
    assert_is_a_string(projectDir)
    message(projectDir)
    match <- str_match(projectDir, bcbioBase::projectDirPattern)
    runDate <- as.Date(match[[2L]])
    template <- match[[3L]]
    projectDir <- file.path(uploadDir, projectDir)
    sampleDirs <- sampleDirs(uploadDir)

    # Sequencing lanes =========================================================
    if (any(grepl(bcbioBase::lanePattern, sampleDirs))) {
        lanes <- str_match(names(sampleDirs), bcbioBase::lanePattern) %>%
            .[, 2L] %>%
            unique() %>%
            length()
        message(paste(
            lanes, "sequencing lane detected", "(technical replicates)"
        ))
    } else {
        lanes <- 1L
    }

    # Project summary YAML =====================================================
    yamlFile <- file.path(projectDir, "project-summary.yaml")
    yaml <- readYAML(yamlFile)

    # bcbio run information ====================================================
    dataVersions <- readDataVersions(
        file = file.path(projectDir, "data_versions.csv")
    )
    assert_is_tbl_df(dataVersions)

    programVersions <- readProgramVersions(
        file = file.path(projectDir, "programs.txt")
    )
    assert_is_tbl_df(programVersions)

    bcbioLog <- readLog(
        file = file.path(projectDir, "bcbio-nextgen.log")
    )
    assert_is_character(bcbioLog)

    bcbioCommandsLog <- readLog(
        file = file.path(projectDir, "bcbio-nextgen-commands.log")
    )
    assert_is_character(bcbioCommandsLog)

    # Cellular barcode cutoff ==================================================
    cellularBarcodeCutoffPattern <- "--cb_cutoff (\\d+)"
    assert_any_are_matching_regex(
        x = bcbioCommandsLog,
        pattern = cellularBarcodeCutoffPattern
    )
    match <- str_match(
        string = bcbioCommandsLog,
        pattern = cellularBarcodeCutoffPattern
    )
    cellularBarcodeCutoff <- match %>%
        .[, 2L] %>%
        na.omit() %>%
        unique() %>%
        as.integer()
    assert_is_an_integer(cellularBarcodeCutoff)

    message(paste(
        cellularBarcodeCutoff,
        "reads per cellular barcode cutoff detected"
    ))

    # Detect gene or transcript-level output ===================================
    genemapPattern <- "--genemap (.+)-tx2gene.tsv"
    if (any(grepl(genemapPattern, bcbioCommandsLog))) {
        level <- "genes"
    } else {
        level <- "transcripts"
    }

    # Molecular barcode (UMI) type =============================================
    umiPattern <- "/umis/([a-z0-9\\-]+)\\.json"
    assert_any_are_matching_regex(bcbioCommandsLog, umiPattern)
    umiType <- str_match(bcbioCommandsLog, umiPattern) %>%
        .[, 2L] %>%
        na.omit() %>%
        unique() %>%
        gsub(
            pattern = "-transform",
            replacement = "",
            x = .
        )
    assert_is_a_string(umiType)
    message(paste("UMI type:", umiType))

    # Sample metadata ==========================================================
    # External file required for inDrop
    if (grepl("indrop", umiType) && is.null(sampleMetadataFile)) {
        stop(paste(
            "inDrop samples require `sampleMetadataFile`",
            "containing the index barcode sequences"
        ))
    }

    if (is_a_string(sampleMetadataFile)) {
        sampleData <- readSampleData(sampleMetadataFile)
    } else {
        sampleData <- readYAMLSampleData(yamlFile)
    }

    # Check for incorrect reverse complement input
    if ("sequence" %in% colnames(sampleData)) {
        sampleDirSequence <- str_extract(names(sampleDirs), "[ACGT]+$")
        if (identical(
            sort(sampleDirSequence),
            sort(as.character(sampleData[["sequence"]]))
        )) {
            stop(paste(
                "It appears that the reverse complement sequence of the",
                "i5 index barcodes were input into the sample metadata",
                "`sequence` column. bcbio outputs the revcomp into the",
                "sample directories, but the forward sequence should be",
                "used in the R package."
            ))
        }
    }

    assert_is_subset(rownames(sampleData), names(sampleDirs))
    sampleData <- sanitizeSampleData(sampleData)

    # Interesting groups =======================================================
    # Ensure internal formatting in camelCase
    interestingGroups <- camel(interestingGroups, strict = FALSE)
    assertFormalInterestingGroups(sampleData, interestingGroups)

    # Subset sample directories by metadata ====================================
    # Check to see if a subset of samples is requested via the metadata file.
    # This matches by the reverse complement sequence of the index barcode.
    if (nrow(sampleData) < length(sampleDirs)) {
        message("Loading a subset of samples, defined by the metadata file")
        allSamples <- FALSE
        sampleDirs <- sampleDirs[rownames(sampleData)]
        message(paste(length(sampleDirs), "samples matched by metadata"))
    } else {
        allSamples <- TRUE
    }

    # Assays ===================================================================
    message(paste("Reading counts as", level))
    countsList <- .sparseCountsList(
        sampleDirs = sampleDirs,
        pipeline = pipeline,
        umiType = umiType
    )
    # Ensure samples with empty matrices (`NULL`) are filtered
    countsList <- Filter(Negate(is.null), countsList)
    counts <- do.call(cbind, countsList)

    # Require transcript to gene conversion (legacy) ===========================
    if (level == "transcripts") {
        message("Converting transcripts to genes")

        # GFF file is required
        if (!is_a_string(gffFile)) {
            stop("GFF is required to convert transcripts to genes")
        }

        # Note that this data.frame won't contain transcript versions
        tx2gene <- makeTx2geneFromGFF(gffFile)

        # Add spike-ins to tx2gene, if necessary
        if (is.character(isSpike)) {
            assert_are_disjoint_sets(rownames(tx2gene), isSpike)
            spike <- data.frame(
                "txID" = isSpike,
                "geneID" = isSpike,
                row.names = isSpike,
                stringsAsFactors = FALSE
            )
            tx2gene <- rbind(spike, tx2gene)
        }

        # Ensure Ensembl transcript versions are removed
        counts <- stripTranscriptVersions(counts)

        # Resize the tx2gene to match the matrix rownames
        assert_is_subset(rownames(counts), rownames(tx2gene))
        tx2gene <- tx2gene[rownames(counts), , drop = FALSE]

        # Now we're ready to assign `geneID` and aggregate
        rownames(counts) <- tx2gene[["geneID"]]
        counts <- aggregate.Matrix(
            x = counts,
            groupings = rownames(counts),
            fun = "sum"
        )
        level <- "genes"
    } else {
        tx2gene <- NULL
    }

    # Unfiltered cellular barcode distributions ================================
    cbList <- .cellularBarcodesList(sampleDirs)
    cbData <- .bindCellularBarcodes(cbList)

    # Row data =================================================================
    rowRangesMetadata <- NULL
    if (is_a_string(gffFile)) {
        rowRanges <- makeGRangesFromGFF(gffFile, format = "genes")
    } else if (is_a_string(organism)) {
        # ah: AnnotationHub
        ah <- makeGRangesFromEnsembl(
            organism = organism,
            format = "genes",
            genomeBuild = genomeBuild,
            release = ensemblRelease,
            metadata = TRUE
        )
        assert_is_list(ah)
        assert_are_identical(names(ah), c("data", "metadata"))
        rowRanges <- ah[["data"]]
        assert_is_all_of(rowRanges, "GRanges")
        rowRangesMetadata <- ah[["metadata"]]
        assert_is_data.frame(rowRangesMetadata)
    } else {
        rowRanges <- emptyRanges(rownames(counts))
    }
    assert_is_subset(rownames(counts), names(rowRanges))
    rowData <- as.data.frame(rowRanges)
    rownames(rowData) <- names(rowRanges)

    # Column data ==============================================================
    # Always prefilter, removing very low quality cells with no UMIs or genes
    metrics <- metrics(
        object = counts,
        rowData = rowData,
        prefilter = TRUE
    )

    # Subset the counts to match the prefiltered metrics
    counts <- counts[, rownames(metrics), drop = FALSE]

    colData <- as(metrics, "DataFrame")
    colData[["cellID"]] <- rownames(colData)
    cell2sample <- mapCellsToSamples(
        cells = rownames(colData),
        samples = rownames(sampleData)
    )
    colData[["sampleID"]] <- cell2sample
    sampleData[["sampleID"]] <- rownames(sampleData)
    colData <- merge(
        x = colData,
        y = sampleData,
        by = "sampleID",
        all.x = TRUE
    )
    rownames(colData) <- colData[["cellID"]]
    colData[["cellID"]] <- NULL
    sampleData[["sampleID"]] <- NULL

    # Bind the `nCount` column into the colData
    nCount <- cbData[rownames(colData), "nCount", drop = FALSE]
    colData <- cbind(nCount, colData)

    # Metadata =================================================================
    metadata <- list(
        "version" = packageVersion,
        "pipeline" = pipeline,
        "level" = level,
        "uploadDir" = uploadDir,
        "sampleDirs" = sampleDirs,
        "sampleMetadataFile" = as.character(sampleMetadataFile),
        "interestingGroups" = interestingGroups,
        "organism" = as.character(organism),
        "genomeBuild" = as.character(genomeBuild),
        "ensemblRelease" = as.integer(ensemblRelease),
        "rowRangesMetadata" = rowRangesMetadata,
        "sampleData" = sampleData,
        "cell2sample" = as.factor(cell2sample),
        "umiType" = umiType,
        "allSamples" = allSamples,
        # bcbio pipeline-specific ----------------------------------------------
        "projectDir" = projectDir,
        "template" = template,
        "runDate" = runDate,
        "yaml" = yaml,
        "gffFile" = as.character(gffFile),
        "tx2gene" = tx2gene,
        "dataVersions" = dataVersions,
        "programVersions" = programVersions,
        "bcbioLog" = bcbioLog,
        "bcbioCommandsLog" = bcbioCommandsLog,
        "cellularBarcodes" = cbList,
        "cellularBarcodeCutoff" = cellularBarcodeCutoff,
        "call" = match.call()
    )
    # Add user-defined custom metadata, if specified
    if (length(dots)) {
        assert_are_disjoint_sets(names(metadata), names(dots))
        metadata <- c(metadata, dots)
    }

    # Return ===================================================================
    .new.bcbioSingleCell(
        assays = list("counts" = counts),
        rowRanges = rowRanges,
        colData = colData,
        metadata = metadata,
        transgeneNames = transgeneNames,
        spikeNames = spikeNames
    )
}



# Validity Checks ==============================================================
setValidity(
    "bcbioSingleCell",
    function(object) {
        stopifnot(metadata(object)[["version"]] >= 0.1)
        stopifnot(!.hasSlot(object, "bcbio"))

        # Assays ===============================================================
        assert_are_identical("counts", names(assays(object)))

        # Row data =============================================================
        assert_is_all_of(rowRanges(object), "GRanges")
        assert_is_all_of(rowData(object), "DataFrame")

        # Metadata =============================================================
        metadata <- metadata(object)

        # Optional metadata:
        # - filterCells
        # - filterGenes
        # - filterParams
        # - filterSummary
        # - lanes: integer
        # - rowRangesMetadata: tbl_df
        # - tx2gene: data.frame
        #
        # bcbio-specific:
        # - bcbioCommandsLog: character
        # - bcbioLog: character
        # - dataVersions: tbl_df
        # - gffFile: character
        # - programVersions: tbl_df
        # - projectDir: character
        # - runDate: Date
        # - template: character
        # - yaml: list

        # Class checks
        requiredMetadata <- list(
            "allSamples" = "logical",
            "cell2sample" = "factor",
            "date" = "Date",
            "devtoolsSessionInfo" = "session_info",
            "ensemblRelease" = "integer",
            "genomeBuild" = "character",
            "interestingGroups" = "character",
            "level" = "character",
            "organism" = "character",
            "pipeline" = "character",
            "sampleData" = "data.frame",
            "sampleDirs" = "character",
            "sampleMetadataFile" = "character",
            "umiType" = "character",
            "uploadDir" = "character",
            "utilsSessionInfo" = "sessionInfo",
            "version" = "package_version",
            "wd" = "character"
        )
        classChecks <- invisible(mapply(
            name <- names(requiredMetadata),
            expected <- requiredMetadata,
            MoreArgs = list(metadata = metadata),
            FUN = function(name, expected, metadata) {
                actual <- class(metadata[[name]])
                if (!length(intersect(expected, actual))) {
                    FALSE
                } else {
                    TRUE
                }
            },
            SIMPLIFY = TRUE,
            USE.NAMES = TRUE
        ))
        if (!all(classChecks)) {
            print(classChecks)
            stop(paste(
                "Metadata class checks failed.",
                bcbioBase::updateMessage,
                sep = "\n"
            ))
        }

        # level
        assert_is_subset(
            x = metadata[["level"]],
            y = c("genes", "transcripts")
        )

        TRUE
    }
)