bcbioRnaseq: RNA-Seq Utilities

setClassUnion("missingOrNULL", c("missing", "NULL"))



#' @rdname bcbioRNASeq-class
#' @aliases NULL
#' @exportClass bcbioRNASeq
#' @usage NULL
bcbioRNASeq <- setClass(
    Class = "bcbioRNASeq",
    contains = "RangedSummarizedExperiment"
)



# Constructors =================================================================
#' `bcbioRNASeq` Object and Constructor
#'
#' `bcbioRNASeq` is an S4 class that extends `RangedSummarizedExperiment`, and
#' is designed to store a [bcbio](https://bcbio-nextgen.readthedocs.org) RNA-seq
#' analysis.
#'
#' Simply point to the final upload directory generated by
#' [bcbio](https://bcbio-nextgen.readthedocs.io/), and this constructor function
#' will take care of the rest. It automatically imports RNA-seq counts,
#' metadata, and the program versions used.
#'
#' This class contains raw read counts and length-scaled
#' transcripts per million (TPM) generated by [tximport::tximport()]. Counts can
#' be loaded at gene or transcript level.
#'
#' @section Metadata:
#'
#' The [metadata()] accessor contains:
#'
#' - Sample quality control metrics.
#' - Ensembl annotations.
#' - Server run paths.
#' - R session information (e.g. [utils::sessionInfo()]).
#'
#' @section DESeq2:
#'
#' DESeq2 is run automatically when [bcbioRNASeq()] is called, and variance
#' stabilized counts are slotted into [assays()]. If the number of samples is
#' bigger than the `transformationLimit` argument, `rlog` and `vst` counts will
#' not be slotted into `assays()`. In this case, we recommend visualization
#' using [tmm()] counts, which are automatically calculated using edgeR.
#'
#' @section Remote Data:
#'
#' When working in RStudio, we recommend connecting to the bcbio run directory
#' as a remote connection over
#' [sshfs](https://github.com/osxfuse/osxfuse/wiki/SSHFS).
#'
#' @note `bcbioRNASeq` extended `SummarizedExperiment` prior to v0.2.0, where we
#'   migrated to `RangedSummarizedExperiment`.
#'
#' @rdname bcbioRNASeq-class
#' @aliases bcbioRNASeq-class
#' @docType class
#' @family S4 Class Definition
#' @author Michael Steinbaugh, Lorena Pantano, Rory Kirchner, Victor Barrera
#'
#' @inheritParams bcbioBase::prepareSummarizedExperiment
#' @inheritParams general
#' @param uploadDir Path to final upload directory. This path is set when
#'   running `bcbio_nextgen -w template`.
#' @param level Import counts as "`genes`" (default) or "`transcripts`".
#' @param caller Expression caller. Supports "`salmon`" (default), "`kallisto`",
#'   or "`sailfish`".
#' @param organism Organism name. Use the full latin name (e.g.
#'   "Homo sapiens"), since this will be input downstream to
#'   AnnotationHub and ensembldb, unless `gffFile` is set. If set `NULL`
#'   (*advanced use; not recommended*), the function call will skip loading
#'   gene/transcript-level annotations into `rowRanges()`. This can be useful
#'   for poorly annotation genomes or experiments involving multiple genomes.
#' @param samples *Optional.* Specify a subset of samples to load. The names
#'   must match the `description` specified in the bcbio YAML metadata. If a
#'   `sampleMetadataFile` is provided, that will take priority for sample
#'   selection. Typically this can be left unset.
#' @param sampleMetadataFile *Optional.* Custom metadata file containing
#'   sample information. Otherwise defaults to sample metadata saved in the YAML
#'   file. Remote URLs are supported. Typically this can be left unset.
#' @param genomeBuild *Optional.* Ensembl genome build name (e.g. "GRCh38").
#'   This will be passed to AnnotationHub for `EnsDb` annotation matching,
#'   unless `gffFile` is set.
#' @param ensemblRelease *Optional.* Ensembl release version. If unset,
#'   defaults to current release, and does not typically need to be
#'   user-defined. Passed to AnnotationHub for `EnsDb` annotation matching,
#'   unless `gffFile` is set.
#' @param gffFile *Advanced use; not recommended.* By default, we recommend
#'   leaving this `NULL` for genomes that are supported on Ensembl. In this
#'   case, the row annotations ([rowRanges()]) will be obtained automatically
#'   from Ensembl by passing the `organism`, `genomeBuild`, and `ensemblRelease`
#'   arguments to AnnotationHub and ensembldb. For a genome that is not
#'   supported on Ensembl and/or AnnotationHub, a GFF/GTF (General Feature
#'   Format) file is required. Generally, we recommend using a GTF (GFFv2) file
#'   here over a GFF3 file if possible, although all GFF formats are supported.
#'   The function will internally generate a `TxDb` containing
#'   transcript-to-gene mappings and construct a `GRanges` object containing the
#'   genomic ranges ([rowRanges()]).
#' @param transformationLimit Maximum number of samples to calculate
#'   [DESeq2::rlog()] and [DESeq2::varianceStabilizingTransformation()] matrix.
#'   For large datasets, DESeq2 is slow to apply variance stabilization. In this
#'   case, we recommend loading up the dataset in a high-performance computing
#'   environment. Use `Inf` to always apply and `-Inf` to always skip.
#' @param ... Additional arguments, slotted into the [metadata()] accessor.
#'
#' @return `bcbioRNASeq`.
#' @export
#'
#' @seealso
#' - [SummarizedExperiment::SummarizedExperiment()].
#' - [methods::initialize()].
#' - [methods::validObject()].
#' - [BiocGenerics::updateObject()].
#' - `.S4methods(class = "bcbioRNASeq")`.
#'
#' @examples
#' uploadDir <- system.file("extdata/bcbio", package = "bcbioRNASeq")
#'
#' # Gene level
#' x <- bcbioRNASeq(
#'     uploadDir = uploadDir,
#'     level = "genes",
#'     caller = "salmon",
#'     organism = "Mus musculus",
#'     ensemblRelease = 87L
#' )
#' show(x)
#' is(x, "RangedSummarizedExperiment")
#' validObject(x)
#'
#' # Transcript level
#' x <- bcbioRNASeq(
#'     uploadDir = uploadDir,
#'     level = "transcripts",
#'     caller = "salmon",
#'     organism = "Mus musculus",
#'     ensemblRelease = 87L
#' )
#' show(x)
#' validObject(x)
bcbioRNASeq <- function(
    uploadDir,
    level = c("genes", "transcripts"),
    caller = c("salmon", "kallisto", "sailfish"),
    organism,
    samples = NULL,
    sampleMetadataFile = NULL,
    interestingGroups = "sampleName",
    ensemblRelease = NULL,
    genomeBuild = NULL,
    transgeneNames = NULL,
    spikeNames = NULL,
    gffFile = NULL,
    transformationLimit = 50L,
    ...
) {
    dots <- list(...)

    # Legacy arguments =========================================================
    call <- match.call(expand.dots = TRUE)
    # annotable
    if ("annotable" %in% names(call)) {
        stop("`annotable` is defunct. Consider using `gffFile` instead.")
    }
    # ensemblVersion
    if ("ensemblVersion" %in% names(call)) {
        warning("Use `ensemblRelease` instead of `ensemblVersion`")
        ensemblRelease <- call[["ensemblVersion"]]
        dots[["ensemblVersion"]] <- NULL
    }
    # organism missing
    if (!"organism" %in% names(call)) {
        stop("`organism` is now required")
    }
    dots <- Filter(Negate(is.null), dots)

    # Assert checks ============================================================
    assert_is_a_string(uploadDir)
    assert_all_are_dirs(uploadDir)
    level <- match.arg(level)
    caller <- match.arg(caller)
    assertIsAStringOrNULL(sampleMetadataFile)
    assertIsCharacterOrNULL(samples)
    assert_is_character(interestingGroups)
    assertIsAStringOrNULL(organism)
    assertIsAnImplicitIntegerOrNULL(ensemblRelease)
    assertIsAStringOrNULL(genomeBuild)
    assertIsCharacterOrNULL(transgeneNames)
    assertIsCharacterOrNULL(spikeNames)
    assertIsAStringOrNULL(gffFile)
    if (is_a_string(gffFile)) {
        assert_all_are_existing_files(gffFile)
    }
    assert_is_a_number(transformationLimit)

    # Directory paths ==========================================================
    uploadDir <- normalizePath(uploadDir, winslash = "/", mustWork = TRUE)
    projectDir <- list.files(
        path = uploadDir,
        pattern = bcbioBase::projectDirPattern,
        full.names = FALSE,
        recursive = FALSE
    )
    assert_is_a_string(projectDir)
    message(projectDir)
    match <- str_match(projectDir, bcbioBase::projectDirPattern)
    runDate <- as.Date(match[[2L]])
    template <- match[[3L]]
    projectDir <- file.path(uploadDir, projectDir)
    assert_all_are_dirs(projectDir)
    sampleDirs <- sampleDirs(uploadDir)
    assert_all_are_dirs(sampleDirs)

    # Sequencing lanes =========================================================
    if (any(grepl(x = sampleDirs, pattern = bcbioBase::lanePattern))) {
        lanes <- str_match(names(sampleDirs), bcbioBase::lanePattern) %>%
            .[, 2L] %>%
            unique() %>%
            length()
        message(paste(
            lanes, "sequencing lane detected", "(technical replicates)"
        ))
    } else {
        lanes <- 1L
    }
    assert_is_an_integer(lanes)

    # Project summary YAML =====================================================
    yamlFile <- file.path(projectDir, "project-summary.yaml")
    assert_all_are_existing_files(yamlFile)
    yaml <- readYAML(yamlFile)

    # Column data ==============================================================
    colData <- readYAMLSampleData(yamlFile)

    # Replace columns with external, user-defined metadata, if desired. This is
    # nice for correcting metadata issues that aren't easy to fix by editing the
    # bcbio YAML output.
    if (is_a_string(sampleMetadataFile)) {
        userColData <- readSampleData(sampleMetadataFile, lanes = lanes)
        # Drop columns that are defined from the auto metadata
        setdiff <- setdiff(colnames(colData), colnames(userColData))
        # Note that we're allowing the user to subset the samples by the
        # metadata input here
        colData <- colData[rownames(userColData), setdiff, drop = FALSE]
        colData <- cbind(userColData, colData)
    }

    # Allow easy subsetting by sample (must match description)
    if (is.character(samples)) {
        assert_is_subset(samples, colData[["description"]])
        colData <- colData %>%
            .[which(samples %in% .[["description"]]), , drop = FALSE]
    }

    # Sanitize into factors
    colData <- sanitizeSampleData(colData)

    # Sample metrics ===========================================================
    # Note that sample metrics used for QC plots are not currently generated
    # when using fast RNA-seq workflow. This depends upon MultiQC and aligned
    # counts generated with STAR.
    message("Reading sample metrics")
    metrics <- readYAMLSampleMetrics(yamlFile)
    assert_is_data.frame(metrics)
    assert_are_disjoint_sets(colnames(colData), colnames(metrics))
    # Now safe to add the metrics to colData
    colData <- cbind(colData, metrics)

    # Interesting groups =======================================================
    interestingGroups <- camel(interestingGroups)
    assert_is_subset(interestingGroups, colnames(colData))

    # Subset sample directories by metadata ====================================
    samples <- rownames(colData)
    assert_is_subset(samples, names(sampleDirs))
    if (length(samples) < length(sampleDirs)) {
        message(paste(
            "Loading a subset of samples:",
            str_trunc(toString(samples), width = 80L),
            sep = "\n"
        ))
        allSamples <- FALSE
        sampleDirs <- sampleDirs[samples]
    } else {
        allSamples <- TRUE
    }

    # bcbio run information ====================================================
    dataVersions <- readDataVersions(
        file = file.path(projectDir, "data_versions.csv")
    )
    assert_is_tbl_df(dataVersions)

    programVersions <- readProgramVersions(
        file = file.path(projectDir, "programs.txt")
    )
    assert_is_tbl_df(programVersions)

    bcbioLog <- readLog(
        file = file.path(projectDir, "bcbio-nextgen.log")
    )
    assert_is_character(bcbioLog)

    bcbioCommandsLog <- readLog(
        file = file.path(projectDir, "bcbio-nextgen-commands.log")
    )
    assert_is_character(bcbioCommandsLog)

    # tximport =================================================================
    # Run this step only after the required metadata has imported successfully
    if (level == "transcripts") {
        txOut <- TRUE
    } else {
        txOut <- FALSE
    }

    # tx2gene
    tx2geneFile <- file.path(projectDir, "tx2gene.csv")
    if (file.exists(tx2geneFile)) {
        # CSV file always takes priority
        tx2gene <- readTx2gene(tx2geneFile)
    } else if (is_a_string(gffFile)) {
        # Fall back to using GFF, if declared
        tx2gene <- makeTx2geneFromGFF(gffFile)
    } else {
        stop(paste(
            "tx2gene assignment failure.",
            "tx2gene.csv or GFF file are required."
        ))
    }

    txi <- .tximport(
        sampleDirs = sampleDirs,
        type = caller,
        txIn = TRUE,
        txOut = txOut,
        tx2gene = tx2gene
    )

    # TPM/abundance: transcripts per million
    tpm <- txi[["abundance"]]
    assert_is_matrix(tpm)
    # counts: raw counts
    counts <- txi[["counts"]]
    assert_is_matrix(counts)
    # length: average transcript length
    length <- txi[["length"]]
    assert_is_matrix(length)
    # countsFromAbundance: character describing TPM
    countsFromAbundance <- txi[["countsFromAbundance"]]
    assert_is_character(countsFromAbundance)

    # Ensure `colData` matches the colnames in `assays()`
    colData <- colData[colnames(counts), , drop = FALSE]

    # Row data =================================================================
    rowRangesMetadata <- NULL
    if (is_a_string(gffFile)) {
        rowRanges <- makeGRangesFromGFF(gffFile)
    } else if (is_a_string(organism)) {
        # ah: AnnotationHub
        ah <- makeGRangesFromEnsembl(
            organism = organism,
            format = level,
            genomeBuild = genomeBuild,
            release = ensemblRelease,
            metadata = TRUE
        )
        assert_is_list(ah)
        assert_are_identical(names(ah), c("data", "metadata"))
        rowRanges <- ah[["data"]]
        assert_is_all_of(rowRanges, "GRanges")
        rowRangesMetadata <- ah[["metadata"]]
        assert_is_data.frame(rowRangesMetadata)
    } else {
        rowRanges <- emptyRanges(rownames(counts))
    }

    # Gene-level variance stabilization ========================================
    normalized <- NULL
    rlog <- NULL
    vst <- NULL
    if (level == "genes") {
        message(paste(
            "Generating DESeqDataSet with DESeq2",
            packageVersion("DESeq2")
        ))
        dds <- DESeqDataSetFromTximport(
            txi = txi,
            colData = colData,
            # Use an empty design formula
            design = ~ 1  # nolint
        )
        # Suppress warning about empty design formula
        dds <- suppressWarnings(DESeq(dds))
        normalized <- counts(dds, normalized = TRUE)
        if (nrow(colData) <= transformationLimit) {
            message("Applying rlog transformation")
            rlog <- assay(rlog(dds))
            message("Applying variance stabilizing transformation")
            vst <- assay(varianceStabilizingTransformation(dds))
        } else {
            warning(paste(
                "Dataset contains many samples.",
                "Skipping variance stabilizing transformations."
            ))
        }
    }

    # Assays ===================================================================
    assays <- list(
        "counts" = counts,
        "tpm" = tpm,
        "length" = length,
        "normalized" = normalized,
        "rlog" = rlog,
        "vst" = vst
    )

    # Metadata =================================================================
    metadata <- list(
        "version" = packageVersion,
        "level" = level,
        "caller" = caller,
        "countsFromAbundance" = countsFromAbundance,
        "uploadDir" = uploadDir,
        "sampleDirs" = sampleDirs,
        "sampleMetadataFile" = as.character(sampleMetadataFile),
        "projectDir" = projectDir,
        "template" = template,
        "runDate" = runDate,
        "interestingGroups" = interestingGroups,
        "organism" = as.character(organism),
        "genomeBuild" = as.character(genomeBuild),
        "ensemblRelease" = as.integer(ensemblRelease),
        "rowRangesMetadata" = rowRangesMetadata,
        "gffFile" = as.character(gffFile),
        "tx2gene" = tx2gene,
        "lanes" = lanes,
        "yaml" = yaml,
        "dataVersions" = dataVersions,
        "programVersions" = programVersions,
        "bcbioLog" = bcbioLog,
        "bcbioCommandsLog" = bcbioCommandsLog,
        "allSamples" = allSamples,
        "call" = match.call()
    )
    # Add user-defined custom metadata, if specified
    if (length(dots)) {
        assert_are_disjoint_sets(metadata, dots)
        metadata <- c(metadata, dots)
    }

    # Return ===================================================================
    .new.bcbioRNASeq(
        assays = assays,
        rowRanges = rowRanges,
        colData = colData,
        metadata = metadata,
        transgeneNames = transgeneNames,
        spikeNames = spikeNames
    )
}



# TODO Consider erroring on `sampleID` and `description` columns in colData
# Validity Checks ==============================================================
setValidity(
    "bcbioRNASeq",
    function(object) {
        stopifnot(metadata(object)[["version"]] >= 0.2)
        assert_is_all_of(object, "RangedSummarizedExperiment")
        assert_has_dimnames(object)

        # Assays ===============================================================
        # Note that `rlog` and `vst` DESeqTransform objects are optional
        assert_is_subset(requiredAssays, assayNames(object))
        # Check that all assays are matrices
        assayCheck <- vapply(
            X = assays(object),
            FUN = is.matrix,
            FUN.VALUE = logical(1L),
            USE.NAMES = TRUE
        )
        if (!all(assayCheck)) {
            stop(paste(
                paste(
                    "Assays that are not matrix:",
                    toString(names(assayCheck[!assayCheck]))
                ),
                bcbioBase::updateMessage,
                sep = "\n"
            ))
        }

        # Gene-level specific
        if (metadata(object)[["level"]] == "genes") {
            assert_is_subset("normalized", names(assays(object)))
        }

        # Row data =============================================================
        assert_is_all_of(rowRanges(object), "GRanges")
        assert_is_all_of(rowData(object), "DataFrame")

        # Column data ==========================================================
        # metrics
        if (is.data.frame(metadata(object)[["metrics"]])) {
            stop(paste(
                "`metrics` saved in `metadata()` instead of `colData()`.",
                bcbioBase::updateMessage
            ))
        }
        assert_are_disjoint_sets(
            x = colnames(colData(object)),
            y = legacyMetricsCols
        )

        # Metadata =============================================================
        metadata <- metadata(object)

        # Detect legacy slots
        legacyMetadata <- c(
            "design",
            "ensemblVersion",
            "gtf",
            "gtfFile",
            "missingGenes",
            "programs",
            "yamlFile"
        )
        intersect <- intersect(names(metadata), legacyMetadata)
        if (length(intersect)) {
            stop(paste(
                paste(
                    "Legacy metadata slots:",
                    toString(sort(intersect))
                ),
                bcbioBase::updateMessage,
                sep = "\n"
            ))
        }

        # Class checks (order independent)
        requiredMetadata <- list(
            "allSamples" = "logical",
            "bcbioCommandsLog" = "character",
            "bcbioLog" = "character",
            "caller" = "character",
            "countsFromAbundance" = "character",
            "date" = "Date",
            "devtoolsSessionInfo" = "session_info",
            "ensemblRelease" = "integer",
            "genomeBuild" = "character",
            "gffFile" = "character",
            "interestingGroups" = "character",
            "lanes" = "integer",
            "level" = "character",
            "organism" = "character",
            "programVersions" = "tbl_df",
            "projectDir" = "character",
            "runDate" = "Date",
            "sampleDirs" = "character",
            "sampleMetadataFile" = "character",
            "template" = "character",
            "tx2gene" = "data.frame",
            "uploadDir" = "character",
            "utilsSessionInfo" = "sessionInfo",
            "version" = "package_version",
            "wd" = "character",
            "yaml" = "list"
        )
        classChecks <- invisible(mapply(
            name <- names(requiredMetadata),
            expected <- requiredMetadata,
            MoreArgs = list(metadata = metadata),
            FUN = function(name, expected, metadata) {
                actual <- class(metadata[[name]])
                if (!length(intersect(expected, actual))) {
                    FALSE
                } else {
                    TRUE
                }
            },
            SIMPLIFY = TRUE,
            USE.NAMES = TRUE
        ))
        if (!all(classChecks)) {
            print(classChecks)
            stop(paste(
                "Metadata class checks failed.",
                bcbioBase::updateMessage,
                sep = "\n"
            ))
        }

        # Additional assert checks
        # caller
        assert_is_subset(
            x = metadata[["caller"]],
            y = validCallers
        )
        # level
        assert_is_subset(
            x = metadata[["level"]],
            y = c("genes", "transcripts")
        )
        # tx2gene
        tx2gene <- metadata[["tx2gene"]]
        assertIsTx2gene(tx2gene)

        TRUE
    }
)