R/SCE-functions.R

#' Add feature statistics
#'
#' Add additional feature statistics to a SingleCellExperiment object
#'
#' @param sce SingleCellExperiment to add feature statistics to.
#' @param value the expression value to calculate statistics for. Options are
#'        "counts", "cpm", "tpm" or "fpkm". The values need to exist in the
#'        given SingleCellExperiment.
#' @param log logical. Whether to take log2 before calculating statistics.
#' @param offset offset to add to avoid taking log of zero.
#' @param no.zeros logical. Whether to remove all zeros from each feature before
#'        calculating statistics.
#'
#' @details
#' Currently adds the following statistics: mean, variance, coefficient of
#' variation, median and median absolute deviation. Statistics are added to
#' the \code{\link{rowData}} slot and are named \code{Stat[Log]Value[No0]} where
#' \code{Log} and \code{No0} are added if those arguments are true.
#' UpperCamelCase is used to differentiate these columns from those added by
#' analysis packages.
#'
#' @return SingleCellExperiment with additional feature statistics
#'
#' @importFrom SummarizedExperiment rowData rowData<-
addFeatureStats <- function(sce, value = c("counts", "cpm", "tpm", "fpkm"),
                            log = FALSE, offset = 1, no.zeros = FALSE) {

    checkmate::assertClass(sce, "SingleCellExperiment")
    checkmate::assertLogical(log)
    checkmate::assertNumber(offset, lower = 0)
    checkmate::assertLogical(no.zeros)
    value <- match.arg(value)

    switch(value,
           counts = {
               values = BiocGenerics::counts(sce)
               suffix <- "Counts"
           },
           cpm = {
               values = SingleCellExperiment::cpm(sce)
               suffix <- "CPM"
           },
           tpm = {
               values = SingleCellExperiment::tpm(sce)
               suffix <- "TPM"
           },
           fpkm = {
               values = SummarizedExperiment::assays(sce)$fpkm
               suffix <- "FPKM"
           }
    )

    if (no.zeros) {
        values[values == 0] <- NA
        suffix = paste0(suffix, "No0")
    }

    if (log) {
        values = log2(values + offset)
        suffix = paste0("Log", suffix)
    }

    mean.str <- paste0("Mean", suffix)
    var.str  <- paste0("Var",  suffix)
    cv.str   <- paste0("CV",   suffix)
    med.str  <- paste0("Med",  suffix)
    mad.str  <- paste0("MAD",  suffix)

    rowData(sce)[, mean.str] <- rowMeans(values, na.rm = TRUE)
    rowData(sce)[, var.str]  <- matrixStats::rowVars(values, na.rm = TRUE)
    rowData(sce)[, cv.str]   <- sqrt(rowData(sce)[, var.str]) /
        rowData(sce)[, mean.str]
    rowData(sce)[, med.str]  <- matrixStats::rowMedians(values, na.rm = TRUE)
    rowData(sce)[, mad.str]  <- matrixStats::rowMads(values, na.rm = TRUE)

    return(sce)
}

#' Add gene lengths
#'
#' Add gene lengths to an SingleCellExperiment object
#'
#' @param sce SingleCellExperiment to add gene lengths to.
#' @param method Method to use for creating lengths.
#' @param loc Location parameter for the generate method.
#' @param scale Scale parameter for the generate method.
#' @param lengths Vector of lengths for the sample method.
#'
#' @details
#' This function adds simulated gene lengths to the
#' \code{\link{rowData}} slot of a
#' \code{\link[SingleCellExperiment]{SingleCellExperiment}} object that can be
#' used for calculating length normalised expression values such as TPM or FPKM.
#' The \code{generate} method simulates lengths using a (rounded) log-normal
#' distribution, with the default \code{loc} and \code{scale} parameters based
#' on human protein-coding genes. Alternatively the \code{sample} method can be
#' used which randomly samples lengths (with replacement) from a supplied
#' vector.
#'
#' @return SingleCellExperiment with added gene lengths
#' @examples
#' # Default generate method
#' sce <- simpleSimulate()
#' sce <- addGeneLengths(sce)
#' head(rowData(sce))
#' # Sample method (human coding genes)
#' \dontrun{
#' library(TxDb.Hsapiens.UCSC.hg19.knownGene)
#' library(GenomicFeatures)
#' txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
#' tx.lens <- transcriptLengths(txdb, with.cds_len = TRUE)
#' tx.lens <- tx.lens[tx.lens$cds_len > 0, ]
#' gene.lens <- max(splitAsList(tx.lens$tx_len, tx.lens$gene_id))
#' sce <- addGeneLengths(sce, method = "sample", lengths = gene.lens)
#' }
#' @export
#' @importFrom stats rlnorm
addGeneLengths <- function(sce, method = c("generate", "sample"), loc = 7.9,
                           scale = 0.7, lengths = NULL) {

    method <- match.arg(method)
    checkmate::assertClass(sce, "SingleCellExperiment")
    checkmate::assertNumber(loc)
    checkmate::assertNumber(scale, lower = 0)
    checkmate::assertNumeric(lengths, lower = 0, null.ok = TRUE)

    switch(method,
           generate = {
               sim.lengths <- rlnorm(nrow(sce), meanlog = loc, sdlog = scale)
               sim.lengths <- round(sim.lengths)
           },
           sample = {
               if (is.null(lengths)) {
                   stop("Lengths must be supplied to use the sample method.")
               } else {
                   sim.lengths <- sample(lengths, nrow(sce), replace = TRUE)
               }
           }
    )

    rowData(sce)$Length <- sim.lengths

    return(sce)
}
Granoia/splatter-mod documentation built on May 28, 2019, 12:31 a.m.