R/BASiCS-estimate.R

#' Estimate BASiCS simulation parameters
#'
#' Estimate simulation parameters for the BASiCS simulation from a real dataset.
#'
#' @param counts either a counts matrix or a SingleCellExperiment object
#'        containing count data to estimate parameters from.
#' @param spike.info data.frame describing spike-ins with two columns: "Name"
#'        giving the names of the spike-in features (must match
#'        \code{rownames(counts)}) and "Input" giving the number of input
#'        molecules.
#' @param batch vector giving the batch that each cell belongs to.
#' @param n total number of MCMC iterations. Must be \code{>= max(4, thin)} and
#' a multiple of \code{thin}.
#' @param thin thining period for the MCMC sampler. Must be \code{>= 2}.
#' @param burn burn-in period for the MCMC sampler. Must be in the range
#' \code{1 <= burn < n} and a multiple of \code{thin}.
#' @param regression logical. Whether to use regression to identify
#' over-dispersion. See \code{\link[BASiCS]{BASiCS_MCMC}} for details.
#' @param params BASiCSParams object to store estimated values in.
#' @param verbose logical. Whether to print progress messages.
#' @param progress logical. Whether to print additional BASiCS progress
#' messages.
#' @param ... Optional parameters passed to \code{\link[BASiCS]{BASiCS_MCMC}}.
#'
#' @details
#' This function is just a wrapper around \code{\link[BASiCS]{BASiCS_MCMC}} that
#' takes the output and converts it to a BASiCSParams object. Either a set of
#' spike-ins or batch information (or both) must be supplied. If only batch
#' information is provided there must be at least two batches. See
#' \code{\link[BASiCS]{BASiCS_MCMC}} for details.
#'
#' @return BASiCSParams object containing the estimated parameters.
#'
#' @examples
#' \dontrun{
#' # Load example data
#' library(scater)
#' data("sc_example_counts")
#'
#' spike.info <- data.frame(Name = rownames(sc_example_counts)[1:10],
#'                          Input = rnorm(10, 500, 200),
#'                          stringsAsFactors = FALSE)
#' params <- BASiCSEstimate(sc_example_counts[1:100, 1:30],
#'                          spike.info)
#' params
#' }
#' @export
BASiCSEstimate <- function(counts, spike.info = NULL, batch = NULL,
                           n = 20000, thin = 10, burn = 5000,
                           regression = TRUE,
                           params = newBASiCSParams(), verbose = TRUE,
                           progress = TRUE, ...) {
    UseMethod("BASiCSEstimate")
}

#' @rdname BASiCSEstimate
#' @export
BASiCSEstimate.SingleCellExperiment <- function(counts, spike.info = NULL,
                                                batch = NULL, n = 20000,
                                                thin = 10, burn = 5000,
                                                regression = TRUE,
                                                params = newBASiCSParams(),
                                                verbose = TRUE, progress = TRUE,
                                                ...) {
    counts <- BiocGenerics::counts(counts)
    BASiCSEstimate(counts, params)
}

#' @rdname BASiCSEstimate
#' @export
BASiCSEstimate.matrix <- function(counts, spike.info = NULL, batch = NULL,
                                  n = 20000, thin = 10, burn = 5000,
                                  regression = TRUE,
                                  params = newBASiCSParams(), verbose = TRUE,
                                  progress = TRUE, ...) {

    checkmate::assertClass(params, "BASiCSParams")
    checkmate::assertMatrix(counts, mode = "numeric", any.missing = FALSE,
                            min.rows = 1, min.cols = 1, row.names = "unique",
                            col.names = "unique")
    if (is.null(spike.info) && is.null(batch)) {
        stop("At least one of spike.info and batch must be provided")
    }
    if (!is.null(spike.info)) {
        checkmate::assertDataFrame(spike.info, any.missing = FALSE,
                                   min.rows = 1, ncols = 2)
        if (!all(colnames(spike.info) == c("Name", "Input"))) {
            stop("spike.info must have columns named 'Name' and 'Input'")
        }
        checkmate::assertCharacter(spike.info$Name, min.chars = 1,
                                   unique = TRUE)
        checkmate::assertNumeric(spike.info$Input, lower = 0, finite = TRUE)
    } #else {
    #    spike.info <- data.frame(Name = c(), Input = c())
    #}
    if (!is.null(batch)) {
        checkmate::assertIntegerish(batch, lower = 0, any.missing = FALSE,
                                    len = ncol(counts))
        if (is.null(spike.info) && length(unique(batch)) == 1) {
            stop("If spike.info is not provided there must be at least two ",
                 "batches")
        }
    } else {
        batch <- rep(1, ncol(counts))
    }
    checkmate::assertInt(thin, lower = 2)
    checkmate::assertInt(n, lower = max(4, thin))
    if ((n %% thin) != 0) {
        stop("'n' must be a multiple of 'thin'")
    }
    checkmate::assertInt(burn, lower = 1, upper = n - 1)
    if ((burn %% thin) != 0) {
        stop("'burn' must be a multiple of 'thin'")
    }
    checkmate::assertFlag(regression)

    is.spike <- rownames(counts) %in% spike.info$Name

    BASiCS.data <- suppressMessages(
                       BASiCS::newBASiCS_Data(counts, is.spike, spike.info,
                                              batch)
    )

    with.spikes <- sum(is.spike) > 1

    if (verbose) {
        mcmc <- BASiCS::BASiCS_MCMC(Data = BASiCS.data, N = n, Thin = thin,
                                    Burn = burn, Regression = regression,
                                    PrintProgress = progress,
                                    WithSpikes = with.spikes, ...)
    } else {
        mcmc <- suppressMessages(
                    BASiCS::BASiCS_MCMC(Data = BASiCS.data, N = n, Thin = thin,
                                        Burn = burn, Regression = regression,
                                        PrintProgress = progress,
                                        WithSpikes = with.spikes, ...)
        )
    }

    mcmc.summ <- BASiCS::Summary(mcmc)

    means <- BASiCS::displaySummaryBASiCS(mcmc.summ, Param = "mu")[, 1]
    deltas <- BASiCS::displaySummaryBASiCS(mcmc.summ, Param = "delta")[, 1]
    if (!is.null(spike.info)) {
        phis <- BASiCS::displaySummaryBASiCS(mcmc.summ, Param = "phi")[, 1]
    } else {
        phis <- rep(1, ncol(counts))
    }
    ss <- BASiCS::displaySummaryBASiCS(mcmc.summ, Param = "s")[, 1]
    thetas <- BASiCS::displaySummaryBASiCS(mcmc.summ, Param = "theta")[, 1]

    params <- setParams(params,
                        nGenes = sum(!is.spike),
                        batchCells = as.vector(table(batch)),
                        gene.params = data.frame(Mean = means, Delta = deltas),
                        nSpikes = sum(is.spike),
                        spike.means = ifelse(!is.null(spike.info),
                                             spike.info$Input, numeric()),
                        cell.params = data.frame(Phi = phis, S = ss),
                        theta = thetas)

    return(params)
}
Granoia/splatter-mod documentation built on May 28, 2019, 12:31 a.m.