#' Runs SAVER
#'
#' Recovers expression using the SAVER method.
#'
#' The SAVER method starts by estimating the prior mean and variance for the
#' true expression level for each gene and cell. The prior mean is obtained
#' through predictions from a LASSO Poisson regression for each gene
#' implemented using the \code{glmnet} package. Then, the variance is estimated
#' through maximum likelihood assuming constant variance, Fano factor, or
#' coefficient of variation variance structure for each gene. The posterior
#' distribution is calculated and the posterior mean is reported as the SAVER
#' estimate.
#'
#' @param x An expression count matrix. The rows correspond to genes and
#' the columns correspond to cells. Can be sparse.
#'
#' @param do.fast Approximates the prediction step. Default is TRUE.
#'
#' @param ncores Number of cores to use. Default is 1.
#'
#' @param size.factor Vector of cell size normalization factors.
#' If \code{x} is already normalized or normalization is not desired, use
#' \code{size.factor = 1}. Default uses mean library size normalization.
#'
#' @param npred Number of genes for regression prediction. Selects the top
#' \code{npred} genes in terms of mean expression for regression prediction.
#' Default is all genes.
#'
#' @param pred.cells Indices of cells to perform regression prediction.
#' Default is all cells.
#'
#' @param pred.genes Indices of specific genes to perform regression
#' prediction. Overrides \code{npred}. Default is all genes.
#'
#' @param pred.genes.only Return expression levels of only \code{pred.genes}.
#' Default is FALSE (returns expression levels of all genes).
#'
#' @param null.model Whether to use mean gene expression as prediction.
#'
#' @param mu Matrix of prior means.
#'
#' @param estimates.only Only return SAVER estimates. Default is FALSE.
#'
#'
#' @return If `estimates.only = TRUE`, then a matrix of SAVER estimates.
#'
#' If `estimates.only = FALSE`, a list with the following components
#' \item{\code{estimate}}{Recovered (normalized) expression.}
#' \item{\code{se}}{Standard error of estimates.}
#' \item{\code{info}}{Information about dataset.}
#'
#' The \code{info} element is a list with the following components:
#' \item{\code{size.factor}}{Size factor used for normalization.}
#' \item{\code{maxcor}}{Maximum absolute correlation for each gene. 2 if not
#' calculated}
#' \item{\code{lambda.max}}{Smallest value of lambda which gives the null
#' model.}
#' \item{\code{lambda.min}}{Value of lambda from which the prediction model is
#' used}
#' \item{\code{sd.cv}}{Difference in the number of standard deviations in
#' deviance between the model with lowest cross-validation error and the null
#' model}
#' \item{\code{pred.time}}{Time taken to generate predictions.}
#' \item{\code{var.time}}{Time taken to estimate variance.}
#' \item{\code{maxcor}}{Maximum absolute correlation cutoff used to determine
#' if a gene should be predicted.}
#' \item{\code{lambda.coefs}}{Coefficients for estimating lambda with lowest
#' cross-validation error.}
#' \item{\code{total.time}}{Total time for SAVER estimation.}
#'
#' @examples
#' data("linnarsson")
#'
#' \dontrun{
#' system.time(linnarsson_saver <- saver(linnarsson, ncores = 12))
#' }
#'
#' # predictions for top 5 highly expressed genes
#' \dontrun{
#' saver2 <- saver(linnarsson, npred = 5)
#' }
#'
#' # predictions for certain genes
#' \dontrun{
#' genes <- c("Thy1", "Mbp", "Stim2", "Psmc6", "Rps19")
#' genes.ind <- which(rownames(linnarsson) %in% genes)
#' saver3 <- saver(linnarsson, pred.genes = genes.ind)
#' }
#'
#' # return only certain genes
#' \dontrun{
#' saver4 <- saver(linnarsson, pred.genes = genes.ind, pred.genes.only = TRUE)
#' }
#'
#' @import foreach methods
#'
#' @export
saver <- function(x, do.fast = TRUE, ncores = 1, size.factor = NULL,
npred = NULL, pred.cells = NULL, pred.genes = NULL,
pred.genes.only = FALSE, null.model = FALSE, mu = NULL,
estimates.only = FALSE) {
if (!is.null(mu)) {
mu <- check.mu(x, mu)
}
x <- clean.data(x)
np <- dim(x)
ngenes <- as.integer(np[1])
ncells <- as.integer(np[2])
message(ngenes, " genes, ", ncells, " cells")
# assign size factor
sf.out <- calc.size.factor(x, size.factor, ncells)
sf <- sf.out[[1]]
scale.sf <- sf.out[[2]]
gene.names <- rownames(x)
cell.names <- colnames(x)
# Set up parallel backend
cl.create <- FALSE
if (foreach::getDoParWorkers() > 1) {
if (ncores > 1 & foreach::getDoParWorkers() != ncores) {
message(paste("Parallel backend already registered and is inconsistent",
"with ncores."))
}
ncores <- getDoParWorkers()
} else {
if (ncores > 1) {
cl.create <- TRUE
cl <- parallel::makeCluster(ncores, outfile = "")
doParallel::registerDoParallel(cl)
on.exit({
parallel::stopCluster(cl)
foreach::registerDoSEQ()
})
}
}
# if prior means are provided
if (!is.null(mu)) {
out <- saver.fit.mean(x, ncores, sf, scale.sf, mu, ngenes = nrow(x),
ncells = ncol(x), gene.names, cell.names,
estimates.only)
} else if (null.model) {
out <- saver.fit.null(x, ncores, sf, scale.sf, ngenes = nrow(x),
ncells = ncol(x), gene.names, cell.names,
estimates.only)
} else {
# assign pred.cells and pred.genes
pred.cells <- get.pred.cells(pred.cells, ncells)
pred.genes <- get.pred.genes(x, pred.genes, npred, ngenes)
npred <- length(pred.genes)
good.genes <- which(Matrix::rowMeans(x) >= 0.1)
x.est <- t(as.matrix(log(sweep(x[good.genes, ] + 1, 2, sf, "/"))))
if (pred.genes.only) {
x <- x[pred.genes, , drop = FALSE]
pred.genes <- 1:nrow(x)
gene.names <- rownames(x)
cell.names <- colnames(x)
}
out <- saver.fit(x, x.est, do.fast, ncores, sf, scale.sf, pred.genes,
pred.cells, null.model, ngenes = nrow(x),
ncells = ncol(x), gene.names, cell.names,
estimates.only)
}
message("Done!")
message("Finish time: ", Sys.time())
message("Total time: ", format(out$info$total.time))
if (!estimates.only) {
class(out) <- "saver"
out
} else {
out$estimate
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.