slalom: Factorial Latent Variable Modeling of Single-Cell RNA-Seq Data

# Methods for the (Rcpp)SlalomModel class

################################################################################
### New SlalomModel class object

#' Create a new SlalomModel object.
#'
#' Slalom fits relatively complicated hierarchical Bayesian factor analysis
#' models with data and results stored in a \code{"SlalomModel"} object. This
#' function builds a new \code{"SlalomModel"} object from minimal inputs.
#'
#' @param object \code{"SingleCellExperiment"} object N x G expression data matrix (cells x genes)
#' @param genesets a \code{"GeneSetCollection"} object containing annotated
#' gene sets
#' @param n_hidden number of hidden factors to fit in the model (2-5 recommended)
#' @param prune_genes logical, should genes that are not annotated to any gene
#' sets be filtered out? If \code{TRUE}, then any genes with zero variance in
#' expression are also filtered out.
#' @param min_genes scalar, minimum number of genes required in order to retain
#' a gene set for analysis
#' @param design numeric design matrix providing values for covariates to fit in
#' the model (rows represent cells)
#' @param anno_fpr numeric(1), false positive rate (FPR) for assigning genes to
#' factors (pathways); default is 0.01
#' @param anno_fnr numeric(1), false negative rate (FNR) for assigning genes to
#' factors (pathways); default is 0.001
#' @param assay_name character(1), the name of the \code{assay} of the
#' \code{object} to use as expression values. Default is \code{logcounts},
#' assumed to be normalised log2-counts-per-million values or equivalent.
#' @param verbose logical(1), should information about what's going be printed
#' to screen?
#'
#' @return a new Rcpp_SlalomModel object
#'
#' @details
#' This function builds and returns the object, checking for validity, which
#' includes checking that the input data is of consistent dimensions.
#'
#' @importFrom GSEABase geneIds
#' @importFrom GSEABase getGmt
#' @import Rcpp
#' @import RcppArmadillo
#' @importFrom methods new
#' @importFrom methods validObject
#' @import SingleCellExperiment
#' @import SummarizedExperiment
#' @export
#'
#' @examples
#' gmtfile <- system.file("extdata", "reactome_subset.gmt", package = "slalom")
#' genesets <- GSEABase::getGmt(gmtfile)
#' data("mesc")
#' model <- newSlalomModel(mesc, genesets, n_hidden = 5, min_genes = 10)
#'
#' exprsfile <- system.file("extdata", "mesc.csv", package = "slalom")
#' mesc_mat <- as.matrix(read.csv(exprsfile))
#' sce <- SingleCellExperiment::SingleCellExperiment(assays = list(logcounts = mesc_mat))
#' # model2 <- newSlalomModel(mesc_mat, genesets, n_hidden = 5, min_genes = 10)
#'
newSlalomModel <- function(
    object, genesets, n_hidden = 5, prune_genes = TRUE, min_genes = 15,
    design = NULL, anno_fpr = 0.01, anno_fnr = 0.001, assay_name = "logcounts",
    verbose = TRUE) {

    if (!methods::is(object, "SingleCellExperiment"))
        stop("object must be a SingleCellExperiment")
    Y <- t(SummarizedExperiment::assay(object, assay_name))
    if (is.null(Y))
        stop("object must contain 'logcounts' in assays")
    if (is.null(rownames(object)))
        stop("rownames(object) is NULL: expecting gene identifiers as rownames")
    colnames(Y) <- rownames(object)
    if (is.null(colnames(object)))
        rownames(Y) <- as.character(1:ncol(object))
    else
        rownames(Y) <- colnames(object)
    ## convert GeneSetCollection into I matrix of indicators
    I <- lapply(genesets, function(x) {colnames(Y) %in% GSEABase::geneIds(x)})
    names(I) <- names(genesets)
    I <- as.matrix(as.data.frame(I))
    rownames(I) <- colnames(Y)
    ## check for zero-variance genes
    n_cells <- nrow(Y)
    var_y_col <- (n_cells / (n_cells - 1) *
                      (colMeans(Y * Y) - colMeans(Y) ^ 2))
    if (prune_genes) {
        Y <- Y[, var_y_col > 0L]
        I <- I[var_y_col > 0L,]
    } else {
        if (!all(var_y_col > 0L) )
            stop("Some genes have zero variance in expression. Please filter these out.")
    }
    ## filter genesets on min_genes
    n_sets_pass <- colSums(I) >= min_genes
    if (sum(n_sets_pass) > 0L) {
        I <- I[, n_sets_pass]
        if (verbose)
            cat(sum(n_sets_pass), "annotated factors retained; ",
                length(genesets) - sum(n_sets_pass), "annotated factors dropped.\n")
    } else
        stop("No gene sets found containing more than min_genes genes.")
    ## prune genes if desired
    if (prune_genes) {
        keep_gene <- (rowSums(I) > 0L)
        I <- I[keep_gene, ]
    }
    retained_genes <- rownames(I)
    if (verbose)
        cat(length(retained_genes), " genes retained for analysis.\n")
    ## add hidden factors
    if (n_hidden > 0L) {
        hidden_factors <- matrix(TRUE, nrow = nrow(I), ncol = n_hidden)
        colnames(hidden_factors) <- paste0("hidden", sprintf("%02d", 1:n_hidden))
        I <- cbind(hidden_factors, I)
    }

    ## create new SlalomModel object for output
    ## set some attributes that we need frequently for the updates, inculuding
    ## number and idx of hidden (unannotated) terms
    pi_init <- I * 1L
    if (!is.null(design)) {
        if (is.null(colnames(design)))
            colnames(design) <- paste0("known", sprintf("%02d", 1:ncol(design)))
        tmpmat <- matrix(1, nrow = nrow(pi_init), ncol = ncol(design))
        colnames(tmpmat) <- colnames(design)
        pi_init <- cbind(tmpmat, pi_init)
    }
    pi_init[pi_init > 0.5] <- 1 - anno_fpr
    pi_init[pi_init < 0.5] <- anno_fnr
    out <- .newSlalom(Y[, retained_genes], pi_init, n_hidden, design)
    out$termNames <- c(colnames(design), colnames(I))
    out$cellNames <- rownames(Y)
    out$geneNames <- retained_genes
    out$pretrain_order <- seq_len(out$K) - 1
    ## Check validity of object
    #validObject(out)
    out
}


.newSlalom <- function(Y, pi_init, n_hidden, design = NULL) {
    n_known <- ifelse(is.null(design), 0, ncol(design))
    if (is.null(design))
        x_init <- matrix(0, nrow = nrow(Y), ncol = ncol(pi_init))
    else
        x_init <- cbind(
            design,
            matrix(0, nrow = nrow(Y), ncol = ncol(pi_init) - ncol(design)))
    Rcpp::loadModule("SlalomModel")
    out <- new(
        "Rcpp_SlalomModel",
        Y_init = Y,
        pi_init = pi_init,
        X_init = x_init,
        W_init = matrix(0, nrow = ncol(Y), ncol(pi_init)),
        prior_alpha = c(1e-03, 1e-03),
        prior_epsilon = c(1e-03, 1e-03)
    )
    ## add more data to the object
    out$Z_E1 <- out$X_E1 %*% t(out$W_E1 * pi_init)
    out$iUnannotatedDense <- seq(from = n_known + 1, to = n_known + n_hidden)
    out$nAnnotated <- ncol(pi_init) - n_hidden - n_known
    out$nHidden <- n_hidden
    out$nKnown <- n_known
    ## add design matrix as known factors if supplied
    if (!is.null(design)) {
        if (nrow(design) != out$N)
            stop("Number of rows of design does not match number of cells.")
        else {
            out$Known <- design
        }
    }
    out
}


################################################################################
### Initialize a SlalomModel class object

#' Initialize a SlalomModel object
#'
#' Initialize a SlalomModel with sensible starting values for parameters before
#' training the model.
#'
#' @param object a \code{Rcpp_SlalomModel} object
#' @param alpha_priors numeric(2) giving alpha and beta hyperparameters for a
#' gamma prior distribution for alpha parameters (precision of factor weights)
#' @param epsilon_priors numeric(2) giving alpha and beta hyperparameters for a
#' gamma prior distribution for noise precision parameters
#' @param noise_model character(1) defining noise model, defaults to "gauss" for
#' Gaussian noise model
#' @param seed integer(1) value supplying a random seed to make results
#' reproducible (default is \code{NULL})
#' @param pi_prior numeric matrix (genes x factors) giving prior probability of
#' a gene being active for a factor
#' @param n_hidden integer(1), number of hidden factors in model. Required if
#' \code{pi_prior} is not \code{NULL}, ignored otherwise.
#' @param design matrix of known factors (covariates) to fit in the
#' model. Optional if \code{pi_prior} is not \code{NULL}, ignored otherwise.
#' @param verbose logical(1), should messages be printed about what the function
#' is doing? Default is \code{TRUE}.
#' @param save_init logical(1), save the initial X values (factor states for
#' each cell) in the object? Default is \code{FALSE} as this is potentially a
#' large N (number of cell) by K (number of factors) matrix.
#'
#' @details It is strongly recommended to use \code{\link{newSlalomModel}} to
#' create the \code{\link{SlalomModel}} object prior to applying
#' \code{initSlalom}.
#'
#' @return an `Rcpp_SlalomModel` object
#'
#' @name initSlalom
#' @rdname initSlalom
#'
#' @author Davis McCarthy
#' @importFrom rsvd rpca
#' @importFrom stats runif
#' @importFrom stats rnorm
#' @importFrom stats prcomp
#' @export
#' @examples
#' gmtfile <- system.file("extdata", "reactome_subset.gmt", package = "slalom")
#' genesets <- GSEABase::getGmt(gmtfile)
#' data("mesc")
#' model <- newSlalomModel(mesc, genesets, n_hidden = 5, min_genes = 10)
#' model <- initSlalom(model)
initSlalom <- function(
    object, alpha_priors = NULL, epsilon_priors = NULL,  noise_model = "gauss",
    seed = NULL, pi_prior = NULL, n_hidden = NULL, design = NULL,
    verbose = FALSE, save_init = FALSE) {
    if (!methods::is(object, "Rcpp_SlalomModel"))
        stop("object must be of class Rcpp_SlalomModel")
    n_cells <- nrow(object$Y)
    var_y_col <- (n_cells / (n_cells - 1) *
                      (colMeans(object$Y * object$Y) - colMeans(object$Y) ^ 2))
    if (!all(var_y_col > 0L))
        stop("Some genes have zero variance in expression. Please filter these out.")
    ## if Pi priors are supplied, need to redefine object
    if (!is.null(pi_prior)) {
        if (is.null(n_hidden))
            stop("If pi_prior is supplied, n_hidden and n_known must be provided.
                First n_known cols of pi_prior are taken as known covariates and next n_hidden cols are taken as hidden factors.")
        tnames <- object$termNames
        cnames <- object$cellNames
        gnames <- object$geneNames
        object <- .newSlalom(object$Y, pi_prior, n_hidden, design)
        object$termNames <- tnames
        object$cellNames <- cnames
        object$geneNames <- gnames
        object$pretrain_order <- seq_len(object$K) - 1
    }
    ## define noise model
    noise_model <- match.arg(noise_model, c("gauss")) ## future: support "hurdle", "poisson"
    object$noiseModel <- noise_model
    ## define priors for alpha and epsilon
    if (!is.null(alpha_priors)) {
        object$alpha_pa <- alpha_priors[0]
        object$alpha_pb <- alpha_priors[1]
        object$alpha_a <- rep(object$alpha_pa, object$K)
        object$alpha_b <- rep(object$alpha_pb, object$K)
        object$alpha_E1 <- object$alpha_b / object$alpha_a
        object$alpha_lnE1 <- digamma(object$alpha_a) - log(object$alpha_b)
    }
    if (!is.null(epsilon_priors)) {
        object$epsilon_pa <- epsilon_priors[0]
        object$epsilon_pb <- epsilon_priors[1]
        object$epsilon_a <- rep(object$epsilon_pa, object$G)
        object$epsilon_b <- rep(object$epsilon_pb, object$G)
        object$epsilon_E1 <- object$epsilon_b / object$epsilon_a
        object$epsilon_lnE1 <- digamma(object$epsilon_a) - log(object$epsilon_b)
    }

    ## pi is likelihood of link for genes x factors, defined in object from
    ## annotated gene sets with newSlalomModel
    ## define the number of genes that are "on" for each factor
    object$nOn <- colSums(object$Pi_E1 > 0.5)

    ## define initial expected values for indicator variables Z with observed
    ## annotations
    object$Z_E1 <- object$Y

    ## bits and bobs
    object$onF <- 1 / object$nScale # object@nScale
    # object@isExpressed <- (object@Z[["E1"]] > 0) * 1.0
    # object@numExpressed <- colSums(object@Z[["E1"]] > 0)

    ## initialize parameters using random PCA method
    object <- .initializePCARand(object, seed = seed, save_init = save_init)
    ## check validity of object and return
    validObject(object)
    object
}


.initializePCARand <- function(object, seed = NULL, save_init = FALSE) {
    if (!is.null(seed))
        set.seed(seed)
    # Zstd <- object$Z[["E1"]]
    ## initialize some objects
    Ystd <- object$Y
    unif_vars <- stats::runif(nrow(object$Pi_E1) * ncol(object$Pi_E1))
    Z_init <- object$Pi_E1
    Z_init[Z_init < 0.1] <- 0.1
    Z_init[Z_init > 0.9] <- 0.9
    Ion <- matrix(unif_vars, nrow = nrow(object$Pi_E1)) < Z_init
    object$X_E1 <- matrix(nrow = object$N, ncol = object$K)
    object$X_diagSigmaS <- matrix(1.0 / 2, nrow = object$N, ncol = object$K)
    object$W_E1 <- matrix(nrow = object$G, ncol = object$K)
    ## initialize gamma component of weights
    object$W_gamma0 <- object$Pi_E1
    object$W_gamma0[object$W_gamma0 <= 0.1] <- 0.1
    object$W_gamma0[object$W_gamma0 >= 0.9] <- 0.9
    object$W_gamma1 <- 1.0 - object$W_gamma0
    ## initialise known factors
    if (object$nKnown > 0L) {
        for (k in seq_len(object$nKnown)) {
            object$W_E1[, k] <- sqrt(1.0 / object$K) * stats::rnorm(object$G)
            object$X_diagSigmaS[, k] <- 1.0 / 2
        }
        object$X_E1[, seq_len(object$nKnown)] <- object$Known
    }
    ## initialise hidden (latent) factors
    if (object$nHidden > 0L) {
        for (iL in object$iUnannotatedDense) {
            object$X_E1[, iL] <- stats::rnorm(object$N)
            object$W_E1[, iL] <- sqrt(1.0 / object$K) * stats::rnorm(object$G)
        }
    }
    ## initialise annotated gene set factors
    for (k in seq_len(object$nAnnotated)) {
        k <- k + object$nKnown + object$nHidden
        object$W_E1[, k] <- sqrt(1.0 / object$K) * stats::rnorm(object$G)
        object$X_diagSigmaS[, k] <- 1.0 / 2
        if (sum(Ion[,k]) > 5) {
            if (object$N < 500)
                pca <- stats::prcomp(Ystd[, Ion[, k]], rank. = 1, retx = TRUE)
            else
                pca <- rsvd::rpca(Ystd[, Ion[, k]], k = 2, retx = TRUE)
            object$X_E1[, k] <- scale(pca$x[, 1])
        } else {
            object$X_E1[, k] <- stats::rnorm(object$N)
        }
    }
    ## save initial X values if desired
    if (save_init) {
        object$X_init <- object$X_E1
    }
    object
}



################################################################################
### Train a SlalomModel class object

#' Train a SlalomModel object
#'
#' Train a SlalomModel to infer model parameters.
#'
#' @param object a \code{Rcpp_SlalomModel} object
#' @param nIterations integer(1) maximum number of iterations to use in training
#' the model (default: 5000)
#' @param minIterations integer(1) minimum number of iterations to perform.
#' @param tolerance numeric(1) tolerance to allow between iterations
#' (default 1e-08)
#' @param forceIterations logical(1) should the model be forced to update
#' \code{nIteration} times?
#' @param shuffle logical(1) should the order in which factors are updated be
#' shuffled between iterations? Shuffling generally helps speed up convergence
#' so is recommended and defaults is \code{TRUE}
#' @param pretrain logical(1), should the model be "pre-trained" to achieve
#' faster convergence and obtain an initial update order? Recommended; default
#' is \code{TRUE}
#' @param verbose logical(1), should messages be printed about what the function
#' is doing? Default is \code{TRUE}.
#' @param seed integer(1) value supplying a random seed to make results
#' reproducible (default is \code{NULL})
#' @param drop_factors logical(1), should factors be dropped from the model if
#' the model determines them not to be relevant? Default is \code{TRUE}.
#'
#' @details Train the model using variational Bayes methods to infer parameters.
#'
#' @return an `Rcpp_SlalomModel` object
#'
#' @importFrom stats cor
#' @docType methods
#' @aliases train train,Rcpp_SlalomModel-method
#'
#' @author Davis McCarthy
#' @export
#' @examples
#' gmtfile <- system.file("extdata", "reactome_subset.gmt", package = "slalom")
#' genesets <- GSEABase::getGmt(gmtfile)
#' data("mesc")
#' model <- newSlalomModel(mesc, genesets, n_hidden = 5, min_genes = 10)
#' model <- initSlalom(model)
#' model <- trainSlalom(model, nIterations = 10)
trainSlalom <- function(
    object, nIterations = 5000, minIterations = 700, tolerance = 1e-08,
    forceIterations = FALSE, shuffle = TRUE, pretrain = TRUE, verbose = TRUE,
    seed = NULL, drop_factors = TRUE) {
    if (!methods::is(object, "Rcpp_SlalomModel"))
        stop("object must be of class Rcpp_SlalomModel")
    ## define training parameters in object
    object$dropFactors <- drop_factors  ## drop factors if deemed to be off
    object$tolerance <- tolerance
    object$nIterations <- nIterations
    object$minIterations <- minIterations
    object$forceIterations <- forceIterations
    object$shuffle <- shuffle ## shuffle order in which factors are updated each
    ## for each training iteration
    if (pretrain) {
        if (verbose)
            cat("pre-training model for faster convergence\n")
        pt_out <- .preTrain(object, seed = seed)
        object$pretrain_order <- pt_out - 1
    } else {
        object$pretrain_order <- seq_len(object$K) - 1
    }
    ## train model
    object$train()
    ## return object
    object
}


.preTrain <- function(object, seed = NULL, n_fix = NULL) {
    ## n_fix denotes the number of terms which should be fixed and updated first;
    ## defaults to NULL, which results in the nubmer of unannotated factors
    ## being updated first
    if (is.null(n_fix))
        n_fix <- object$nKnown + object$nHidden
    pi0 <- object$Pi_E1
    pi0[pi0 > 0.8] <- 1 - 1e-100
    pi0[pi0 < 0.2] <- 1e-100
    ## fit pca
    if (!is.null(seed))
        set.seed(seed)
    if (object$N < 50000)
        pca <- stats::prcomp(object$Y, rank. = 1, retx = TRUE)
    else
        pca <- rsvd::rpca(object$Y, k = 2, retx = TRUE)
    ## sort by correlation to PC1
    mpc <- abs(stats::cor(object$X_E1, pca$x[,1]))[-c(1:n_fix)]
    Ipi <- order(mpc, decreasing = TRUE)
    IpiRev <- rev(Ipi)
    ## organise for fitting fwd and rev models
    k_range <- 1:object$K
    k_range[-c(1:n_fix)] <- Ipi + n_fix
    k_range_rev <- 1:object$K
    k_range_rev[-c(1:n_fix)] <- IpiRev + n_fix
    n_hidden <- object$nHidden
    if (object$nKnown > 0)
        design <- object$Known
    else
        design <- NULL
    Y <- object$Z_E1
    rm(object)
    ## run model for 50 iterations
    pi <- pi0[, k_range]
    obj_fwd <- .newSlalom(Y, pi, n_hidden, design = design)
    obj_fwd$shuffle <- TRUE
    obj_fwd$nScale <- 30
    obj_fwd <- initSlalom(obj_fwd)
    obj_fwd <- trainSlalom(obj_fwd, pretrain = FALSE, shuffle = FALSE,
                           nIterations = 50, minIterations = 50,
                           verbose = FALSE, drop_factors = FALSE)
    alpha_fwd <- obj_fwd$alpha_E1
    rm(obj_fwd)
    ## run reverse model for 50 iterations
    pi <- pi0[, k_range_rev]
    obj_rev <- .newSlalom(Y, pi, n_hidden, design = design)
    obj_rev$shuffle <- TRUE
    obj_rev$nScale <- 30
    obj_rev <- initSlalom(obj_rev)
    obj_rev <- trainSlalom(obj_rev, pretrain = FALSE, shuffle = FALSE,
                           nIterations = 50, minIterations = 50,
                           verbose = FALSE, drop_factors = FALSE)
    alpha_rev <- obj_rev$alpha_E1
    rm(obj_rev)
    ## get update ordering
    IpiK <- order(
        -(0.5 * (1.0 / alpha_rev[order(k_range_rev)][-c(1:n_fix)])) +
            0.5 * (1.0 / alpha_fwd[order(k_range)][-c(1:n_fix)])
    ) ## check this!!
    i_label <- c(1:n_fix, IpiK + n_fix)
    i_label
}



################################################################################
### Update a SlalomModel class object

#' Update a SlalomModel object
#'
#' Do one variational update of a SlalomModel to infer model parameters.
#'
#' @param object a \code{Rcpp_SlalomModel} object
#'
#' @details Update the model with one iteration using variational Bayes methods
#' to infer parameters.
#'
#' @name updateSlalom
#' @rdname updateSlalom
#' @importFrom methods is
#'
#' @return an `Rcpp_SlalomModel` object
#'
#' @author Davis McCarthy
#' @export
#' @examples
#' gmtfile <- system.file("extdata", "reactome_subset.gmt", package = "slalom")
#' genesets <- GSEABase::getGmt(gmtfile)
#' data("mesc")
#' model <- newSlalomModel(mesc, genesets, n_hidden = 5, min_genes = 10)
#' model <- initSlalom(model)
#' model <- updateSlalom(model)
updateSlalom <- function(object) {
    if (!methods::is(object, "Rcpp_SlalomModel"))
        stop("object must be of class Rcpp_SlalomModel")
    ## train model
    object$update()
    ## return object
    object
}


################################################################################
### Define validity check for SlalomModel class object

setValidity("Rcpp_SlalomModel", function(object) {
    msg <- NULL
    valid <- TRUE

    if ( is.null(object@Y) ) {
        valid <- FALSE
        msg <- c(msg,
                 "object must contain an expression matrix at object@Y")
    }
    if ( any(is.na(object@Y)) ) {
        valid <- FALSE
        msg <- c(msg,
                 "Expression matrix Y cannot contain NA values")
    }
    if ( any(is.na(object@Pi_E1)) ) {
        valid <- FALSE
        msg <- c(msg,
                 "Matrix Pi of observed gene set annotations cannot contain NA values")
    }
    if ( (ncol(object@Y) != object@G)) {
        valid <- FALSE
        msg <- c(msg,
                 "Number of genes in expression matrix Y must match object@G")
    }
    if ( (nrow(object@Y) != object@N)) {
        valid <- FALSE
        msg <- c(msg,
                 "Number of cells in expression matrix Y must match object@N")
    }
    if ( (ncol(object@Pi_E1) != object@K)) {
        valid <- FALSE
        msg <- c(msg,
                 "Number of factors in matrix pi must match object@K")
    }
    if ( (nrow(object@Pi_E1) != object@G)) {
        valid <- FALSE
        msg <- c(msg,
                 "Number of factors in matrix pi must match object@G")
    }
    if ( (length(object@termNames) != object@K)) {
        valid <- FALSE
        msg <- c(msg,
                 "Number of factors termNames must match object@K")
    }
    if ( (length(object@geneNames) != object@G)) {
        valid <- FALSE
        msg <- c(msg,
                 "Number of geneNames must match object@G")
    }
    if ( (length(object@cellNames) != object@N)) {
        valid <- FALSE
        msg <- c(msg,
                 "Number of cellNames must match object@N")
    }


    if (valid) TRUE else msg
})


################################################################################
### Explore results


#' Show results of a Slalom model
#'
#' @param object an object of class \code{Rcpp_SlalomModel}
#' @param n_active number of terms (factors) to be plotted (default is 20)
#' @param mad_filter numeric(1), filter factors by this mean absolute deviation
#' to exclude outliers. For large datasets this can be set to 0
#' @param annotated logical(1), should annotated factors be plotted? Default is
#' \code{TRUE}
#' @param unannotated_dense logical(1), should dense unannotated factors be
#' plotted? Default is \code{FALSE}
#' @param unannotated_sparse logical(1), should sparse unannotated factors be
#' plotted? Default is \code{FALSE}
#'
#' @return data.frame with factors ordered by relevance, showing \code{term}
#' (term names), \code{relevance}, \code{type} (factor type: known, annotated
#' or unannotated), \code{n_prior} (number of genes annotated to the gene
#' set/factor), \code{n_gain} (number of genes added/switched on for the
#' factor), \code{n_loss} (number of genes turned off for the factor).
#'
#' @export
#' @examples
#' gmtfile <- system.file("extdata", "reactome_subset.gmt", package = "slalom")
#' genesets <- GSEABase::getGmt(gmtfile)
#' data("mesc")
#' model <- newSlalomModel(mesc, genesets, n_hidden = 5, min_genes = 10)
#' model <- initSlalom(model)
#' model <- trainSlalom(model, nIterations = 10)
#' topTerms(model)
topTerms <- function(
    object, n_active = 20, mad_filter = 0.4, annotated = TRUE,
    unannotated_dense = FALSE, unannotated_sparse = FALSE) {
    if (!methods::is(object, "Rcpp_SlalomModel"))
        stop("object must be of class Rcpp_SlalomModel")

    i_use <- rep(FALSE, object$K)
    if (unannotated_sparse)
        i_use[object$iUnannotatedSparse] <- TRUE
    if (unannotated_dense)
        i_use[object$iUnannotatedDense] <- TRUE
    if (annotated)
        i_use[seq(from = object$nKnown + object$nHidden + 1, to = object$K,
                  by = 1)] <- TRUE

    i_prior <- (object$Pi_E1[, i_use] > 0.5)
    i_posterior <- (object$W_gamma0[, i_use] > 0.5)
    relevance <- (1.0 / object$alpha_E1[i_use])
    terms <- object$termNames[i_use]
    factor_type <- c(rep("known", object$nKnown),
                     rep("unannotated", object$nHidden),
                     rep("annotated", object$K - object$nHidden - object$nKnown))
    factor_type <- factor_type[i_use]
    MAD <- apply(object$X_E1[, i_use], 2, stats::mad)
    R <- (MAD > mad_filter) * relevance

    n_active <- min(sum(R > 0), n_active)

    i_active <- order(R, decreasing = TRUE)[1:n_active]

    df <- data.frame(
        term = terms[i_active],
        relevance = R[i_active],
        type = factor_type[i_active],
        n_prior = colSums(i_prior)[i_active],
        n_gain = colSums(i_posterior & !i_prior)[i_active],
        n_loss = colSums(!i_posterior & i_prior)[i_active]
    )
    df
}


#' Add results to SingleCellExperiment object
#'
#' @param sce_object an object of class
#' \code{\link[SingleCellExperiment]{SingleCellExperiment}}
#' @param slalom_object an object of class \code{Rcpp_SlalomModel}
#' @param n_active number of terms (factors) to be added (default is 20)
#' @param mad_filter numeric(1), filter factors by this mean absolute deviation
#' to ensure variability in the factor states. For large datasets this can be
#' set to 0
#' @param annotated logical(1), should annotated factors be included? Default is
#' \code{TRUE}
#' @param unannotated_dense logical(1), should dense unannotated factors be
#' included? Default is \code{FALSE}
#' @param unannotated_sparse logical(1), should sparse unannotated factors be
#' included? Default is \code{FALSE}
#' @param add_loadings logical(1), should gene/feature loadings be added to
#' the \code{rowData} of the object?
#' @param dimred character(1), name of the reduced-dimension slot to save the
#' factor states to. Default is \code{"slalom"}
#' @param check_convergence logical(1), check that model has converged before
#' adding \code{slalom} results. If \code{TRUE} and model has not converged it
#' throws an error.
#'
#' @return a \code{\link[SingleCellExperiment]{SingleCellExperiment}} object
#' with factor states (X) in a reduced-dimension slot, and gene loadings for
#' factors added to \code{rowData}.
#'
#' @importFrom SingleCellExperiment rowData
#' @importFrom SingleCellExperiment colData
#' @export
#'
#' @examples
#' gmtfile <- system.file("extdata", "reactome_subset.gmt", package = "slalom")
#' genesets <- GSEABase::getGmt(gmtfile)
#' data("mesc")
#' model <- newSlalomModel(mesc, genesets, n_hidden = 5, min_genes = 10)
#' model <- initSlalom(model)
#' model <- trainSlalom(model, nIterations = 10)
#' mesc <- addResultsToSingleCellExperiment(mesc, model,
#' check_convergence = FALSE)
addResultsToSingleCellExperiment <- function(
    sce_object, slalom_object,  n_active = 20, mad_filter = 0.4,
    annotated = TRUE, unannotated_dense = FALSE, unannotated_sparse = FALSE,
    add_loadings = TRUE, dimred = "slalom", check_convergence = TRUE) {
    if (!methods::is(sce_object, "SingleCellExperiment"))
        stop("sce_object must be of class SingleCellExperiment")
    if (!methods::is(slalom_object, "Rcpp_SlalomModel"))
        stop("slalom_object must be of class Rcpp_SlalomModel")
    if (!slalom_object$converged && check_convergence)
        stop("slalom_object model has not converged; results may not be trustworthy.")
    if (!identical(colnames(sce_object), slalom_object$cellNames))
        stop("cellNames (i.e. colnames) do not match between sce_object and slalom_object.")
    if (add_loadings && !any(slalom_object$geneNames %in% rownames(sce_object)))
        stop("add_loadings is TRUE, but no genes/features are shared between slalom_object and sce_object.")
    ## select factors to use
    i_use <- rep(FALSE, slalom_object$K)
    if (unannotated_sparse)
        i_use[slalom_object$iUnannotatedSparse] <- TRUE
    if (unannotated_dense)
        i_use[slalom_object$iUnannotatedDense] <- TRUE
    if (annotated)
        i_use[seq(from = slalom_object$nKnown + slalom_object$nHidden + 1,
                  to = slalom_object$K, by = 1)] <- TRUE
    ## select factors by relevance
    relevance <- (1.0 / slalom_object$alpha_E1[i_use])
    terms <- slalom_object$termNames[i_use]
    X <- slalom_object$X_E1[, i_use]
    W <- slalom_object$X_E1[, i_use]
    MAD <- apply(X, 2, stats::mad)
    R <- (MAD > mad_filter) * relevance
    n_active <- min(sum(R > 0), n_active)
    i_active <- order(R, decreasing = TRUE)[1:n_active]
    ## add factor states
    X_to_add <- X[, i_active, drop = FALSE]
    colnames(X_to_add) <- terms[i_active]
    reducedDim(sce_object, dimred) <- X_to_add
    ## add loadings if requested
    if (add_loadings) {
        gene_labels <- slalom_object$geneNames
        shared_genes <- intersect(rownames(sce_object), gene_labels)
        i_sce <- (rownames(sce_object) %in% shared_genes)
        mm_slalom <- match(gene_labels, rownames(sce_object)[i_sce])
        W <- slalom_object$W_E1[, i_active, drop = FALSE]
        Z <- slalom_object$W_gamma0[, i_active, drop = FALSE]
        loading <- W * Z
        new_rdata <- matrix(NA, nrow = nrow(sce_object), ncol = n_active)
        new_rdata[i_sce,] <- loading[mm_slalom,]
        colnames(new_rdata) <- terms[i_active]
        rowData(sce_object) <- cbind(rowData(sce_object), new_rdata)
    }
    sce_object
}
PMBio/slalom documentation built on May 20, 2019, 1:26 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
PMBio/slalom
Factorial Latent Variable Modeling of Single-Cell RNA-Seq Data

R/SlalomModel-methods.R
In PMBio/slalom: Factorial Latent Variable Modeling of Single-Cell RNA-Seq Data

R Package Documentation

Browse R Packages

We want your feedback!

PMBio/slalom Factorial Latent Variable Modeling of Single-Cell RNA-Seq Data

R/SlalomModel-methods.R In PMBio/slalom: Factorial Latent Variable Modeling of Single-Cell RNA-Seq Data

R Package Documentation

Browse R Packages

We want your feedback!

PMBio/slalom
Factorial Latent Variable Modeling of Single-Cell RNA-Seq Data

R/SlalomModel-methods.R
In PMBio/slalom: Factorial Latent Variable Modeling of Single-Cell RNA-Seq Data