R/factorizeMatrix.R
In celda: CEllular Latent Dirichlet Allocation

#' @title Generate factorized matrices showing each feature's influence on cell
#'  / gene clustering
#' @description Generates factorized matrices showing the contribution of each
#'  feature in each cell population or each cell population in each sample.
#' @param x Can be one of
#'  \itemize{
#'  \item A \linkS4class{SingleCellExperiment} object returned by
#'  \link{celda_C}, \link{celda_G} or \link{celda_CG}, with the matrix
#'  located in the \code{useAssay} assay slot in \code{altExp(x, altExpName)}.
#'  Rows represent features and columns represent cells.
#'  \item Integer counts matrix. Rows represent features and columns represent
#'  cells. This matrix should be the same as the one used to generate
#'  \code{celdaMod}.}
#' @param useAssay A string specifying which \link{assay}
#'  slot to use if \code{x} is a \linkS4class{SingleCellExperiment} object.
#'  Default "counts".
#' @param altExpName The name for the \link{altExp} slot
#'  to use. Default "featureSubset".
#' @param celdaMod Celda model object. Only works if \code{x} is an integer
#'  counts matrix.
#' @param type Character vector. A vector containing one or more of "counts",
#'  "proportion", or "posterior". "counts" returns the raw number of counts for
#'  each factorized matrix. "proportions" returns the normalized probabilities
#'  for each factorized matrix, which are calculated by dividing the raw counts
#'  in each factorized matrix by the total counts in each column. "posterior"
#'  returns the posterior estimates which include the addition of the Dirichlet
#'  concentration parameter (essentially as a pseudocount). Default
#'  \code{"counts"}.
#' @param ... Ignored. Placeholder to prevent check warning.
#' @export
setGeneric("factorizeMatrix",
    function(x, celdaMod, ...) {
        standardGeneric("factorizeMatrix")})


#' @examples
#' data(sceCeldaCG)
#' factorizedMatrices <- factorizeMatrix(sceCeldaCG, type = "posterior")
#' @rdname factorizeMatrix
#' @export
setMethod("factorizeMatrix", signature(x = "SingleCellExperiment"),
    function(x,
        useAssay = "counts",
        altExpName = "featureSubset",
        type = c("counts", "proportion", "posterior")) {

        altExp <- SingleCellExperiment::altExp(x, e = altExpName)

        if (celdaModel(x, altExpName = altExpName) == "celda_C") {
            res <- .factorizeMatrixCelda_C(sce = altExp, useAssay = useAssay,
                type = type)
        } else if (celdaModel(x, altExpName = altExpName) == "celda_CG") {
            res <- .factorizeMatrixCelda_CG(sce = altExp, useAssay = useAssay,
                type = type)
        } else if (celdaModel(x, altExpName = altExpName) == "celda_G") {
            res <- .factorizeMatrixCelda_G(sce = altExp, useAssay = useAssay,
                type = type)
        } else {
            stop("S4Vectors::metadata(altExp(x, altExpName))$",
                "celda_parameters$model must be",
                " one of 'celda_C', 'celda_G', or 'celda_CG'")
        }
        return(res)
    })


#' @return For celda_CG model, A list with elements for "counts", "proportions",
#'  or "posterior" probabilities. Each element will be a list containing
#'  factorized matrices for "module", "cellPopulation", and "sample".
#'  Additionally, the contribution of each module in each individual cell will
#'  be included in the "cell" element of "counts" and "proportions" elements.
#' @examples
#' data(celdaCGSim, celdaCGMod)
#' factorizedMatrices <- factorizeMatrix(
#'   celdaCGSim$counts,
#'   celdaCGMod,
#'   "posterior")
#' @rdname factorizeMatrix
#' @export
setMethod("factorizeMatrix", signature(x = "matrix", celdaMod = "celda_CG"),
    function(x,
        celdaMod,
        type = c("counts", "proportion", "posterior")) {

        counts <- .processCounts(x)
        compareCountMatrix(counts, celdaMod)

        K <- params(celdaMod)$K
        L <- params(celdaMod)$L
        z <- celdaClusters(celdaMod)$z
        y <- celdaClusters(celdaMod)$y
        alpha <- params(celdaMod)$alpha
        beta <- params(celdaMod)$beta
        delta <- params(celdaMod)$delta
        gamma <- params(celdaMod)$gamma
        sampleLabel <- sampleLabel(celdaMod)
        s <- as.integer(sampleLabel)

        ## Calculate counts one time up front
        p <- .cCGDecomposeCounts(counts, s, z, y, K, L)
        nS <- p$nS
        nG <- p$nG
        nM <- p$nM
        mCPByS <- p$mCPByS
        nTSByC <- p$nTSByC
        nTSByCP <- p$nTSByCP
        nByG <- p$nByG
        nByTS <- p$nByTS
        nGByTS <- p$nGByTS
        nGByTS[nGByTS == 0] <- 1

        GByTS <- matrix(0, nrow = length(y), ncol = L)
        GByTS[cbind(seq(nG), y)] <- p$nByG

        LNames <- paste0("L", seq(L))
        KNames <- paste0("K", seq(K))
        colnames(nTSByC) <- matrixNames(celdaMod)$column
        rownames(nTSByC) <- LNames
        colnames(GByTS) <- LNames
        rownames(GByTS) <- matrixNames(celdaMod)$row
        rownames(mCPByS) <- KNames
        colnames(mCPByS) <- matrixNames(celdaMod)$sample
        colnames(nTSByCP) <- KNames
        rownames(nTSByCP) <- LNames

        countsList <- c()
        propList <- c()
        postList <- c()
        res <- list()

        if (any("counts" %in% type)) {
            countsList <- list(
                sample = mCPByS,
                cellPopulation = nTSByCP,
                cell = nTSByC,
                module = GByTS,
                geneDistribution = nGByTS
            )
            res <- c(res, list(counts = countsList))
        }

        if (any("proportion" %in% type)) {
            ## Need to avoid normalizing cell/gene states with zero cells/genes
            uniqueZ <- sort(unique(z))
            tempNTSByCP <- nTSByCP
            tempNTSByCP[, uniqueZ] <- normalizeCounts(tempNTSByCP[, uniqueZ],
                normalize = "proportion"
            )

            uniqueY <- sort(unique(y))
            tempGByTS <- GByTS
            tempGByTS[, uniqueY] <- normalizeCounts(tempGByTS[, uniqueY],
                normalize = "proportion"
            )
            tempNGByTS <- nGByTS / sum(nGByTS)

            propList <- list(
                sample = normalizeCounts(mCPByS,
                    normalize = "proportion"
                ),
                cellPopulation = tempNTSByCP,
                cell = normalizeCounts(nTSByC, normalize = "proportion"),
                module = tempGByTS,
                geneDistribution = tempNGByTS
            )
            res <- c(res, list(proportions = propList))
        }

        if (any("posterior" %in% type)) {
            gs <- GByTS
            gs[cbind(seq(nG), y)] <- gs[cbind(seq(nG), y)] + delta
            gs <- normalizeCounts(gs, normalize = "proportion")
            tempNGByTS <- (nGByTS + gamma) / sum(nGByTS + gamma)

            postList <- list(
                sample = normalizeCounts(mCPByS + alpha,
                    normalize = "proportion"
                ),
                cellPopulation = normalizeCounts(nTSByCP + beta,
                    normalize = "proportion"
                ),
                module = gs,
                geneDistribution = tempNGByTS
            )
            res <- c(res, posterior = list(postList))
        }
        return(res)
    }
)


#' @examples
#' data(celdaCSim, celdaCMod)
#' factorizedMatrices <- factorizeMatrix(
#'   celdaCSim$counts,
#'   celdaCMod, "posterior"
#' )
#' @return For celda_C model, a list with elements for "counts", "proportions",
#'  or "posterior" probabilities. Each element will be a list containing
#'  factorized matrices for "module" and "sample".
#' @rdname factorizeMatrix
#' @export
setMethod("factorizeMatrix", signature(x = "matrix", celdaMod = "celda_C"),
    function(x,
        celdaMod,
        type = c("counts", "proportion", "posterior")) {

        counts <- .processCounts(x)
        compareCountMatrix(counts, celdaMod)

        K <- params(celdaMod)$K
        z <- celdaClusters(celdaMod)$z
        alpha <- params(celdaMod)$alpha
        beta <- params(celdaMod)$beta
        sampleLabel <- sampleLabel(celdaMod)
        s <- as.integer(sampleLabel)

        p <- .cCDecomposeCounts(counts, s, z, K)
        mCPByS <- p$mCPByS
        nGByCP <- p$nGByCP

        KNames <- paste0("K", seq(K))
        rownames(nGByCP) <- matrixNames(celdaMod)$row
        colnames(nGByCP) <- KNames
        rownames(mCPByS) <- KNames
        colnames(mCPByS) <- matrixNames(celdaMod)$sample

        countsList <- c()
        propList <- c()
        postList <- c()
        res <- list()

        if (any("counts" %in% type)) {
            countsList <- list(sample = mCPByS, module = nGByCP)
            res <- c(res, list(counts = countsList))
        }

        if (any("proportion" %in% type)) {
            ## Need to avoid normalizing cell/gene states with zero cells/genes
            uniqueZ <- sort(unique(z))
            tempNGByCP <- nGByCP
            tempNGByCP[, uniqueZ] <- normalizeCounts(tempNGByCP[, uniqueZ],
                normalize = "proportion"
            )

            propList <- list(
                sample = normalizeCounts(mCPByS,
                    normalize = "proportion"
                ),
                module = tempNGByCP
            )
            res <- c(res, list(proportions = propList))
        }

        if (any("posterior" %in% type)) {
            postList <- list(
                sample = normalizeCounts(mCPByS + alpha,
                    normalize = "proportion"
                ),
                module = normalizeCounts(nGByCP + beta,
                    normalize = "proportion"
                )
            )

            res <- c(res, posterior = list(postList))
        }

        return(res)
    }
)


#' @return For celda_G model, a list with elements for "counts", "proportions",
#'  or "posterior" probabilities. Each element will be a list containing
#'  factorized matrices for "module" and "cell".
#' @examples
#' data(celdaGSim, celdaGMod)
#' factorizedMatrices <- factorizeMatrix(
#'   celdaGSim$counts,
#'   celdaGMod, "posterior"
#' )
#' @rdname factorizeMatrix
#' @export
setMethod("factorizeMatrix", signature(x = "matrix", celdaMod = "celda_G"),
    function(x,
        celdaMod,
        type = c("counts", "proportion", "posterior")) {

        counts <- .processCounts(x)
        # compareCountMatrix(counts, celdaMod)

        L <- params(celdaMod)$L
        y <- celdaClusters(celdaMod)$y
        beta <- params(celdaMod)$beta
        delta <- params(celdaMod)$delta
        gamma <- params(celdaMod)$gamma

        p <- .cGDecomposeCounts(counts = counts, y = y, L = L)
        nTSByC <- p$nTSByC
        nByG <- p$nByG
        nByTS <- p$nByTS
        nGByTS <- p$nGByTS
        nGByTS[nGByTS == 0] <- 1
        nM <- p$nM
        nG <- p$nG
        rm(p)

        GByTS <- matrix(0, nrow = length(y), ncol = L)
        GByTS[cbind(seq(nG), y)] <- nByG

        LNames <- paste0("L", seq(L))
        colnames(nTSByC) <- matrixNames(celdaMod)$column
        rownames(nTSByC) <- LNames
        colnames(GByTS) <- LNames
        rownames(GByTS) <- matrixNames(celdaMod)$row
        names(nGByTS) <- LNames

        countsList <- c()
        propList <- c()
        postList <- c()
        res <- list()

        if (any("counts" %in% type)) {
            countsList <- list(
                cell = nTSByC,
                module = GByTS,
                geneDistribution = nGByTS
            )
            res <- c(res, list(counts = countsList))
        }

        if (any("proportion" %in% type)) {
            ## Need to avoid normalizing cell/gene states with zero cells/genes
            uniqueY <- sort(unique(y))
            tempGByTS <- GByTS
            tempGByTS[, uniqueY] <- normalizeCounts(tempGByTS[, uniqueY],
                normalize = "proportion"
            )
            tempNGByTS <- nGByTS / sum(nGByTS)

            propList <- list(
                cell = normalizeCounts(nTSByC,
                    normalize = "proportion"
                ),
                module = tempGByTS,
                geneDistribution = tempNGByTS
            )
            res <- c(res, list(proportions = propList))
        }

        if (any("posterior" %in% type)) {
            gs <- GByTS
            gs[cbind(seq(nG), y)] <- gs[cbind(seq(nG), y)] + delta
            gs <- normalizeCounts(gs, normalize = "proportion")
            tempNGByTS <- (nGByTS + gamma) / sum(nGByTS + gamma)

            postList <- list(
                cell = normalizeCounts(nTSByC + beta,
                    normalize = "proportion"
                ),
                module = gs,
                geneDistribution = tempNGByTS
            )
            res <- c(res, posterior = list(postList))
        }

        return(res)
    }
)


.factorizeMatrixCelda_C <- function(sce, useAssay, type) {
    counts <- SummarizedExperiment::assay(sce, i = useAssay)
    counts <- .processCounts(counts)

    K <- S4Vectors::metadata(sce)$celda_parameters$K
    z <- SummarizedExperiment::colData(sce)$celda_cell_cluster
    alpha <- S4Vectors::metadata(sce)$celda_parameters$alpha
    beta <- S4Vectors::metadata(sce)$celda_parameters$beta
    sampleLabel <- SummarizedExperiment::colData(sce)$celda_sample_label
    s <- as.integer(sampleLabel)

    p <- .cCDecomposeCounts(counts, s, z, K)
    mCPByS <- p$mCPByS
    nGByCP <- p$nGByCP

    KNames <- paste0("K", seq(K))
    rownames(nGByCP) <- rownames(sce)
    colnames(nGByCP) <- KNames
    rownames(mCPByS) <- KNames
    colnames(mCPByS) <-
        S4Vectors::metadata(sce)$celda_parameters$sampleLevels

    countsList <- c()
    propList <- c()
    postList <- c()
    res <- list()

    if (any("counts" %in% type)) {
        countsList <- list(sample = mCPByS, module = nGByCP)
        res <- c(res, list(counts = countsList))
    }

    if (any("proportion" %in% type)) {
        ## Need to avoid normalizing cell/gene states with zero cells/genes
        uniqueZ <- sort(unique(z))
        tempNGByCP <- nGByCP
        tempNGByCP[, uniqueZ] <- normalizeCounts(tempNGByCP[, uniqueZ],
            normalize = "proportion")

        propList <- list(sample = normalizeCounts(mCPByS,
            normalize = "proportion"),
            module = tempNGByCP)
        res <- c(res, list(proportions = propList))
    }

    if (any("posterior" %in% type)) {
        postList <- list(sample = normalizeCounts(mCPByS + alpha,
            normalize = "proportion"),
            module = normalizeCounts(nGByCP + beta,
                normalize = "proportion"))

        res <- c(res, posterior = list(postList))
    }

    return(res)
}


.factorizeMatrixCelda_CG <- function(sce, useAssay, type) {
    counts <- SummarizedExperiment::assay(sce, i = useAssay)
    counts <- .processCounts(counts)

    K <- S4Vectors::metadata(sce)$celda_parameters$K
    L <- S4Vectors::metadata(sce)$celda_parameters$L
    z <- SummarizedExperiment::colData(sce)$celda_cell_cluster
    y <- SummarizedExperiment::rowData(sce)$celda_feature_module
    alpha <- S4Vectors::metadata(sce)$celda_parameters$alpha
    beta <- S4Vectors::metadata(sce)$celda_parameters$beta
    delta <- S4Vectors::metadata(sce)$celda_parameters$delta
    gamma <- S4Vectors::metadata(sce)$celda_parameters$gamma
    sampleLabel <- SummarizedExperiment::colData(sce)$celda_sample_label
    s <- as.integer(sampleLabel)

    ## Calculate counts one time up front
    p <- .cCGDecomposeCounts(counts, s, z, y, K, L)
    nS <- p$nS
    nG <- p$nG
    nM <- p$nM
    mCPByS <- p$mCPByS
    nTSByC <- p$nTSByC
    nTSByCP <- p$nTSByCP
    nByG <- p$nByG
    nByTS <- p$nByTS
    nGByTS <- p$nGByTS
    nGByTS[nGByTS == 0] <- 1

    GByTS <- matrix(0, nrow = length(y), ncol = L)
    GByTS[cbind(seq(nG), y)] <- p$nByG

    LNames <- paste0("L", seq(L))
    KNames <- paste0("K", seq(K))
    colnames(nTSByC) <- colnames(sce)
    rownames(nTSByC) <- LNames
    colnames(GByTS) <- LNames
    rownames(GByTS) <- rownames(sce)
    rownames(mCPByS) <- KNames
    colnames(mCPByS) <- S4Vectors::metadata(sce)$celda_parameters$sampleLevels
    colnames(nTSByCP) <- KNames
    rownames(nTSByCP) <- LNames

    countsList <- c()
    propList <- c()
    postList <- c()
    res <- list()

    if (any("counts" %in% type)) {
        countsList <- list(
            sample = mCPByS,
            cellPopulation = nTSByCP,
            cell = nTSByC,
            module = GByTS,
            geneDistribution = nGByTS
        )
        res <- c(res, list(counts = countsList))
    }

    if (any("proportion" %in% type)) {
        ## Need to avoid normalizing cell/gene states with zero cells/genes
        uniqueZ <- sort(unique(z))
        tempNTSByCP <- nTSByCP
        tempNTSByCP[, uniqueZ] <- normalizeCounts(tempNTSByCP[, uniqueZ],
            normalize = "proportion"
        )

        uniqueY <- sort(unique(y))
        tempGByTS <- GByTS
        tempGByTS[, uniqueY] <- normalizeCounts(tempGByTS[, uniqueY],
            normalize = "proportion"
        )
        tempNGByTS <- nGByTS / sum(nGByTS)

        propList <- list(
            sample = normalizeCounts(mCPByS, normalize = "proportion"),
            cellPopulation = tempNTSByCP,
            cell = normalizeCounts(nTSByC, normalize = "proportion"),
            module = tempGByTS,
            geneDistribution = tempNGByTS
        )
        res <- c(res, list(proportions = propList))
    }

    if (any("posterior" %in% type)) {
        gs <- GByTS
        gs[cbind(seq(nG), y)] <- gs[cbind(seq(nG), y)] + delta
        gs <- normalizeCounts(gs, normalize = "proportion")
        tempNGByTS <- (nGByTS + gamma) / sum(nGByTS + gamma)

        postList <- list(
            sample = normalizeCounts(mCPByS + alpha,
                normalize = "proportion"
            ),
            cellPopulation = normalizeCounts(nTSByCP + beta,
                normalize = "proportion"
            ),
            module = gs,
            geneDistribution = tempNGByTS
        )
        res <- c(res, posterior = list(postList))
    }

    return(res)
}


.factorizeMatrixCelda_G <- function(sce, useAssay, type) {
    counts <- SummarizedExperiment::assay(sce, i = useAssay)
    counts <- .processCounts(counts)
    # compareCountMatrix(counts, celdaMod)

    L <- S4Vectors::metadata(sce)$celda_parameters$L
    y <- SummarizedExperiment::rowData(sce)$celda_feature_module
    beta <- S4Vectors::metadata(sce)$celda_parameters$beta
    delta <- S4Vectors::metadata(sce)$celda_parameters$delta
    gamma <- S4Vectors::metadata(sce)$celda_parameters$gamma

    p <- .cGDecomposeCounts(counts = counts, y = y, L = L)
    nTSByC <- p$nTSByC
    nByG <- p$nByG
    nByTS <- p$nByTS
    nGByTS <- p$nGByTS
    nGByTS[nGByTS == 0] <- 1
    nM <- p$nM
    nG <- p$nG
    rm(p)

    GByTS <- matrix(0, nrow = length(y), ncol = L)
    GByTS[cbind(seq(nG), y)] <- nByG

    LNames <- paste0("L", seq(L))
    colnames(nTSByC) <- colnames(sce)
    rownames(nTSByC) <- LNames
    colnames(GByTS) <- LNames
    rownames(GByTS) <- rownames(sce)
    names(nGByTS) <- LNames

    countsList <- c()
    propList <- c()
    postList <- c()
    res <- list()

    if (any("counts" %in% type)) {
        countsList <- list(
            cell = nTSByC,
            module = GByTS,
            geneDistribution = nGByTS
        )
        res <- c(res, list(counts = countsList))
    }

    if (any("proportion" %in% type)) {
        ## Need to avoid normalizing cell/gene states with zero cells/genes
        uniqueY <- sort(unique(y))
        tempGByTS <- GByTS
        tempGByTS[, uniqueY] <- normalizeCounts(tempGByTS[, uniqueY],
            normalize = "proportion"
        )
        tempNGByTS <- nGByTS / sum(nGByTS)

        propList <- list(
            cell = normalizeCounts(nTSByC,
                normalize = "proportion"
            ),
            module = tempGByTS,
            geneDistribution = tempNGByTS
        )
        res <- c(res, list(proportions = propList))
    }

    if (any("posterior" %in% type)) {
        gs <- GByTS
        gs[cbind(seq(nG), y)] <- gs[cbind(seq(nG), y)] + delta
        gs <- normalizeCounts(gs, normalize = "proportion")
        tempNGByTS <- (nGByTS + gamma) / sum(nGByTS + gamma)

        postList <- list(
            cell = normalizeCounts(nTSByC + beta,
                normalize = "proportion"
            ),
            module = gs,
            geneDistribution = tempNGByTS
        )
        res <- c(res, posterior = list(postList))
    }

    return(res)
}