R/integration.R

Defines functions calcAlignment calcAgreement .centroidAlign.Hraw centroidAlign.Seurat centroidAlign.liger centroidAlign .same quantile_norm .autoFindRef_qn .quantileNorm.HList quantileNorm.Seurat quantileNorm.liger quantileNorm alignFactors.Seurat alignFactors.liger alignFactors .runUINMF.list runUINMF.liger runUINMF online_iNMF runOnlineINMF.Seurat .runOnlineINMF.list runOnlineINMF.liger runOnlineINMF optimizeALS .runINMF.list runINMF.Seurat runINMF.liger runINMF runIntegration.Seurat runIntegration.liger runIntegration

Documented in alignFactors alignFactors.liger alignFactors.Seurat calcAgreement calcAlignment centroidAlign centroidAlign.liger centroidAlign.Seurat online_iNMF optimizeALS quantile_norm quantileNorm quantileNorm.liger quantileNorm.Seurat runINMF runINMF.liger runINMF.Seurat runIntegration runIntegration.liger runIntegration.Seurat runOnlineINMF runOnlineINMF.liger runOnlineINMF.Seurat runUINMF runUINMF.liger

#' Integrate scaled datasets with iNMF or variant methods
#' @description
#' LIGER provides dataset integration methods based on iNMF (integrative
#' Non-negative Matrix Factorization \[1\]) and its variants (online iNMF \[2\]
#' and UINMF \[3\]). This function wraps \code{\link{runINMF}},
#' \code{\link{runOnlineINMF}} and \code{\link{runUINMF}}, of which the help
#' pages have more detailed description.
#' @param object A \linkS4class{liger} object or a Seurat object with
#' non-negative scaled data of variable features (Done with
#' \code{\link{scaleNotCenter}}).
#' @param k Inner dimension of factorization (number of factors). Generally, a
#' higher \code{k} will be needed for datasets with more sub-structure. Default
#' \code{20}.
#' @param lambda Regularization parameter. Larger values penalize
#' dataset-specific effects more strongly (i.e. alignment should increase as
#' \code{lambda} increases). Default \code{5}.
#' @param method iNMF variant algorithm to use for integration. Choose from
#' \code{"iNMF"}, \code{"onlineINMF"}, \code{"UINMF"}. Default \code{"iNMF"}.
#' @param seed Random seed to allow reproducible results. Default \code{1}.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @param ... Arguments passed to other methods and wrapped functions.
#' @export
#' @rdname runIntegration
#' @return Updated input object. For detail, please refer to the refered method
#' linked in Description.
#' @references
#' \enumerate{
#'  \item{Joshua D. Welch and et al., Single-Cell Multi-omic Integration Compares
#' and Contrasts Features of Brain Cell Identity, Cell, 2019}
#'  \item{Chao Gao and et al., Iterative single-cell multi-omic integration using
#' online learning, Nat Biotechnol., 2021}
#'  \item{April R. Kriebel and Joshua D. Welch, UINMF performs mosaic integration
#' of single-cell multi-omic datasets using nonnegative matrix factorization,
#' Nat. Comm., 2022}
#' }
#' @examples
#' pbmc <- normalize(pbmc)
#' pbmc <- selectGenes(pbmc)
#' pbmc <- scaleNotCenter(pbmc)
#' if (requireNamespace("RcppPlanc", quietly = TRUE)) {
#'     pbmc <- runIntegration(pbmc)
#' }
runIntegration <- function(
        object,
        k = 20,
        lambda = 5.0,
        method = c("iNMF", "onlineINMF", "UINMF"),
        ...
) {
    UseMethod("runIntegration", object)
}

#' @rdname runIntegration
#' @export
#' @method runIntegration liger
runIntegration.liger <- function(
        object,
        k = 20,
        lambda = 5.0,
        method = c("iNMF", "onlineINMF", "UINMF"),
        seed = 1,
        verbose = getOption("ligerVerbose", TRUE),
        ...
) {
    method <- match.arg(method)
    object <- switch(
        method,
        iNMF = runINMF(object, k = k, lambda = lambda, seed = seed,
                       verbose = verbose, ...),
        onlineINMF = runOnlineINMF(object, k = k, lambda = lambda, seed = seed,
                                   verbose = verbose, ...),
        UINMF = runUINMF(object = object, k = k, lambda = lambda, seed = seed,
                         verbose = verbose, ...)
    )
    return(object)
}

#' @rdname runIntegration
#' @export
#' @method runIntegration Seurat
#' @param datasetVar Metadata variable name that stores the dataset source
#' annotation. Default \code{"orig.ident"}.
#' @param useLayer For Seurat>=4.9.9, the name of layer to retrieve input
#' non-negative scaled data. Default \code{"ligerScaleData"}. For older Seurat,
#' always retrieve from \code{scale.data} slot.
#' @param assay Name of assay to use. Default \code{NULL} uses current active
#' assay.
runIntegration.Seurat <- function(
        object,
        k = 20,
        lambda = 5.0,
        method = c("iNMF", "onlineINMF"),
        datasetVar = "orig.ident",
        useLayer = "ligerScaleData",
        assay = NULL,
        seed = 1,
        verbose = getOption("ligerVerbose", TRUE),
        ...
) {
    method <- match.arg(method)
    object <- switch(
        method,
        iNMF = runINMF(object, k = k, lambda = lambda, seed = seed,
                       useLayer = useLayer, assay = assay,
                       datasetVar = datasetVar, verbose = verbose, ...),
        onlineINMF = runOnlineINMF(object, k = k, lambda = lambda, seed = seed,
                                   useLayer = useLayer, assay = assay,
                                   datasetVar = datasetVar,
                                   verbose = verbose, ...)
    )
    return(object)
}


############################### regular INMF ###################################

#' Perform iNMF on scaled datasets
#' @description
#' Performs integrative non-negative matrix factorization (iNMF) (J.D. Welch,
#' 2019) using block coordinate descent (alternating non-negative
#' least squares, ANLS) to return factorized \eqn{H}, \eqn{W}, and \eqn{V}
#' matrices. The objective function is stated as
#'
#' \deqn{\arg\min_{H\ge0,W\ge0,V\ge0}\sum_{i}^{d}||E_i-(W+V_i)Hi||^2_F+
#' \lambda\sum_{i}^{d}||V_iH_i||_F^2}
#'
#' where \eqn{E_i} is the input non-negative matrix of the i'th dataset, \eqn{d}
#' is the total number of datasets. \eqn{E_i} is of size \eqn{m \times n_i} for
#' \eqn{m} variable genes and \eqn{n_i} cells, \eqn{H_i} is of size
#' \eqn{n_i \times k}, \eqn{V_i} is of size \eqn{m \times k}, and \eqn{W} is of
#' size \eqn{m \times k}.
#'
#' The factorization produces a shared \eqn{W} matrix (genes by k), and for each
#' dataset, an \eqn{H} matrix (k by cells) and a \eqn{V} matrix (genes by k).
#' The \eqn{H} matrices represent the cell factor loadings. \eqn{W} is held
#' consistent among all datasets, as it represents the shared components of the
#' metagenes across datasets. The \eqn{V} matrices represent the
#' dataset-specific components of the metagenes.
#'
#' This function adopts highly optimized fast and memory efficient
#' implementation extended from Planc (Kannan, 2016). Pre-installation of
#' extension package \code{RcppPlanc} is required. The underlying algorithm
#' adopts the identical ANLS strategy as \code{\link{optimizeALS}} in the old
#' version of LIGER.
#' @section Difference from optimizeALS():
#' In the old version implementation, we compute the objective error at the end
#' of each iteration, and then compares if the algorithm is reaching a
#' convergence, using an argument \code{thresh}. Now, since the computation of
#' objective error is indeed expensive, we canceled this feature and directly
#' runs a default of 30 (\code{nIteration}) iterations, which empirically leads
#' to a convergence most of the time. Given that the new version is highly
#' optimized, running this many iteration should be acceptable.
#' @param object A \linkS4class{liger} object or a Seurat object with
#' non-negative scaled data of variable features (Done with
#' \code{\link{scaleNotCenter}}).
#' @param k Inner dimension of factorization (number of factors). Generally, a
#' higher \code{k} will be needed for datasets with more sub-structure. Default
#' \code{20}.
#' @param lambda Regularization parameter. Larger values penalize
#' dataset-specific effects more strongly (i.e. alignment should increase as
#' \code{lambda} increases). Default \code{5}.
#' @param nIteration Total number of block coordinate descent iterations to
#' perform. Default \code{30}.
#' @param nRandomStarts Number of restarts to perform (iNMF objective function
#' is non-convex, so taking the best objective from multiple successive
#' initialization is recommended). For easier reproducibility, this increments
#' the random seed by 1 for each consecutive restart, so future factorization
#' of the same dataset can be run with one rep if necessary. Default \code{1}.
#' @param HInit Initial values to use for \eqn{H} matrices. A list object where
#' each element is the initial \eqn{H} matrix of each dataset. Default
#' \code{NULL}.
#' @param WInit Initial values to use for \eqn{W} matrix. A matrix object.
#' Default \code{NULL}.
#' @param VInit Initial values to use for \eqn{V} matrices. A list object where
#' each element is the initial \eqn{V} matrix of each dataset. Default
#' \code{NULL}.
#' @param seed Random seed to allow reproducible results. Default \code{1}.
#' @param nCores The number of parallel tasks to speed up the computation.
#' Default \code{2L}. Only supported for platform with OpenMP support.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @param ... Arguments passed to methods.
#' @rdname runINMF
#' @export
#' @return
#' \itemize{
#'  \item{liger method - Returns updated input \linkS4class{liger} object
#'  \itemize{
#'      \item{A list of all \eqn{H} matrices can be accessed with
#'          \code{getMatrix(object, "H")}}
#'      \item{A list of all \eqn{V} matrices can be accessed with
#'          \code{getMatrix(object, "V")}}
#'      \item{The \eqn{W} matrix can be accessed with
#'          \code{getMatrix(object, "W")}}
#'  }}
#'  \item{Seurat method - Returns updated input Seurat object
#'  \itemize{
#'      \item{\eqn{H} matrices for all datasets will be concatenated and
#'          transposed (all cells by k), and form a DimReduc object in the
#'          \code{reductions} slot named by argument \code{reduction}.}
#'      \item{\eqn{W} matrix will be presented as \code{feature.loadings} in the
#'          same DimReduc object.}
#'      \item{\eqn{V} matrices, an objective error value and the dataset
#'          variable used for the factorization is currently stored in
#'          \code{misc} slot of the same DimReduc object.}
#'  }}
#' }
#' @references Joshua D. Welch and et al., Single-Cell Multi-omic Integration
#' Compares and Contrasts Features of Brain Cell Identity, Cell, 2019
#' @examples
#' pbmc <- normalize(pbmc)
#' pbmc <- selectGenes(pbmc)
#' pbmc <- scaleNotCenter(pbmc)
#' if (requireNamespace("RcppPlanc", quietly = TRUE)) {
#'     pbmc <- runINMF(pbmc)
#' }
runINMF <- function(
        object,
        k = 20,
        lambda = 5.0,
        ...
) {
    UseMethod("runINMF", object)
}

#' @rdname runINMF
#' @export
#' @method runINMF liger
runINMF.liger <- function(
        object,
        k = 20,
        lambda = 5.0,
        nIteration = 30,
        nRandomStarts = 1,
        HInit = NULL,
        WInit = NULL,
        VInit = NULL,
        seed = 1,
        nCores = 2L,
        verbose = getOption("ligerVerbose", TRUE),
        ...
) {
    .checkObjVersion(object)
    object <- recordCommand(object, ..., dependencies = "RcppPlanc")
    object <- removeMissing(object, orient = "cell", verbose = verbose)
    data <- lapply(datasets(object), function(ld) {
        if (is.null(scaleData(ld)))
            cli::cli_abort("Scaled data not available. Run {.fn scaleNotCenter} first.")
        return(scaleData(ld))
    })
    dataClasses <- sapply(data, function(x) class(x)[1])
    if (!all(dataClasses == dataClasses[1])) {
        cli::cli_abort("Currently the scaledData of all datasets have to be of the same class.")
    }
    out <- .runINMF.list(
        object = data,
        k = k,
        lambda = lambda,
        nIteration = nIteration,
        nRandomStarts = nRandomStarts,
        HInit = HInit,
        WInit = WInit,
        VInit = VInit,
        seed = seed,
        nCores = nCores,
        verbose = verbose
    )

    object@W <- out$W
    rownames(object@W) <- varFeatures(object)
    for (d in names(object)) {
        ld <- dataset(object, d)
        ld@H <- out$H[[d]]
        ld@V <- out$V[[d]]
        if (isH5Liger(ld)) {
            colnames(ld@H) <- colnames(ld)
            rownames(ld@V) <- varFeatures(object)
        }
        datasets(object, check = FALSE)[[d]] <- ld
    }
    object@uns$factorization <- list(k = k, lambda = lambda)
    return(object)
}

#' @rdname runINMF
#' @export
#' @param datasetVar Metadata variable name that stores the dataset source
#' annotation. Default \code{"orig.ident"}.
#' @param layer For Seurat>=4.9.9, the name of layer to retrieve input
#' non-negative scaled data. Default \code{"ligerScaleData"}. For older Seurat,
#' always retrieve from \code{scale.data} slot.
#' @param assay Name of assay to use. Default \code{NULL} uses current active
#' assay.
#' @param reduction Name of the reduction to store result. Also used as the
#' feature key. Default \code{"inmf"}.
#' @method runINMF Seurat
runINMF.Seurat <- function(
        object,
        k = 20,
        lambda = 5.0,
        datasetVar = "orig.ident",
        layer = "ligerScaleData",
        assay = NULL,
        reduction = "inmf",
        nIteration = 30,
        nRandomStarts = 1,
        HInit = NULL,
        WInit = NULL,
        VInit = NULL,
        seed = 1,
        nCores = 2L,
        verbose = getOption("ligerVerbose", TRUE),
        ...
) {
    assay <- assay %||% SeuratObject::DefaultAssay(object)
    Es <- .getSeuratData(object, layer = layer, slot = "scale.data",
                              assay = assay)
    # the last [,1] converts data.frame to the vector/factor
    datasetVar <- object[[datasetVar]][,1]
    if (!is.factor(datasetVar)) datasetVar <- factor(datasetVar)
    datasetVar <- droplevels(datasetVar)

    if (!is.list(Es)) {
        Es <- splitRmMiss(Es, datasetVar)
        Es <- lapply(Es, methods::as, Class = "CsparseMatrix")
    }
    for (i in seq_along(Es)) {
        if (any(Es[[i]]@x < 0)) {
            cli::cli_abort("Negative data encountered for integrative {.emph Non-negative} Matrix Factorization.
                           Please run {.fn scaleNotCenter} first.")
        }
    }

    res <- .runINMF.list(
        object = Es,
        k = k,
        lambda = lambda,
        nIteration = nIteration,
        nRandomStarts = nRandomStarts,
        HInit = HInit,
        WInit = WInit,
        VInit = VInit,
        seed = seed,
        nCores = nCores,
        verbose = verbose
    )
    Hconcat <- t(Reduce(cbind, res$H))
    colnames(Hconcat) <- paste0(reduction, "_", seq_len(ncol(Hconcat)))
    object[[reduction]] <- Seurat::CreateDimReducObject(
        embeddings = Hconcat,
        loadings = res$W,
        assay = assay,
        misc = list(V = res$V, objErr = res$objErr, dataset = datasetVar)
    )
    return(object)
}

#' @param barcodeList List object of barcodes for each datasets, for setting
#' dimnames of output \eqn{H} matrices. Default \code{NULL} uses \code{colnames}
#' of matrices in the \code{object}.
#' @param features Character vector of feature names, for setting dimnames of
#' output \eqn{V} and \eqn{W} matrices. Default \code{NULL} uses \code{rownames}
#' of matrices in the \code{object}.
#' @return The list method returns a list of entries \code{H}, \code{V} and
#' \code{W}. \code{H} is a list of \eqn{H} matrices for each dataset. \code{V}
#' is a list of \eqn{V} matrices for each dataset. \code{W} is the shared
#' \eqn{W} matrix.
#' @noRd
.runINMF.list <- function(
        object,
        k = 20,
        lambda = 5.0,
        nIteration = 30,
        nRandomStarts = 1,
        HInit = NULL,
        WInit = NULL,
        VInit = NULL,
        seed = 1,
        nCores = 2L,
        verbose = getOption("ligerVerbose", TRUE)
) {
    if (!requireNamespace("RcppPlanc", quietly = TRUE)) # nocov start
        cli::cli_abort(
        "Package {.pkg RcppPlanc} is required for iNMF integration.
        Please install it by command:
        {.code install.packages('RcppPlanc', repos = 'https:/welch-lab.r-universe.dev')}") # nocov end

    barcodeList <- lapply(object, colnames)
    allFeatures <- lapply(object, rownames)
    features <- Reduce(.same, allFeatures)

    if (min(lengths(barcodeList)) < k) {
        cli::cli_abort("Number of factors (k={k}) should be less than the number of cells in the smallest dataset ({min(lengths(barcodeList))}).")
    }
    if (length(features) < k) {
        cli::cli_abort("Number of factors (k={k}) should be less than the number of shared features ({length(features)}).")
    }

    bestResult <- list()
    bestObj <- Inf
    bestSeed <- seed
    for (i in seq(nRandomStarts)) {
        if (isTRUE(verbose) && nRandomStarts > 1) {
            cli::cli_alert_info("Replicate run [{i}/{nRandomStarts}]")
        }
        set.seed(seed = seed + i - 1)
        out <- RcppPlanc::inmf(objectList = object, k = k, lambda = lambda,
                               niter = nIteration, Hinit = HInit,
                               Vinit = VInit, Winit = WInit, nCores = nCores,
                               verbose = verbose)
        if (out$objErr < bestObj) {
            bestResult <- out
            bestObj <- out$objErr
            bestSeed <- seed + i - 1
        }
    }
    if (isTRUE(verbose) && nRandomStarts > 1) {
        cli::cli_alert_success("Best objective error: {bestObj}; Best seed: {bestSeed}")
    }

    factorNames <- paste0("Factor_", seq(k))
    for (i in seq_along(object)) {
        bestResult$H[[i]] <- t(bestResult$H[[i]])
        dimnames(bestResult$H[[i]]) <- list(factorNames, barcodeList[[i]])
        dimnames(bestResult$V[[i]]) <- list(features, factorNames)
    }
    names(bestResult$V) <- names(bestResult$H) <- names(object)
    dimnames(bestResult$W) <- list(features, factorNames)
    return(bestResult)
}

#' `r lifecycle::badge("deprecated")` Perform iNMF on scaled datasets
#' @description
#' \bold{Please turn to \code{\link{runINMF}} or \code{\link{runIntegration}}}.
#'
#' Perform integrative non-negative matrix factorization to return factorized H,
#' W, and V matrices. It optimizes the iNMF objective function using block
#' coordinate descent (alternating non-negative least squares), where the number
#' of factors is set by k. TODO: include objective function equation here in
#' documentation (using deqn)
#'
#' For each dataset, this factorization produces an H matrix (cells by k), a V
#' matrix (k by genes), and a shared W matrix (k by genes). The H matrices
#' represent the cell factor loadings. W is held consistent among all datasets,
#' as it represents the shared components of the metagenes across datasets. The
#' V matrices represent the dataset-specific components of the metagenes.
#' @param object \code{liger} object. Should normalize, select genes, and scale
#' before calling.
#' @param k Inner dimension of factorization (number of factors). Run suggestK
#' to determine appropriate value; a general rule of thumb is that a higher k
#' will be needed for datasets with more sub-structure.
#' @param lambda Regularization parameter. Larger values penalize
#' dataset-specific effects more strongly (ie. alignment should increase as
#' lambda increases). Run suggestLambda to determine most appropriate value for
#' balancing dataset alignment and agreement (default 5.0).
#' @param thresh Convergence threshold. Convergence occurs when
#' |obj0-obj|/(mean(obj0,obj)) < thresh. (default 1e-6)
#' @param max.iters Maximum number of block coordinate descent iterations to
#' perform (default 30).
#' @param nrep Number of restarts to perform (iNMF objective function is
#' non-convex, so taking the best objective from multiple successive
#' initializations is recommended). For easier reproducibility, this increments
#' the random seed by 1 for each consecutive restart, so future factorizations
#' of the same dataset can be run with one rep if necessary. (default 1)
#' @param H.init Initial values to use for H matrices. (default NULL)
#' @param W.init Initial values to use for W matrix (default NULL)
#' @param V.init Initial values to use for V matrices (default NULL)
#' @param rand.seed Random seed to allow reproducible results (default 1).
#' @param print.obj Print objective function values after convergence (default
#' FALSE).
#' @param verbose Print progress bar/messages (TRUE by default)
#' @param ... Arguments passed to other methods
#' @return \code{liger} object with H, W, and V slots set.
#' @name optimizeALS-deprecated
#' @seealso \code{\link{rliger-deprecated}}
NULL

#' @rdname rliger-deprecated
#' @section \code{optimizeALS}:
#' For \code{optimizeALS}, use \code{\link{runIntegration}} or
#' \code{\link{runINMF}}. For the case of
#' \code{optimizeALS(use.unshared = TRUE)}, use \code{\link{runIntegration}}
#' with \code{method = "UINMF"} or \code{\link{runUINMF}} instead.
#' @export
optimizeALS <- function( # nocov start
        object,
        k,
        lambda = 5.0,
        thresh = NULL,
        max.iters = 30,
        nrep = 1,
        H.init = NULL,
        W.init = NULL,
        V.init = NULL,
        use.unshared = FALSE,
        rand.seed = 1,
        print.obj = NULL,
        verbose = TRUE,
        ...
) {
    if (isTRUE(use.unshared)) {
        lifecycle::deprecate_warn(
            "1.99.0", "optimizeALS(use.unshared = 'TRUE')",
            details = "Please use `runIntegration()` with `method = 'UINMF'` or `runUINMF()` instead."
        )
        # Call UINMF
        object <- runUINMF(object = object, k = k, lambda = lambda,
                           nIteration = max.iters, nRandomStarts = nrep,
                           seed = rand.seed, verbose = verbose)
    } else {
        lifecycle::deprecate_warn(
            "1.99.0", "optimizeALS()",
            "runIntegration()")
        object <- runINMF(object = object, k = k, lambda = lambda,
                          nIteration = max.iters, nRandomStarts = nrep,
                          HInit = H.init, WInit = W.init, VInit = V.init,
                          seed = rand.seed, verbose = verbose)
    }
    return(object)
} # nocov end

############################### online inmf ####################################

#' Perform online iNMF on scaled datasets
#' @description Perform online integrative non-negative matrix factorization to
#' represent multiple single-cell datasets in terms of \eqn{H}, \eqn{W}, and
#' \eqn{V} matrices. It optimizes the iNMF objective function (see
#' \code{\link{runINMF}}) using online learning (non-negative least squares for
#' \eqn{H} matrices, and hierarchical alternating least squares (HALS) for
#' \eqn{V} matrices and \eqn{W}), where the number of factors is set by
#' \code{k}. The function allows online learning in 3 scenarios:
#'
#' \enumerate{
#'  \item Fully observed datasets;
#'  \item Iterative refinement using continually arriving datasets;
#'  \item Projection of new datasets without updating the existing factorization
#' }
#'
#' All three scenarios require fixed memory independent of the number of cells.
#'
#' For each dataset, this factorization produces an \eqn{H} matrix (k by cell),
#' a \eqn{V} matrix (genes by k), and a shared \eqn{W}
#' matrix (genes by k). The \eqn{H} matrices represent the cell factor loadings.
#' \eqn{W} is identical among all datasets, as it represents the shared
#' components of the metagenes across datasets. The \eqn{V} matrices represent
#' the dataset-specific components of the metagenes.
#'
#' @details
#' For performing scenario 2 or 3, a complete set of factorization result from
#' a run of scenario 1 is required. Given the structure of a \linkS4class{liger}
#' object, all of the required information can be retrieved automatically.
#' Under the circumstance where users need customized information for existing
#' factorization, arguments \code{WInit}, \code{VInit}, \code{AInit} and
#' \code{BInit} are exposed. The requirements for these argument follows:
#' \itemize{
#'  \item{WInit - A matrix object of size \eqn{m \times k}. (see
#'      \code{\link{runINMF}} for notation)}
#'  \item{VInit - A list object of matrices each of size \eqn{m \times k}.
#'      Number of matrices should match with \code{newDatasets}.}
#'  \item{AInit - A list object of matrices each of size \eqn{k \times k}.
#'      Number of matrices should match with \code{newDatasets}.}
#'  \item{BInit - A list object of matrices each of size \eqn{m \times k}.
#'      Number of matrices should match with \code{newDatasets}.}
#' }
#'
#' Minibatch iterations is performed on small subset of cells. The exact
#' minibatch size applied on each dataset is \code{minibatchSize} multiplied by
#' the proportion of cells in this dataset out of all cells. In general,
#' \code{minibatchSize} should be no larger than the number of cells in the
#' smallest dataset (considering both \code{object} and \code{newDatasets}).
#' Therefore, a smaller value may be necessary for analyzing very small
#' datasets.
#'
#' An epoch is one completion of calculation on all cells after a number of
#' iterations of minibatches. Therefore, the total number of iterations is
#' determined by the setting of \code{maxEpochs}, total number of cells, and
#' \code{minibatchSize}.
#'
#' Currently, Seurat S3 method does not support working on Scenario 2 and 3,
#' because there is no simple solution for organizing a number of miscellaneous
#' matrices with a single Seurat object. We strongly recommend that users create
#' a \linkS4class{liger} object which has the specific structure.
#' @param object \linkS4class{liger} object. Scaled data required.
#' @param newDatasets Named list of \link[Matrix]{dgCMatrix-class} object. New
#' datasets for scenario 2 or scenario 3. Default \code{NULL} triggers scenario
#' 1.
#' @param projection Whether to perform data integration with scenario 3 when
#' \code{newDatasets} is specified. See description. Default \code{FALSE}.
#' @param WInit,VInit,AInit,BInit Optional initialization for \eqn{W}, \eqn{V},
#' \eqn{A}, and \eqn{B} matrices, respectively. Must be presented all together.
#' See detail. Default \code{NULL}.
#' @param k Inner dimension of factorization--number of metagenes. A value in
#' the range 20-50 works well for most analyses. Default \code{20}.
#' @param lambda Regularization parameter. Larger values penalize
#' dataset-specific effects more strongly (i.e. alignment should increase as
#' lambda increases). We recommend always using the default value except
#' possibly for analyses with relatively small differences (biological
#' replicates, male/female comparisons, etc.) in which case a lower value such
#' as 1.0 may improve reconstruction quality. Default \code{5.0}.
#' @param maxEpochs The number of epochs to iterate through. See detail.
#' Default \code{5}.
#' @param HALSiter Maximum number of block coordinate descent (HALS
#' algorithm) iterations to perform for each update of \eqn{W} and \eqn{V}.
#' Default \code{1}. Changing this parameter is not recommended.
#' @param minibatchSize Total number of cells in each minibatch. See detail.
#' Default \code{5000}.
#' @param seed Random seed to allow reproducible results. Default \code{1}.
#' @param nCores The number of parallel tasks to speed up the computation.
#' Default \code{2L}. Only supported for platform with OpenMP support.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @param ... Arguments passed to other S3 methods of this function.
#' @return
#' \itemize{
#'  \item{liger method - Returns updated input \linkS4class{liger} object.
#'  \itemize{
#'      \item{A list of all \eqn{H} matrices can be accessed with
#'          \code{getMatrix(object, "H")}}
#'      \item{A list of all \eqn{V} matrices can be accessed with
#'          \code{getMatrix(object, "V")}}
#'      \item{The \eqn{W} matrix can be accessed with
#'          \code{getMatrix(object, "W")}}
#'      \item{Meanwhile, intermediate matrices \eqn{A} and \eqn{B} produced in
#'          HALS update can also be accessed similarly.}
#'  }
#'  }
#'  \item{Seurat method - Returns updated input Seurat object.
#'  \itemize{
#'      \item{\eqn{H} matrices for all datasets will be concatenated and
#'          transposed (all cells by k), and form a DimReduc object in the
#'          \code{reductions} slot named by argument \code{reduction}.}
#'      \item{\eqn{W} matrix will be presented as \code{feature.loadings} in the
#'          same DimReduc object.}
#'      \item{\eqn{V} matrices, \eqn{A} matrices, \eqn{B} matricesm an objective
#'          error value and the dataset variable used for the factorization is
#'          currently stored in \code{misc} slot of the same DimReduc object.}
#'  }}
#' }
#' @references Chao Gao and et al., Iterative single-cell multi-omic integration
#' using online learning, Nat Biotechnol., 2021
#' @export
#' @rdname runOnlineINMF
#' @examples
#' pbmc <- normalize(pbmc)
#' pbmc <- selectGenes(pbmc)
#' pbmc <- scaleNotCenter(pbmc)
#' if (requireNamespace("RcppPlanc", quietly = TRUE)) {
#'     # Scenario 1
#'     pbmc <- runOnlineINMF(pbmc, minibatchSize = 200)
#'     # Scenario 2
#'     # Fake new dataset by increasing all non-zero value in "ctrl" by 1
#'     ctrl2 <- rawData(dataset(pbmc, "ctrl"))
#'     ctrl2@x <- ctrl2@x + 1
#'     colnames(ctrl2) <- paste0(colnames(ctrl2), 2)
#'     pbmc2 <- runOnlineINMF(pbmc, k = 20, newDatasets = list(ctrl2 = ctrl2),
#'                            minibatchSize = 100)
#'     # Scenario 3
#'     pbmc3 <- runOnlineINMF(pbmc, k = 20, newDatasets = list(ctrl2 = ctrl2),
#'                            projection = TRUE)
#' }
runOnlineINMF <- function(
        object,
        k = 20,
        lambda = 5,
        ...
) {
    UseMethod("runOnlineINMF", object)
}

#' @export
#' @rdname runOnlineINMF
#' @method runOnlineINMF liger
runOnlineINMF.liger <- function(
        object,
        k = 20,
        lambda = 5,
        newDatasets = NULL,
        projection = FALSE,
        maxEpochs = 5,
        HALSiter = 1,
        minibatchSize = 5000,
        WInit = NULL,
        VInit = NULL,
        AInit = NULL,
        BInit = NULL,
        seed = 1,
        nCores = 2L,
        verbose = getOption("ligerVerbose", TRUE),
        ...
) {
    .checkObjVersion(object)
    object <- recordCommand(object, ..., dependencies = c("RcppPlanc"))
    Es <- getMatrix(object, "scaleData", returnList = TRUE)
    Es <- lapply(datasets(object), function(ld) {
        sd <- scaleData(ld)
        if (is.null(sd))
            cli::cli_abort("Scaled data not available. Run {.fn scaleNotCenter} first.")
        # if (inherits(sd, "H5D")) return(.H5DToH5Mat(sd))
        # else
        if (inherits(sd, "H5Group"))
            return(.H5GroupToH5SpMat(sd, c(length(varFeatures(object)),
                                           ncol(ld))))
        else return(sd)
    })
    if (!is.null(newDatasets)) {
        WInit <- WInit %||% getMatrix(object, "W", returnList = FALSE)
        VInit <- VInit %||% getMatrix(object, "V", returnList = TRUE)
        AInit <- AInit %||% getMatrix(object, "A", returnList = TRUE)
        BInit <- BInit %||% getMatrix(object, "B", returnList = TRUE)
        if (is.null(WInit) || any(sapply(VInit, is.null)) ||
            any(sapply(AInit, is.null)) || any(sapply(BInit, is.null))) {
            cli::cli_abort(
                "Cannot find complete online iNMF result for current datasets.
                Please run {.fn runOnlineINMF} without {.code newDataset} first"
            )
        }

        newNames <- names(newDatasets)
        if (any(newNames %in% names(object))) {
            cli::cli_abort("Names of {.var newDatasets} overlap with existing datasets.")
        }
        if (is.list(newDatasets)) {
            # A list of raw data
            if (is.null(names(newDatasets))) {
                cli::cli_abort("The list of new datasets must be named.")
            }
            for (i in seq_along(newDatasets)) {
                if (inherits(newDatasets[[i]], "dgCMatrix")) {
                    dataset(object, names(newDatasets)[i]) <- newDatasets[[i]]
                } else if (is.character(newDatasets[[i]])) {
                    # Assume is H5 filename
                    ld <- createH5LigerDataset(newDatasets[[i]])
                    dataset(object, names(newDatasets[i])) <- ld
                } else {
                    cli::cli_abort("Cannot interpret {.var newDatasets} element {i}")
                }
            }
        } else if (inherits(newDatasets, "liger")) {
            # A liger object with all new datasets
            object <- c(object, newDatasets)
        } else {
            cli::cli_abort("{.var newDatasets} must be either a named list or a {.cls liger} object")
        }

        object <- normalize(object, useDatasets = newNames)
        object <- scaleNotCenter(object, useDatasets = newNames)
        newDatasets <- list()
        for (d in newNames) {
            ld <- dataset(object, d)
            sd <- scaleData(ld)
            # if (inherits(sd, "H5D")) {
            #     newDatasets[[d]] <- .H5DToH5Mat(sd)
            # } else
            if (inherits(sd, "H5Group")) {
                newDatasets[[d]] <- .H5GroupToH5SpMat(sd, c(length(varFeatures(object)), ncol(ld)))
            } else {
                newDatasets[[d]] <- sd
            }
        }
    }
    closeAllH5(object)
    res <- .runOnlineINMF.list(Es, newDatasets = newDatasets,
                               projection = projection, k = k, lambda = lambda,
                               maxEpochs = maxEpochs,
                               minibatchSize = minibatchSize,
                               HALSiter = HALSiter, verbose = verbose,
                               WInit = WInit, VInit = VInit, AInit = AInit,
                               BInit = BInit, seed = seed, nCores = nCores)
    if (!isTRUE(projection)) {
        # Scenario 1&2, everything updated
        for (i in seq_along(object)) {
            ld <- dataset(object, i)
            ld@H <- res$H[[i]]
            ld@V <- res$V[[i]]
            ld@A <- res$A[[i]]
            ld@B <- res$B[[i]]
            if (isH5Liger(ld)) {
                colnames(ld@H) <- colnames(ld)
                rownames(ld@V) <- varFeatures(object)
                rownames(ld@B) <- varFeatures(object)
            }
            datasets(object, check = FALSE)[[i]] <- ld
        }
        object@W <- res$W
        rownames(object@W) <- varFeatures(object)
    } else {
        # Scenario 3, only H of newDatasets returned
        for (i in seq_along(newDatasets)) {
            dname <- names(newDatasets)[i]
            ld <- dataset(object, dname)
            ld@H <- res$H[[i]]
            if (isH5Liger(ld)) {
                colnames(ld@H) <- colnames(ld)
            }
            datasets(object, check = FALSE)[[dname]] <- ld
        }
    }
    object@uns$factorization <- list(k = k, lambda = lambda)
    suppressMessages({object <- restoreH5Liger(object)})
    return(object)
}

.runOnlineINMF.list <- function(
        object,
        k = 20,
        lambda = 5,
        newDatasets = NULL,
        projection = FALSE,
        maxEpochs = 5,
        WInit = NULL,
        VInit = NULL,
        AInit = NULL,
        BInit = NULL,
        HALSiter = 1,
        minibatchSize = 5000,
        seed = 1,
        nCores = 2L,
        verbose = getOption("ligerVerbose", TRUE),
        ...
) {
    if (!requireNamespace("RcppPlanc", quietly = TRUE)) # nocov start
        cli::cli_abort(
            "Package {.pkg RcppPlanc} is required for online iNMF integration.
        Please install it by command:
        {.code install.packages('RcppPlanc', repos = 'https:/welch-lab.r-universe.dev')}") # nocov end
    nDatasets <- length(object) + length(newDatasets)
    barcodeList <- c(lapply(object, colnames), lapply(newDatasets, colnames))
    names(barcodeList) <- c(names(object), names(newDatasets))
    allFeatures <- c(lapply(object, rownames), lapply(newDatasets, rownames))
    features <- Reduce(.same, allFeatures)

    # In the case for H5 liger, we don't have dimnames associated with input
    # NOTE: RcppPlanc got dim method for the H5SpMat class.
    ncellPerDataset <- unlist(c(sapply(object, ncol), sapply(newDatasets, ncol)))
    nFeaturePerDataset <- unlist(c(sapply(object, nrow), sapply(newDatasets, nrow)))
    if (min(ncellPerDataset) < k) {
        cli::cli_abort("Number of factors (k={k}) should be less than the number of cells in the smallest dataset ({min(ncellPerDataset)}).")
    }
    if (nFeaturePerDataset[1] < k) {
        cli::cli_abort("Number of factors (k={k}) should be less than the number of shared features ({nFeaturePerDataset}).")
    }

    if (!is.null(seed)) set.seed(seed)

    res <- RcppPlanc::onlineINMF(objectList = object, newDatasets = newDatasets,
                                 project = projection, k = k, lambda = lambda,
                                 maxEpoch = maxEpochs,
                                 minibatchSize = minibatchSize,
                                 maxHALSIter = HALSiter, Vinit = VInit,
                                 Winit = WInit, Ainit = AInit, Binit = BInit,
                                 nCores = nCores, verbose = verbose)
    factorNames <- paste0("Factor_", seq(k))
    if (isTRUE(projection)) {
        # Scenario 3 only got H for new datasets
        for (i in seq_along(newDatasets)) {
            dname <- names(newDatasets)[i]
            res$H[[i]] <- t(res$H[[i]])
            dimnames(res$H[[i]]) <- list(factorNames, barcodeList[[dname]])
            names(res$H) <- names(newDatasets)
        }
    } else {
        # Scenario 1&2 got everything
        for (i in seq(nDatasets)) {
            res$H[[i]] <- t(res$H[[i]])
            dimnames(res$H[[i]]) <- list(factorNames, barcodeList[[i]])
            dimnames(res$V[[i]]) <- list(features, factorNames)
            dimnames(res$A[[i]]) <- list(factorNames, factorNames)
            dimnames(res$B[[i]]) <- list(features, factorNames)
        }
        names(res$B) <- names(res$A) <- names(res$V) <- names(res$H) <-
            names(barcodeList)
        dimnames(res$W) <- list(features, factorNames)
    }
    return(res)
}

#' @export
#' @rdname runOnlineINMF
#' @method runOnlineINMF Seurat
#' @param datasetVar Metadata variable name that stores the dataset source
#' annotation. Default \code{"orig.ident"}.
#' @param layer For Seurat>=4.9.9, the name of layer to retrieve input
#' non-negative scaled data. Default \code{"ligerScaleData"}. For older Seurat,
#' always retrieve from \code{scale.data} slot.
#' @param assay Name of assay to use. Default \code{NULL} uses current active
#' assay.
#' @param reduction Name of the reduction to store result. Also used as the
#' feature key. Default \code{"onlineINMF"}.
runOnlineINMF.Seurat <- function(
        object,
        k = 20,
        lambda = 5,
        datasetVar = "orig.ident",
        layer = "ligerScaleData",
        assay = NULL,
        reduction = "onlineINMF",
        maxEpochs = 5,
        HALSiter = 1,
        minibatchSize = 5000,
        seed = 1,
        nCores = 2L,
        verbose = getOption("ligerVerbose", TRUE),
        ...
) {
    assay <- assay %||% SeuratObject::DefaultAssay(object)
    Es <- .getSeuratData(object, layer = layer, slot = "scale.data",
                         assay = assay)
    # the last [,1] converts data.frame to the vector/factor
    datasetVar <- object[[datasetVar]][,1]
    if (!is.factor(datasetVar)) datasetVar <- factor(datasetVar)
    datasetVar <- droplevels(datasetVar)

    if (!is.list(Es)) {
        Es <- splitRmMiss(Es, datasetVar)
        Es <- lapply(Es, methods::as, Class = "CsparseMatrix")
    }
    for (i in seq_along(Es)) {
        if (any(Es[[i]]@x < 0)) {
            cli::cli_abort("Negative data encountered for integrative {.emph Non-negative} Matrix Factorization.
                           Please run {.fn scaleNotCenter} first.")
        }
    }

    res <- .runOnlineINMF.list(
        object = Es, k = k, lambda = lambda,
        newDatasets = NULL, projection = FALSE,
        maxEpochs = maxEpochs, HALSiter = HALSiter,
        minibatchSize = minibatchSize, seed = seed, verbose = verbose,
        nCores = nCores,
        WInit = NULL, VInit = NULL, AInit = NULL, BInit = NULL,
    )
    Hconcat <- t(Reduce(cbind, res$H))
    colnames(Hconcat) <- paste0(reduction, "_", seq_len(ncol(Hconcat)))
    object[[reduction]] <- Seurat::CreateDimReducObject(
        embeddings = Hconcat,
        loadings = res$W,
        assay = assay,
        misc = list(V = res$V, A = res$A, B = res$B, objErr = res$objErr,
                    dataset = datasetVar)
    )
    return(object)
}

#' `r lifecycle::badge("deprecated")` Perform online iNMF on scaled datasets
#' @description
#' \bold{Please turn to \code{\link{runOnlineINMF}} or
#' \code{\link{runIntegration}}}.
#'
#' Perform online integrative non-negative matrix factorization to represent
#' multiple single-cell datasets in terms of H, W, and V matrices. It optimizes
#' the iNMF objective function using online learning (non-negative least squares
#' for H matrix, hierarchical alternating least squares for W and V matrices),
#' where the number of factors is set by k. The function allows online learning
#' in 3 scenarios: (1) fully observed datasets; (2) iterative refinement using
#' continually arriving datasets; and (3) projection of new datasets without
#' updating the existing factorization. All three scenarios require fixed memory
#' independent of the number of cells.
#'
#' For each dataset, this factorization produces an H matrix (cells by k), a V
#' matrix (k by genes), and a shared W matrix (k by genes). The H matrices
#' represent the cell factor loadings. W is identical among all datasets, as it
#' represents the shared components of the metagenes across datasets. The V
#' matrices represent the dataset-specific components of the metagenes.
#' @param object \code{liger} object with data stored in HDF5 files. Should
#' normalize, select genes, and scale before calling.
#' @param X_new List of new datasets for scenario 2 or scenario 3. Each list
#' element should be the name of an HDF5 file.
#' @param projection Perform data integration by shared metagene (W) projection
#' (scenario 3). (default FALSE)
#' @param W.init Optional initialization for W. (default NULL)
#' @param V.init Optional initialization for V (default NULL)
#' @param H.init Optional initialization for H (default NULL)
#' @param A.init Optional initialization for A (default NULL)
#' @param B.init Optional initialization for B (default NULL)
#' @param k Inner dimension of factorization--number of metagenes (default 20).
#' A value in the range 20-50 works well for most analyses.
#' @param lambda Regularization parameter. Larger values penalize
#' dataset-specific effects more
#'   strongly (ie. alignment should increase as lambda increases). We recommend
#'   always using the default value except
#'   possibly for analyses with relatively small differences (biological
#'   replicates, male/female comparisons, etc.)
#'   in which case a lower value such as 1.0 may improve reconstruction quality.
#'   (default 5.0).
#' @param max.epochs Maximum number of epochs (complete passes through the
#' data). (default 5)
#' @param miniBatch_max_iters Maximum number of block coordinate descent (HALS
#' algorithm) iterations to perform for each update of W and V (default 1).
#' Changing this parameter is not  recommended.
#' @param miniBatch_size Total number of cells in each minibatch (default 5000).
#' This is a reasonable default, but a smaller value such as 1000 may be
#' necessary for analyzing very small datasets. In general, minibatch size
#' should be no larger than the number of cells in the smallest dataset.
#' @param h5_chunk_size Chunk size of input hdf5 files (default 1000). The chunk
#' size should be no larger than the batch size.
#' @param seed Random seed to allow reproducible results (default 123).
#' @param verbose Print progress bar/messages (TRUE by default)
#' @return \code{liger} object with H, W, V, A and B slots set.
#' @name online_iNMF-deprecated
NULL

#' @rdname rliger-deprecated
#' @section \code{online_iNMF}:
#' For \code{online_iNMF}, use \code{\link{runIntegration}} with
#' \code{method = "online"} or \code{\link{runOnlineINMF}}.
#' @export
online_iNMF <- function( # nocov start
        object,
        X_new = NULL,
        projection = FALSE,
        W.init = NULL,
        V.init = NULL,
        H.init = NULL,
        A.init = NULL,
        B.init = NULL,
        k = 20,
        lambda = 5,
        max.epochs = 5,
        miniBatch_max_iters = 1,
        miniBatch_size = 5000,
        h5_chunk_size = 1000,
        seed = 123,
        verbose = TRUE
) {
    lifecycle::deprecate_warn(
        "1.99.0", "online_iNMF()",
        details = "Please use `runIntegration()` with `method = 'online'`, or `runOnlineINMF()` instead."
    )
    object <- runOnlineINMF.liger(
        object = object, k = k, lambda = lambda, maxEpochs = max.epochs,
        HALSiter = miniBatch_max_iters, minibatchSize = miniBatch_size,
        seed = seed, verbose = verbose, newDatasets = X_new,
        projection = projection, WInit = W.init, VInit = V.init, AInit = A.init,
        BInit = B.init
    )
    return(object)
} # nocov end



################################### UINMF ######################################

#' Perform Mosaic iNMF (UINMF) on scaled datasets with unshared features
#' @description
#' Performs mosaic integrative non-negative matrix factorization (UINMF) (A.R.
#' Kriebel, 2022) using block coordinate descent (alternating non-negative
#' least squares, ANLS) to return factorized \eqn{H}, \eqn{W}, \eqn{V} and
#' \eqn{U} matrices. The objective function is stated as
#'
#' \deqn{\arg\min_{H\ge0,W\ge0,V\ge0,U\ge0}\sum_{i}^{d}
#' ||\begin{bmatrix}E_i \\ P_i \end{bmatrix} -
#' (\begin{bmatrix}W \\ 0 \end{bmatrix}+
#' \begin{bmatrix}V_i \\ U_i \end{bmatrix})Hi||^2_F+
#' \lambda_i\sum_{i}^{d}||\begin{bmatrix}V_i \\ U_i \end{bmatrix}H_i||_F^2}
#'
#' where \eqn{E_i} is the input non-negative matrix of the \eqn{i}'th dataset,
#' \eqn{P_i} is the input non-negative matrix for the unshared features,
#' \eqn{d} is the total number of datasets. \eqn{E_i} is of size
#' \eqn{m \times n_i} for \eqn{m} shared features and \eqn{n_i} cells, \eqn{P_i}
#' is of size \eqn{u_i \times n_i} for \eqn{u_i} unshared feaetures,
#' \eqn{H_i} is of size \eqn{k \times n_i}, \eqn{V_i} is of size
#' \eqn{m \times k}, \eqn{W} is of size \eqn{m \times k} and \eqn{U_i} is of
#' size \eqn{u_i \times k}.
#'
#' The factorization produces a shared \eqn{W} matrix (genes by k). For each
#' dataset, an \eqn{H} matrix (k by cells), a \eqn{V} matrix (genes by k) and
#' a \eqn{U} matrix (unshared genes by k). The \eqn{H} matrices represent the
#' cell factor loadings. \eqn{W} is held consistent among all datasets, as it
#' represents the shared components of the metagenes across datasets. The
#' \eqn{V} matrices represent the dataset-specific components of the metagenes,
#' \eqn{U} matrices are similar to \eqn{V}s but represents the loading
#' contributed by unshared features.
#'
#' This function adopts highly optimized fast and memory efficient
#' implementation extended from Planc (Kannan, 2016). Pre-installation of
#' extension package \code{RcppPlanc} is required. The underlying algorithm
#' adopts the identical ANLS strategy as \code{\link{optimizeALS}(unshared =
#' TRUE)} in the old version of LIGER.
#' @param object \linkS4class{liger} object. Should run
#' \code{\link{selectGenes}} with \code{unshared = TRUE} and then run
#' \code{\link{scaleNotCenter}} in advance.
#' @param k Inner dimension of factorization (number of factors). Generally, a
#' higher \code{k} will be needed for datasets with more sub-structure. Default
#' \code{20}.
#' @param lambda Regularization parameter. Larger values penalize
#' dataset-specific effects more strongly (i.e. alignment should increase as
#' \code{lambda} increases). Default \code{5}.
#' @param nIteration Total number of block coordinate descent iterations to
#' perform. Default \code{30}.
#' @param nRandomStarts Number of restarts to perform (iNMF objective function
#' is non-convex, so taking the best objective from multiple successive
#' initialization is recommended). For easier reproducibility, this increments
#' the random seed by 1 for each consecutive restart, so future factorization
#' of the same dataset can be run with one rep if necessary. Default \code{1}.
#' @param seed Random seed to allow reproducible results. Default \code{1}.
#' @param nCores The number of parallel tasks to speed up the computation.
#' Default \code{2L}. Only supported for platform with OpenMP support.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @param ... Arguments passed to other methods and wrapped functions.
#' @export
#' @references April R. Kriebel and Joshua D. Welch, UINMF performs mosaic
#' integration of single-cell multi-omic datasets using nonnegative matrix
#' factorization, Nat. Comm., 2022
#' @note
#' Currently, Seurat S3 method is not supported for UINMF because there is no
#' simple solution for organizing a number of miscellaneous matrices with a
#' single Seurat object. We strongly recommend that users create a
#' \linkS4class{liger} object which has the specific structure.
#' @return
#' \itemize{
#'  \item{liger method - Returns updated input \linkS4class{liger} object.
#'  \itemize{
#'      \item{A list of all \eqn{H} matrices can be accessed with
#'          \code{getMatrix(object, "H")}}
#'      \item{A list of all \eqn{V} matrices can be accessed with
#'          \code{getMatrix(object, "V")}}
#'      \item{The \eqn{W} matrix can be accessed with
#'          \code{getMatrix(object, "W")}}
#'      \item{A list of all \eqn{U} matrices can be accessed with
#'          \code{getMatrix(object, "U")}}
#'  }
#'  }
#' }
#' @rdname runUINMF
#' @examples
#' pbmc <- normalize(pbmc)
#' pbmc <- selectGenes(pbmc, useUnsharedDatasets = c("ctrl", "stim"))
#' pbmc <- scaleNotCenter(pbmc)
#' if (!is.null(getMatrix(pbmc, "scaleUnsharedData", "ctrl")) &&
#'     !is.null(getMatrix(pbmc, "scaleUnsharedData", "stim"))) {
#'     # TODO: unshared variable features cannot be detected from this example
#'     pbmc <- runUINMF(pbmc)
#' }
runUINMF <- function(
        object,
        k = 20,
        lambda = 5,
        ...
) {
    UseMethod("runUINMF", object)
}

#' @export
#' @rdname runUINMF
#' @method runUINMF liger
runUINMF.liger <- function(
        object,
        k = 20,
        lambda = 5,
        nIteration = 30,
        nRandomStarts = 1,
        seed = 1,
        nCores = 2L,
        verbose = getOption("ligerVerbose", TRUE),
        ...
) {
    .checkObjVersion(object)
    object <- recordCommand(object, ..., dependencies = "RcppPlanc")
    object <- removeMissing(object, orient = "cell", verbose = verbose)
    # Elist <- getMatrix(object, "scaleData", returnList = TRUE)

    Elist <- lapply(datasets(object), function(ld) {
        if (is.null(scaleData(ld)))
            cli::cli_abort("Scaled data not available. Run {.fn scaleNotCenter} first.")
        return(scaleData(ld))
    })
    Ulist <- getMatrix(object, "scaleUnsharedData", returnList = TRUE)
    if (all(sapply(Ulist, is.null))) {
        cli::cli_abort(
            "No scaled data for unshared feature found. Run {.fn selectGenes}
            with {.code useUnsharedDatasets} specified, and then {.fn scaleNotCenter}."
        )
    }
    res <- .runUINMF.list(Elist, Ulist, k = k, lambda = lambda,
                          nIteration = nIteration,
                          nRandomStarts = nRandomStarts, nCores = nCores,
                          seed = seed, verbose = verbose, ...)
    for (d in names(object)) {
        ld <- dataset(object, d)
        ld@H <- res$H[[d]]
        ld@V <- res$V[[d]]
        if (!is.null(ld@scaleUnsharedData)) {
            ld@U <- res$U[[d]]
        }
        datasets(object, check = FALSE)[[d]] <- ld
    }
    object@W <- res$W
    object@uns$factorization <- list(k = k, lambda = lambda)
    return(object)
}

#' @param unsharedList List of matrices for unshared features
#' @noRd
.runUINMF.list <- function(
        object,
        unsharedList,
        k = 20,
        lambda = 5,
        nIteration = 30,
        nRandomStarts = 1,
        seed = 1,
        nCores = 2L,
        verbose = getOption("ligerVerbose", TRUE)
) {
    if (!requireNamespace("RcppPlanc", quietly = TRUE)) # nocov start
        cli::cli_abort(
        "Package {.pkg RcppPlanc} is required for mosaic iNMF integration with unshared features.
        Please install it by command:
        {.code install.packages('RcppPlanc', repos = 'https:/welch-lab.r-universe.dev')}")# nocov end
    barcodeList <- lapply(object, colnames)
    allFeatures <- lapply(object, rownames)
    features <- Reduce(.same, allFeatures)

    if (min(lengths(barcodeList)) < k) {
        cli::cli_abort("Number of factors (k={k}) should be less than the number of cells in the smallest dataset ({min(lengths(barcodeList))}).")
    }

    bestObj <- Inf
    bestRes <- NULL
    bestSeed <- NULL
    for (i in seq(nRandomStarts)) {
        cli::cli_alert_info("Replicate start [{i}/{nRandomStarts}]")
        seed <- seed + i - 1
        set.seed(seed)
        res <- RcppPlanc::uinmf(object, unsharedList, k = k, lambda = lambda,
                                niter = nIteration, nCores = nCores,
                                verbose = verbose)
        if (res$objErr < bestObj) {
            bestRes <- res
            bestObj <- res$objErr
            bestSeed <- seed
        }
    }
    if (isTRUE(verbose) && nRandomStarts > 1) {
        cli::cli_alert_success("Best objective error: {bestObj}; Best seed: {bestSeed}")
    }
    rm(res)
    unsharedFeatures <- lapply(unsharedList, rownames)
    factorNames <- paste0("Factor_", seq(k))
    for (d in names(object)) {
        bestRes$H[[d]] <- t(bestRes$H[[d]])
        dimnames(bestRes$H[[d]]) <- list(factorNames, barcodeList[[d]])
        dimnames(bestRes$V[[d]]) <- list(features, factorNames)
        if (d %in% names(bestRes$U)) {
            dimnames(bestRes$U[[d]]) <- list(unsharedFeatures[[d]], factorNames)
        }
    }
    dimnames(bestRes$W) <- list(features, factorNames)
    return(bestRes)
}


#' Align factor loadings to get final integration
#' @description
#' This function is a wrapper to switch between alternative factor loading
#' alignment methods that LIGER provides, which is a required step for producing
#' the final integrated result. Two methods are provided (click on options for
#' more details):
#'
#' \itemize{
#' \item{\code{method = "\link{quantileNorm}"}: Previously published quantile
#' normalization method. (default)}
#' \item{\code{method = "\link{centroidAlign}"}: Newly developed centroid
#' alignment method. `r lifecycle::badge("experimental")`}
#' }
#' @export
#' @rdname alignFactors
#' @seealso \code{\link{quantileNorm}}, \code{\link{centroidAlign}}
#' @param object A \linkS4class{liger} or Seurat object with valid factorization
#' result available (i.e. \code{\link{runIntegration}} performed in advance).
#' @param method Character, method to align factors. Default
#' \code{"centroidAlign"}. Optionally \code{"quantileNorm"}.
#' @param ... Additional arguments passed to selected methods.
#' For \code{"quantileNorm"}:
#' \describe{
#'    \item{\code{quantiles}}{Number of quantiles to use for quantile
#'    normalization. Default \code{50}.}
#'    \item{\code{reference}}{Character, numeric or logical selection of one
#'    dataset, out of all available datasets in \code{object}, to use as a
#'    "reference" for quantile normalization. Default \code{NULL} tries to find
#'    an RNA dataset with the largest number of cells; if no RNA dataset
#'    available, use the globally largest dataset.}
#'    \item{\code{minCells}}{Minimum number of cells to consider a cluster
#'    shared across datasets. Default \code{20}.}
#'    \item{\code{nNeighbors}}{Number of nearest neighbors for within-dataset
#'    knn graph. Default \code{20}.}
#'    \item{\code{useDims}}{Indices of factors to use for shared nearest factor
#'    determination. Default \code{NULL} uses all factors.}
#'    \item{\code{center}}{Whether to center the data when scaling factors.
#'    Could be useful for less sparse modalities like methylation data.
#'    Default \code{FALSE}.}
#'    \item{\code{maxSample}}{Maximum number of cells used for quantile
#'    normalization of each cluster and factor. Default \code{1000}.}
#'    \item{\code{eps}}{The error bound of the nearest neighbor search. Lower
#'    values give more accurate nearest neighbor graphs but take much longer to
#'    compute. Default \code{0.9}.}
#'    \item{\code{refineKNN}}{Whether to increase robustness of cluster
#'    assignments using KNN graph. Default \code{TRUE}.}
#'    \item{\code{clusterName}}{Variable name that will store the clustering
#'    result in metadata of a \linkS4class{liger} object or a \code{Seurat}
#'    object. Default \code{"quantileNorm_cluster"}.}
#'    \item{\code{seed}}{Random seed to allow reproducible results. Default
#'    \code{1}.}
#'    \item{\code{verbose}}{Logical. Whether to show information of the
#'    progress. Default \code{getOption("ligerVerbose")} or \code{TRUE} if
#'    users have not set.}
#' }
#' For \code{"centroidAlign"} `r lifecycle::badge("experimental")`:
#' \describe{
#'    \item{\code{lambda}}{Ridge regression penalty applied to each dataset.
#'    Can be one number that applies to all datasets, or a numeric vector with
#'    length equal to the number of datasets. Default \code{1}.}
#'    \item{\code{useDims}}{Indices of factors to use considered for the
#'    alignment. Default \code{NULL} uses all factors.}
#'    \item{\code{scaleEmb}}{Logical, whether to scale the factor loading being
#'    considered as the embedding. Default \code{TRUE}.}
#'    \item{\code{centerEmb}}{Logical, whether to center the factor loading
#'    being considered as the embedding before scaling it. Default \code{TRUE}.}
#'    \item{\code{scaleCluster}}{Logical, whether to scale the factor loading
#'    being considered as the cluster assignment probability. Default
#'    \code{FALSE}.}
#'    \item{\code{centerCluster}}{Logical, whether to center the factor loading
#'    being considered as the cluster assignment probability before scaling it.
#'    Default \code{FALSE}.}
#'    \item{\code{shift}}{Logical, whether to shift the factor loading being
#'    considered as the cluster assignment probability after centered scaling.
#'    Default \code{FALSE}.}
#'    \item{\code{diagnosis}}{Logical, whether to return cell metadata variables
#'    with diagnostic information. Default \code{FALSE}.}
#'  }
alignFactors <- function(
        object,
        method = c("quantileNorm", "centroidAlign"),
        ...
) {
    UseMethod("alignFactors", object)
}

#' @export
#' @rdname alignFactors
#' @method alignFactors liger
alignFactors.liger <- function(
        object,
        method = c("quantileNorm", "centroidAlign"),
        ...
) {
    method <- match.arg(method)
    if (method == "centroidAlign") {
        object <- centroidAlign(object, ...)
    } else if (method == "quantileNorm") {
        object <- quantileNorm(object, ...)
    }
    return(object)
}

#' @export
#' @rdname alignFactors
#' @method alignFactors Seurat
alignFactors.Seurat <- function(
        object,
        method = c("quantileNorm", "centroidAlign"),
        ...
) {
    method <- match.arg(method)
    if (method == "centroidAlign") {
        object <- centroidAlign(object, ...)
    } else if (method == "quantileNorm") {
        object <- quantileNorm(object, ...)
    }
    return(object)
}

########################### Quantile Normalization #############################

#' Quantile Align (Normalize) Factor Loadings
#' @description This process builds a shared factor neighborhood graph to
#' jointly cluster cells, then quantile normalizes corresponding clusters.
#'
#' The first step, building the shared factor neighborhood graph, is performed
#' in SNF(), and produces a graph representation where edge weights between
#' cells (across all datasets) correspond to their similarity in the shared
#' factor neighborhood space. An important parameter here is \code{nNeighbors},
#' the number of neighbors used to build the shared factor space.
#'
#' Next we perform quantile alignment for each dataset, factor, and cluster (by
#' stretching/compressing datasets' quantiles to better match those of the
#' reference dataset).
#' @param object A \linkS4class{liger} or Seurat object with valid factorization
#' result available (i.e. \code{\link{runIntegration}} performed in advance).
#' @param quantiles Number of quantiles to use for quantile normalization.
#' Default \code{50}.
#' @param reference Character, numeric or logical selection of one dataset, out
#' of all available datasets in \code{object}, to use as a "reference" for
#' quantile normalization. Default \code{NULL} tries to find an RNA dataset with
#' the largest number of cells; if no RNA dataset available, use the globally
#' largest dataset.
#' @param minCells Minimum number of cells to consider a cluster shared across
#' datasets. Default \code{20}.
#' @param nNeighbors Number of nearest neighbors for within-dataset knn graph.
#' Default \code{20}.
#' @param useDims Indices of factors to use for shared nearest factor
#' determination. Default \code{NULL} uses all factors.
#' @param center Whether to center the data when scaling factors. Could be
#' useful for less sparse modalities like methylation data. Default
#' \code{FALSE}.
#' @param maxSample Maximum number of cells used for quantile normalization of
#' each cluster and factor. Default \code{1000}.
#' @param eps The error bound of the nearest neighbor search. Lower values give
#' more accurate nearest neighbor graphs but take much longer to compute.
#' Default \code{0.9}.
#' @param refineKNN whether to increase robustness of cluster assignments using
#' KNN graph. Default \code{TRUE}.
#' @param clusterName Variable name that will store the clustering result
#' in metadata of a \linkS4class{liger} object or a \code{Seurat} object.
#' Default \code{"quantileNorm_cluster"}
#' @param seed Random seed to allow reproducible results. Default \code{1}.
#' @param verbose Logical. Whether to show information of the progress. Default
#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
#' @param ... Arguments passed to other S3 methods of this function.
#' @return Updated input object
#' \itemize{
#'  \item{liger method
#'  \itemize{
#'      \item{Update the \code{H.norm} slot for the alignment cell factor
#'          loading, ready for running graph based community detection
#'          clustering or dimensionality reduction for visualization.}
#'      \item{Update the \code{cellMata} slot with a cluster assignment basing
#'          on cell factor loading}
#'  }}
#'  \item{Seurat method
#'  \itemize{
#'      \item{Update the \code{reductions} slot with a new \code{DimReduc}
#'          object containing the aligned cell factor loading.}
#'      \item{Update the metadata with a cluster assignment basing on cell
#'          factor loading}
#'  }}
#' }
#' @examples
#' pbmc <- quantileNorm(pbmcPlot)
#' @export
#' @rdname quantileNorm
quantileNorm <- function(
        object,
        ...
) {
    UseMethod("quantileNorm", object)
}

#' @export
#' @rdname quantileNorm
#' @method quantileNorm liger
quantileNorm.liger <- function(
        object,
        quantiles = 50,
        reference = NULL,
        minCells = 20,
        nNeighbors = 20,
        useDims = NULL,
        center = FALSE,
        maxSample = 1000,
        eps = 0.9,
        refineKNN = TRUE,
        clusterName = "quantileNorm_cluster",
        seed = 1,
        verbose = getOption("ligerVerbose", TRUE),
        ...
) {
    .checkObjVersion(object)
    .checkValidFactorResult(object, checkV = FALSE)
    # Choose largest RNA dataset as default if possible, as per Issue #297
    if (is.null(reference)) {
        reference <- .autoFindRef_qn(object)
    } else {
        reference <- .checkUseDatasets(object, useDatasets = reference)
        if (length(reference) != 1) {
            cli::cli_abort("Should specify only one reference dataset.")
        }
        if (inherits(dataset(object, reference), c("ligerATACDataset", "ligerSpatialDataset", "ligerMethDataset"))) {
            cli::cli_alert_warning(
                "Dataset of {.val {modalOf(dataset(pbmc, reference))}} modality is not recommended to be set as reference."
            )
        }
    }

    object <- recordCommand(object, ..., dependencies = "RANN")

    out <- .quantileNorm.HList(
        object = getMatrix(object, "H"),
        quantiles = quantiles,
        reference = reference,
        minCells = minCells,
        nNeighbors = nNeighbors,
        useDims = useDims,
        center = center,
        maxSample = maxSample,
        eps = eps,
        refineKNN = refineKNN,
        seed = seed
    )
    object@H.norm <- out$H.norm
    cellMeta(object, clusterName, check = FALSE) <- out$clusters
    object@uns$alignmentMethod <- "quantileNorm"
    return(object)
}

#' @export
#' @rdname quantileNorm
#' @method quantileNorm Seurat
#' @param reduction Name of the reduction where LIGER integration result is
#' stored. Default \code{"inmf"}.
quantileNorm.Seurat <- function(
        object,
        reduction = "inmf",
        quantiles = 50,
        reference = NULL,
        minCells = 20,
        nNeighbors = 20,
        useDims = NULL,
        center = FALSE,
        maxSample = 1000,
        eps = 0.9,
        refineKNN = TRUE,
        clusterName = "quantileNorm_cluster",
        seed = 1,
        verbose = getOption("ligerVerbose", TRUE),
        ...
) {
    resName <- paste0(reduction, "Norm")
    reduction <- object[[reduction]]
    if (!inherits(reduction, "DimReduc")) {
        cli::cli_abort("Specified {.var reduction} does not points to a {.cls DimReduc}.")
    }
    # Retrieve some information. Might have better ways instead of using `@`
    ## Due to proper formatting in Seurat object, Hconcat is already cell x k
    Hconcat <- reduction[[]]
    datasetVar <- reduction@misc$dataset
    W <- reduction[]
    assay <- reduction@assay.used

    reference <- reference %||% names(which.max(table(datasetVar)))
    # Debating whether to implement something without the need to split it
    HList <- lapply(levels(datasetVar), function(d) {
        t(Hconcat[datasetVar == d, , drop = FALSE])
    })
    names(HList) <- levels(datasetVar)
    alignment <- .quantileNorm.HList(
        HList, quantiles = quantiles, reference = reference,
        minCells = minCells, nNeighbors = nNeighbors, useDims = useDims,
        center = center, maxSample = maxSample, eps = eps,
        refineKNN = refineKNN, seed = seed, verbose = verbose
    )
    reddim <- Seurat::CreateDimReducObject(
        embeddings = alignment$H.norm, loadings = W,
        assay = assay, key = paste0(resName, "_")
    )
    object[[resName]] <- reddim
    object[[paste0(resName, ".cluster")]] <- alignment$clusters
    return(object)
}

.quantileNorm.HList <- function(
        object,
        quantiles = 50,
        reference = NULL,
        minCells = 20,
        nNeighbors = 20,
        useDims = NULL,
        center = FALSE,
        maxSample = 1000,
        eps = 0.9,
        refineKNN = TRUE,
        seed = 1,
        verbose = getOption("ligerVerbose", TRUE)
) {
    set.seed(seed)
    if (is.character(reference)) {
        if (length(reference) != 1 || !reference %in% names(object))
            cli::cli_abort("Should specify one existing dataset as reference.")
    } else if (is.numeric(reference)) {
        if (length(reference) != 1 || reference > length(object))
            cli::cli_abort("Should specify one existing dataset as reference.")
    } else if (is.logical(reference)) {
        if (length(reference) != length(object) || sum(reference) != 1)
            cli::cli_abort("Should specify one existing dataset as reference.")
    } else {
        cli::cli_abort("Unable to understand {.var reference}. See {.code ?quantileNorm}.")
    }
    useDims <- useDims %||% seq_len(nrow(object[[1]]))
    # Transposing all H to cell x k
    Hs <- lapply(object, t)
    # fast max factor assignment with Rcpp code
    clusters <- lapply(Hs, max_factor_rcpp, dims_use = useDims, center = center)
    # TODO: Dumb to mess up factor with characters of numbers
    # Change to numeric in the future. Need to reproduce result for now.
    # clusterAssign <- factor(unlist(clusters))
    clusterAssign <- as.factor(unlist(lapply(clusters, as.character)))
    names(clusterAssign) <- unlist(lapply(Hs, rownames))
    # increase robustness of cluster assignments using knn graph
    if (isTRUE(refineKNN)) {
        for (i in seq_along(Hs)) {
            clustsH <- clusterAssign[rownames(Hs[[i]])]
            H_knn <- RANN::nn2(Hs[[i]], eps = eps, k = nNeighbors,
                               searchtype = "standard")
            # Rcpp method cluster_vote
            newClusts <- cluster_vote_rcpp(H_knn$nn.idx, clustsH)
            clusterAssign[rownames(Hs[[i]])] <- newClusts
        }
    }
    # TODO: This line need to be removed
    clusters <- lapply(Hs, function(H) clusterAssign[rownames(H)])
    dims <- ncol(Hs[[reference]])
    nClusters <- dims
    for (d in seq_along(Hs)) {
        for (c in seq(nClusters)) {
            # cells 2: cells in d'th dataset belonging to cluster c
            cellIdx2 <- clusters[[d]] == c
            # cells 1: cells in ref dataset belong to cluster c
            cellIdx1 <- clusters[[reference]] == c
            nCells2 <- sum(cellIdx2)
            nCells1 <- sum(cellIdx1)
            if (nCells1 < minCells || nCells2 < minCells) next
            for (k in seq(dims)) {
                if (nCells2 == 1) {
                    Hs[[d]][cellIdx2, k] <-
                        mean(Hs[[reference]][cellIdx1, k])
                    next
                }
                q2 <- stats::quantile(sample(Hs[[d]][cellIdx2, k],
                                             min(nCells2, maxSample)),
                                      seq(0, 1, by = 1 / quantiles))
                q1 <- stats::quantile(sample(Hs[[reference]][cellIdx1, k],
                                             min(nCells1, maxSample)),
                                      seq(0, 1, by = 1 / quantiles))
                if (sum(q1) == 0 | sum(q2) == 0 |
                    length(unique(q1)) < 2 | length(unique(q2)) < 2) {
                    newValue <- rep(0, nCells2)
                } else {
                    warp_func <- withCallingHandlers(
                        stats::approxfun(q2, q1, rule = 2),
                        warning = function(w) {
                            invokeRestart("muffleWarning")
                        }
                    )
                    newValue <- warp_func(Hs[[d]][cellIdx2, k])
                }
                Hs[[d]][cellIdx2, k] <- newValue
            }
        }
    }
    return(list('H.norm' = Reduce(rbind, Hs), 'clusters' = clusterAssign))
}

.autoFindRef_qn <- function(object) {
    notRecom <- c("ligerATACDataset", "ligerSpatialDataset", "ligerMethDataset")
    recom <- !sapply(datasets(object), inherits, what = notRecom)
    if (sum(recom) == 0) {
        cli::cli_alert_warning(
            "Auto selecting reference, dataset of recommended type not found."
        )
        ref <- names(which.max(sapply(datasets(object), ncol)))
        cli::cli_alert_info(
            "Using globally largest dataset as reference: {.val {ref}} with {lengths(object)[ref]} cells"
        )
    } else {
        ref <- names(which.max(sapply(datasets(object)[recom], ncol)))
        cli::cli_alert_info(
            "Using largest dataset of recommended type as reference: {.val {ref}} with {lengths(object)[ref]} cells"
        )
    }
    return(ref)
}

#' `r lifecycle::badge("superseded")` Quantile align (normalize) factor loading
#' @description
#' \bold{Please turn to \code{\link{quantileNorm}}.}
#'
#' This process builds a shared factor neighborhood graph to jointly cluster
#' cells, then quantile normalizes corresponding clusters.
#'
#' The first step, building the shared factor neighborhood graph, is performed
#' in SNF(), and produces a graph representation where edge weights between
#' cells (across all datasets) correspond to their similarity in the shared
#' factor neighborhood space. An important parameter here is knn_k, the number
#' of neighbors used to build the shared factor space.
#'
#' Next we perform quantile alignment for each dataset, factor, and cluster (by
#' stretching/compressing datasets' quantiles to better match those of the
#' reference dataset). These aligned factor loadings are combined into a single
#' matrix and returned as H.norm.
#'
#' @param object \code{liger} object. Should run optimizeALS before calling.
#' @param knn_k Number of nearest neighbors for within-dataset knn graph
#' (default 20).
#' @param ref_dataset Name of dataset to use as a "reference" for normalization.
#' By default, the dataset with the largest number of cells is used.
#' @param min_cells Minimum number of cells to consider a cluster shared across
#' datasets (default 20)
#' @param quantiles Number of quantiles to use for quantile normalization
#' (default 50).
#' @param eps  The error bound of the nearest neighbor search. (default 0.9)
#' Lower values give more accurate nearest neighbor graphs but take much longer
#' to computer.
#' @param dims.use Indices of factors to use for shared nearest factor
#' determination (default \code{1:ncol(H[[1]])}).
#' @param do.center Centers the data when scaling factors (useful for less
#' sparse modalities like methylation data). (default FALSE)
#' @param max_sample Maximum number of cells used for quantile normalization of
#' each cluster and factor. (default 1000)
#' @param refine.knn whether to increase robustness of cluster assignments using
#' KNN graph.(default TRUE)
#' @param rand.seed Random seed to allow reproducible results (default 1)
#' @return \code{liger} object with 'H.norm' and 'clusters' slot set.
#' @name quantile_norm-deprecated
#' @seealso \code{\link{rliger-deprecated}}
NULL

#' @rdname rliger-deprecated
#' @section \code{quantile_norm}:
#' For \code{quantile_norm}, use \code{\link{quantileNorm}}.
#' @export
quantile_norm <- function( # nocov start
        object,
        quantiles = 50,
        ref_dataset = NULL,
        min_cells = 20,
        knn_k = 20,
        dims.use = NULL,
        do.center = FALSE,
        max_sample = 1000,
        eps = 0.9,
        refine.knn = TRUE,
        clusterName = "H.norm_cluster",
        rand.seed = 1,
        verbose = getOption("ligerVerbose", TRUE)
) {
    lifecycle::deprecate_warn("1.99.0", "quantile_norm()", "quantileNorm()")
    quantileNorm(
        object = object,
        quantiles = quantiles,
        reference = ref_dataset,
        minCells = min_cells,
        nNeighbors = knn_k,
        useDims = dims.use,
        center = do.center,
        maxSample = max_sample,
        eps = eps,
        refineKNN = refine.knn,
        seed = rand.seed,
        verbose = verbose
    )
} # nocov end

.same <- function(x, y) {
    if (identical(x, y)) return(x)
    else cli::cli_abort("Different features are used for each dataset.")
}



########################### align centroid #############################

#' `r lifecycle::badge("experimental")` Align factor loading by centroid alignment (beta)
#' @description
#' This process treats the factor loading of each dataset as the low dimensional
#' embedding as well as the cluster assignment probability, i.e. the soft
#' clustering result. Then the method aligns the embedding by linearly moving
#' the centroids of the same cluster but within each dataset towards each other.
#'
#' \bold{ATTENTION: This method is still under development while has shown
#' encouraging results in benchmarking tests. The arguments and their default
#' values reflect the best scored parameters in the tests and some of them may
#' be subject to change in the future.}
#' @details
#' Diagnostic information include:
#'
#' \itemize{
#' \item{object$raw_which.max: The index of the factor with the maximum value
#' in the raw factor loading.}
#' \item{object$R_which.max: The index of the factor with the maximum value in
#' the soft clustering probability matrix used for correction.}
#' \item{object$Z_which.max: The index of the factor with the maximum value in
#' the aligned factor loading.}
#' }
#'
#' @param object A \linkS4class{liger} or Seurat object with valid factorization
#' result available (i.e. \code{\link{runIntegration}} performed in advance).
#' @param lambda Ridge regression penalty applied to each dataset. Can be one
#' number that applies to all datasets, or a numeric vector with length equal to
#' the number of datasets. Default \code{1}.
#' @param useDims Indices of factors to use considered for the alignment.
#' Default \code{NULL} uses all factors.
#' @param scaleEmb Logical, whether to scale the factor loading being considered
#' as the embedding. Default \code{TRUE}.
#' @param centerEmb Logical, whether to center the factor loading being
#' considered as the embedding before scaling it. Default \code{TRUE}.
#' @param scaleCluster Logical, whether to scale the factor loading being
#' considered as the cluster assignment probability. Default \code{FALSE}.
#' @param centerCluster Logical, whether to center the factor loading being
#' considered as the cluster assignment probability before scaling it. Default
#' \code{FALSE}.
#' @param shift Logical, whether to shift the factor loading being considered as
#' the cluster assignment probability after centered scaling. Default
#' \code{FALSE}.
#' @param diagnosis Logical, whether to return cell metadata variables with
#' diagnostic information. See Details. Default \code{FALSE}.
#' @param ... Arguments passed to other S3 methods of this function.
#' @return Returns the updated input object
#' \itemize{
#'  \item{liger method
#'  \itemize{
#'      \item{Update the \code{H.norm} slot for the aligned cell factor
#'      loading, ready for running graph based community detection clustering
#'      or dimensionality reduction for visualization.}
#'      \item{Update the \code{cellMata} slot with diagnostic information if
#'      \code{diagnosis = TRUE}.}
#'  }}
#'  \item{Seurat method
#'  \itemize{
#'      \item{Update the \code{reductions} slot with a new \code{DimReduc}
#'          object containing the aligned cell factor loading.}
#'      \item{Update the metadata with diagnostic information if
#'      \code{diagnosis = TRUE}.}
#'  }}
#' }
#' @examples
#' pbmc <- centroidAlign(pbmcPlot)
#' @export
#' @rdname centroidAlign
centroidAlign <- function(
        object,
        ...
) {
    lifecycle::signal_stage(stage = "experimental", "centroidAlign()")
    UseMethod("centroidAlign", object)
}

#' @export
#' @rdname centroidAlign
#' @method centroidAlign liger
centroidAlign.liger <- function(
        object,
        lambda = 1,
        useDims = NULL,
        scaleEmb = TRUE,
        centerEmb = TRUE,
        scaleCluster = FALSE,
        centerCluster = FALSE,
        shift = FALSE,
        diagnosis = FALSE,
        ...
) {
    .checkObjVersion(object)
    .checkValidFactorResult(object, checkV = FALSE)

    object <- recordCommand(object, ...)

    if (any(modalOf(object) == "meth")) {
        cli::cli_alert_warning(
            "Methylation data is detected while centroid alignment method is not optimized for methylation data yet."
        )
        cli::cli_alert_info("It is recommended to use {.fn quantileNorm} method instead.")
    }

    lambda <- .checkArgLen(lambda, length(object), repN = TRUE, class = "numeric")

    out <- .centroidAlign.Hraw(
        object = Reduce(cbind, getMatrix(object, "H")),
        datasetVar = object$dataset,
        lambda = lambda,
        useDims = useDims,
        diagnosis = diagnosis,
        scaleEmb = scaleEmb, centerEmb = centerEmb,
        scaleCluster = scaleCluster, centerCluster = centerCluster,
        shift = shift,
        ...
    )
    object@H.norm <- out$aligned
    object@uns$alignmentMethod <- "centroidAlign"
    if ("raw_which.max" %in% names(out)) {
        cellMeta(object, "raw_which.max", check = FALSE) <- out$raw_which.max
    }
    if ("Z_which.max" %in% names(out)) {
        cellMeta(object, "Z_which.max", check = FALSE) <- out$Z_which.max
    }
    if ("R_which.max" %in% names(out)) {
        cellMeta(object, "R_which.max", check = FALSE) <- out$R_which.max
    }
    return(object)
}



#' @export
#' @rdname centroidAlign
#' @method centroidAlign Seurat
#' @param reduction Name of the reduction where LIGER integration result is
#' stored. Default \code{"inmf"}.
centroidAlign.Seurat <- function(
        object,
        reduction = "inmf",
        lambda = 1,
        useDims = NULL,
        scaleEmb = TRUE,
        centerEmb = TRUE,
        scaleCluster = FALSE,
        centerCluster = FALSE,
        shift = FALSE,
        diagnosis = FALSE,
        ...
) {
    resName <- paste0(reduction, "Norm")
    reduction <- object[[reduction]]
    if (!inherits(reduction, "DimReduc")) {
        cli::cli_abort("Specified {.var reduction} does not points to a {.cls DimReduc}.")
    }
    # Retrieve some information. Might have better ways instead of using `@`
    ## Due to proper formatting in Seurat object, Hconcat is already cell x k
    ## Transposed to k x N, as in liger
    Hconcat <- t(reduction[[]])
    datasetVar <- reduction@misc$dataset
    assay <- reduction@assay.used
    W <- reduction[]

    lambda <- .checkArgLen(lambda, length(unique(datasetVar)), repN = TRUE, class = "numeric")

    result <- .centroidAlign.Hraw(
        object = Hconcat,
        datasetVar = datasetVar,
        lambda = lambda,
        useDims = useDims,
        scaleEmb = scaleEmb, centerEmb = centerEmb,
        scaleCluster = scaleCluster, centerCluster = centerCluster,
        shift = shift,
        diagnosis = diagnosis
    )

    reddim <- Seurat::CreateDimReducObject(
        embeddings = result$aligned, loadings = W,
        assay = assay, key = paste0(resName, "_")
    )
    object[[resName]] <- reddim
    if ("raw_which.max" %in% names(result)) {
        object[["raw_which.max"]] <- result$raw_which.max
    }
    if ("Z_which.max" %in% names(result)) {
        object[["Z_which.max"]] <- result$Z_which.max
    }
    if ("R_which.max" %in% names(result)) {
        object[["R_which.max"]] <- result$R_which.max
    }
    return(object)
}

# object - raw H matrices concatenated, k factors by N cells
.centroidAlign.Hraw <- function(
        object,
        datasetVar,
        lambda,
        scaleEmb = TRUE,
        centerEmb = TRUE,
        scaleCluster = FALSE,
        centerCluster = FALSE,
        useDims = NULL,
        shift = FALSE,
        diagnosis = FALSE
) {
    # Initiate output list
    out <- list()

    # transposed to N cells by k factors
    object <- t(object)
    useDims <- useDims %||% seq_len(ncol(object))
    object <- object[, useDims]

    if (isTRUE(diagnosis)) {
        raw_vote <- apply(object, 1, which.max)
        raw_vote <- factor(raw_vote)
        names(raw_vote) <- rownames(object)
        out$raw_which.max <- raw_vote
    }

    Z <- safe_scale(object, center = centerEmb, scale = scaleEmb)
    # Z transposed to k factors by N cells
    Z <- t(Z)
    # phi - binary design matrix for dataset belonging, B datasets by N cells
    phi <- Matrix::fac2sparse(datasetVar)

    R <- safe_scale(object, center = centerCluster, scale = scaleCluster)
    # R transposed to k clusters by N cells
    R <- t(R)
    if (isTRUE(shift)) {
        R <- R - min(R)
    }
    if (any(R < 0)) {
        cli::cli_abort(
            c(x = "Negative values found prior to normalizing the cluster assignment probablity distribution",
              i = "Can only do either {.code centerCluster = FALSE} or {.code centerCluster = TRUE, shift = TRUE}.")
        )
    }

    R <- normalize_byCol_dense_rcpp(R)

    Z_corr <- moe_correct_ridge_cpp(
        Z_orig = Z, R = R, lambda = lambda, Phi = phi, B = nrow(phi), N = ncol(phi)
    )
    # Z_corr transposed back to N cells by k factors
    Z_corr <- t(Z_corr)
    dimnames(Z_corr) <- list(rownames(object), paste0("Factor_", seq_len(ncol(Z_corr))))

    out$aligned <- Z_corr

    if (isTRUE(diagnosis)) {
        Z_cluster <- apply(Z_corr, 1, which.max)
        Z_cluster <- factor(Z_cluster)
        names(Z_cluster) <- colnames(Z)
        out$Z_which.max <- Z_cluster
        R_cluster <- apply(R, 2, which.max)
        R_cluster <- factor(R_cluster)
        names(R_cluster) <- colnames(R)
        out$R_which.max <- R_cluster
    }

    return(out)
}

################################## EVALUATION ##################################

#' Calculate agreement metric after integration
#' @description
#' This metric quantifies how much the factorization and alignment distorts the
#' geometry of the original datasets. The greater the agreement, the less
#' distortion of geometry there is. This is calculated by performing
#' dimensionality reduction on the original and integrated (factorized or plus
#' aligned) datasets, and measuring similarity between the k nearest
#' neighbors for each cell in original and integrated datasets. The Jaccard
#' index is used to quantify similarity, and is the final metric averages across
#' all cells.
#'
#' Note that for most datasets, the greater the chosen \code{nNeighbor}, the
#' greater the agreement in general. Although agreement can theoretically
#' approach 1, in practice it is usually no higher than 0.2-0.3.
#' @param object \code{liger} object. Should call \code{\link{alignFactors}}
#' before calling.
#' @param ndims Number of factors to produce in NMF. Default \code{40}.
#' @param nNeighbors Number of nearest neighbors to use in calculating Jaccard
#' index. Default \code{15}.
#' @param useRaw Whether to evaluate just factorized \eqn{H} matrices instead of
#' using aligned \eqn{H.norm} matrix. Default \code{FALSE} uses
#' aligned matrix.
#' @param byDataset Whether to return agreement calculated for each dataset
#' instead of the average for all datasets. Default \code{FALSE}.
#' @param seed Random seed to allow reproducible results. Default \code{1}.
#' @param k,rand.seed,by.dataset `r lifecycle::badge("superseded")` See Usage
#' for replacement.
#' @param use.aligned `r lifecycle::badge("superseded")` Use \code{useRaw}
#' instead.
#' @param dr.method `r lifecycle::badge("defunct")` We no longer support other
#' methods but just NMF.
#' @return A numeric vector of agreement metric. A single value if
#' \code{byDataset = FALSE} or each dataset a value otherwise.
#' @export
#' @examples
#' if (requireNamespace("RcppPlanc", quietly = TRUE)) {
#'     pbmc <- pbmc %>%
#'     normalize %>%
#'     selectGenes %>%
#'     scaleNotCenter %>%
#'     runINMF %>%
#'     alignFactors
#'     calcAgreement(pbmc)
#' }
calcAgreement <- function(
        object,
        ndims = 40,
        nNeighbors = 15,
        useRaw = FALSE,
        byDataset = FALSE,
        seed = 1,
        # Deprecated
        dr.method = NULL,
        k = nNeighbors,
        use.aligned = NULL,
        rand.seed = seed,
        by.dataset = byDataset
) {
    .deprecateArgs(
        list(k = "nNeighbors", by.dataset = "byDataset", rand.seed = "seed"),
        c("dr.method", "use.aligned")
    )
    if (isH5Liger(object)) {
        cli::cli_abort(
            c("x" = "HDF5 based liger object is not supported for now.",
              "i" = "Please create another object with {.field scaleData} loaded into memory and use that as input.",
              "i" = "e.g. {.code memCopy <- subsetLiger(object, useSlot = 'scaleData', newH5 = FALSE)}")
        )
    }

    scaled <- getMatrix(object, "scaleData", returnList = TRUE)
    scaleDataIsNull <- sapply(scaled, is.null)
    if (any(scaleDataIsNull)) {
        cli::cli_abort("No {.field scaleData} available for dataset: {.val {names(scaleDataIsNull)[scaleDataIsNull]}}.")
    }
    if (isTRUE(useRaw)) {
        H <- getMatrix(object, "H", returnList = TRUE)
        HIsNull <- sapply(H, is.null)
        if (any(HIsNull)) {
            cli::cli_abort("No {.field H} available for dataset: {.val {names(HIsNull)[HIsNull]}}.")
        }
        H <- Reduce(cbind, H)
        H <- t(H)
    } else {
        H <- getMatrix(object, "H.norm", returnList = FALSE)
        if (is.null(H)) {
            cli::cli_abort("No {.field H.norm} available.")
        }
    }
    set.seed(seed)
    dr <- lapply(scaled, RcppPlanc::nmf, k = ndims) %>%
        lapply(`[[`, i = "H")
    nCells <- lengths(object)
    jaccard_inds <- c()
    distorts <- c()

    for (i in seq_along(object)) {
        datasetName <- names(object)[i]
        idx <- object$dataset %in% datasetName
        Hsub <- H[idx, , drop = FALSE]
        knn1 <- RANN::nn2(dr[[i]], k = nNeighbors + 1)$nn.idx[,2:(nNeighbors + 1)]
        knn2 <- RANN::nn2(Hsub, k = nNeighbors + 1)$nn.idx[,2:(nNeighbors + 1)]
        jaccard_inds_i <- sapply(seq_len(nCells[i]), function(i) {
            intersect <- intersect(knn1[i, ], knn2[i, ])
            union <- union(knn1[i, ], knn2[i, ])
            length(intersect) / length(union)
        })
        jaccard_inds_i <- jaccard_inds_i[is.finite(jaccard_inds_i)]
        jaccard_inds <- c(jaccard_inds, jaccard_inds_i)
        distorts <- c(distorts, mean(jaccard_inds_i))
    }
    if (isTRUE(byDataset)) {
        return(distorts)
    }
    return(mean(jaccard_inds))
}

#' Calculate alignment metric after integration
#' @description
#' This metric quantifies how well-aligned two or more datasets are. We randomly
#' downsample all datasets to have as many cells as the smallest one. We
#' construct a nearest-neighbor graph and calculate for each cell how many of
#' its neighbors are from the same dataset. We average across all cells and
#' compare to the expected value for perfectly mixed datasets, and scale the
#' value from 0 to 1. Note that in practice, alignment can be greater than 1
#' occasionally.
#' @details
#' \eqn{\bar{x}} is the average number of neighbors belonging to any cells' same
#' dataset, \eqn{N} is the number of datasets, \eqn{k} is the number of
#' neighbors in the KNN graph.
#' \deqn{1 - \frac{\bar{x} - \frac{k}{N}}{k - \frac{k}{N}}}
#'
#' The selection on cells to be measured can be done in various way and
#' represent different scenarios:
#' \enumerate{
#' \item{By default, all cells are considered and the alignment across all
#' datasets will be calculated.}
#' \item{Select \code{clustersUse} from \code{clusterVar} to use cells from the
#' clusters of interests. This measures the alignment across all covered
#' datasets within the specified clusters.}
#' \item{Only Specify \code{cellIdx} for flexible selection. This measures the
#' alignment across all covered datasets within the specified cells. A none-NULL
#' \code{cellIdx} privileges over \code{clustersUse}.}
#' \item{Specify \code{cellIdx} and \code{cellComp} at the same time, so that
#' the original dataset source will be ignored and cells specified by each
#' argument will be regarded as from each a dataset. This measures the alignment
#' between cells specified by the two arguments. \code{cellComp} can contain
#' cells already specified in \code{cellIdx}.}
#' }
#' @param object A \linkS4class{liger} object, with \code{\link{alignFactors}}
#' already run.
#' @param clustersUse The clusters to consider for calculating the alignment.
#' Should be a vector of existing levels in \code{clusterVar}. Default
#' \code{NULL}. See Details.
#' @param clusterVar The name of one variable in \code{cellMeta(object)}.
#' Default \code{NULL} uses default clusters.
#' @param nNeighbors Number of neighbors to use in calculating alignment.
#' Default \code{NULL} uses \code{floor(0.01*ncol(object))}, with a lower bound
#' of 10 in all cases except where the total number of sampled cells is less
#' than 10.
#' @param cellIdx,cellComp Character, logical or numeric index that can
#' subscribe cells. Default \code{NULL}. See Details.
#' @param resultBy Select from \code{"all"}, \code{"dataset"} or \code{"cell"}.
#' On which level should the mean alignment be calculated. Default \code{"all"}.
#' @param seed Random seed to allow reproducible results. Default \code{1}.
#' @param k,rand.seed,cells.use,cells.comp,clusters.use
#' `r lifecycle::badge("superseded")` Please see Usage for replacement.
#' @param by.cell,by.dataset `r lifecycle::badge("superseded")` Use
#' \code{resultBy} instead.
#' @return The alignment metric.
#' @export
#' @examples
#' if (requireNamespace("RcppPlanc", quietly = TRUE)) {
#'     pbmc <- pbmc %>%
#'     normalize %>%
#'     selectGenes %>%
#'     scaleNotCenter %>%
#'     runINMF %>%
#'     alignFactors
#'     calcAlignment(pbmc)
#' }
calcAlignment <- function(
        object,
        clustersUse = NULL,
        clusterVar = NULL,
        nNeighbors = NULL,
        cellIdx = NULL,
        cellComp = NULL,
        resultBy = c("all", "dataset", "cell"),
        seed = 1,
        # Deprecated
        k = nNeighbors,
        rand.seed = seed,
        cells.use = cellIdx,
        cells.comp = cellComp,
        clusters.use = clustersUse,
        by.cell = NULL,
        by.dataset = NULL
) {
    .deprecateArgs(
        list(rand.seed = "seed", cells.use = "cellIdx", cells.comp = "cellComp",
             clusters.use = "clustersUse"),
        c("by.cell", "by.dataset")
    )
    resultBy <- match.arg(resultBy)
    hnorm <- getMatrix(object, 'H.norm')
    if (is.null(hnorm)) {
        cli::cli_abort(
            c("x" = "Aligned cell factor loading {.field H.norm} not available.",
              "i" = "Please run {.fn quantileNorm} first.")
        )
    }
    if (is.null(cellIdx) && is.null(clustersUse)) {
        cellIdx <- seq_len(ncol(object))
        datasetVar <- droplevels(object$dataset)
    } else if (!is.null(cellIdx)) {
        cellIdx <- .idxCheck(object, cellIdx, "cell")
        if (!is.null(cellComp)) {
            cellComp <- .idxCheck(object, cellComp, "cell")
            datasetVar <- factor(rep.int(c("cellIdx", "cellComp"), c(length(cellIdx), length(cellComp))))
            cellIdx <- c(cellIdx, cellComp)
            cli::cli_alert_info("Using designated sets {.var cellIdx} and {.var cellComp} as subsets to compare.")
        } else {
            datasetVar <- droplevels(object$dataset[cellIdx])
        }
    } else {
        clusterVar <- clusterVar %||% object@uns$defaultCluster
        if (is.null(clusterVar)) {
            cli::cli_abort("No {.field clusterVar} specified or default preset by {.fn runCluster}.")
        }
        clusters <- .fetchCellMetaVar(object, clusterVar, checkCategorical = TRUE)
        notFound <- clustersUse[!clustersUse %in% clusters]
        if (length(notFound) > 0) {
            cli::cli_abort(
                c("x" = "{length(notFound)} cluster{?s} not found in {.val {clusterVar}}: {.val {notFound}}.",
                  "i" = "Available clusters: {.val {levels(droplevels(clusters))}}")
            )
        }
        cellIdx <- which(clusters %in% clustersUse)
        datasetVar <- droplevels(object$dataset[cellIdx])
    }
    if (length(cellIdx) == 0) cli::cli_abort("No cell is selected.")
    if (nlevels(datasetVar) == 1) {
        cli::cli_alert_warning("Alignment null for single dataset.")
    }
    set.seed(seed)
    minCells <- min(table(datasetVar))
    sampledCells <- lapply(split(cellIdx, datasetVar), sample, size = minCells)
    sampledCells <- unlist(sampledCells)
    nSampled <- nlevels(datasetVar)*minCells
    maxNNeighbors <- nSampled - 1
    if (is.null(nNeighbors)) {
        nNeighbors <- min(max(floor(0.01 * length(cellIdx)), 10), maxNNeighbors)
    } else if (nNeighbors > maxNNeighbors) {
        cli::cli_abort("Please select {.var nNeighbors} <= {maxNNeighbors}.")
    }
    # RANN::nn2 always consider the query itself as the nearest nearest neighbor
    # So we have to find one more neighbor than we need and remove the first one
    knn <- RANN::nn2(
        hnorm[sampledCells, , drop = FALSE],
        k = nNeighbors + 1
    )
    datasetVar <- datasetVar[sampledCells]
    knnIdx <- knn$nn.idx[, -1]
    kOverN <- nNeighbors/nlevels(datasetVar)
    nSameDataset <- sapply(seq_len(nSampled), function(i) {
        sum(datasetVar[knnIdx[i, ]] == datasetVar[i])
    })
    alignmentPerCell <- sapply(seq_len(nSampled), function(i) {
        1 - (nSameDataset[i] - kOverN)/(nNeighbors - kOverN)
    })

    if (resultBy == "all") {
        alignment <- mean(alignmentPerCell)
    } else if (resultBy == "dataset") {
        sampledDatasetVar <- rep(levels(datasetVar), each = minCells)
        alignment <- sapply(
            split(nSameDataset, sampledDatasetVar),
            function(x) 1 - (mean(x) - kOverN)/(nNeighbors - kOverN)
        )
    } else {
        alignment <- stats::setNames(
            alignmentPerCell,
            colnames(object)[unlist(sampledCells)]
        )
    }
    return(alignment)
}

Try the rliger package in your browser

Any scripts or data that you put into this service are public.

rliger documentation built on Oct. 30, 2024, 1:07 a.m.