CAM3: Convex Analysis of Mixtures Version 3

Documented in CAM3Run

#' Convex Analysis of Mixtures Version 3
#'
#' This function performs a fully unsupervised computational deconvolution
#'     to identify marker genes that define each of the multiple
#'     subpopulations, and estimate the proportions of these subpopulations in
#'     the mixture tissues as well as their respective expression profiles.
#' @param data Matrix of mixture expression profiles.
#'     Data frame, SummarizedExperiment or ExpressionSet object will be
#'     internally coerced into a matrix.
#'     Each row is a gene and each column is a sample.
#'     Data should be in non-log linear space with non-negative numerical values
#'     (i.e. >= 0). Missing values are not supported.
#'     All-zero rows will be removed internally.
#' @param K The candidate subpopulation number(s), e.g. K = 2:8.
#' @param dim.rdc Reduced data dimension;
#'     should be not less than maximum candidate K.
#' @param thres.low The lower bound of percentage of genes to keep for CAM
#'     with ranked norm. The value should be between 0 and 1.
#'     The default is 0.05.
#' @param thres.high The higher bound of percentage of genes to keep for CAM
#'     with ranked norm. The value should be between 0 and 1.
#'     The default is 1.
#' @param cluster.method The method to do clustering.
#'     The default "Fixed-Radius" will make all the clusters with the same size.
#'     The alternative "K-Means" will use \code{\link{kmeans}}.
#' @param radius.thres The "cosine" radius of "Fixed-Radius" clustering. The
#'     default is 0.95.
#' @param sim.thres The cosine similarity threshold of cluster centers. For
#'     clusters with cosine similarity higher than the threshold, they would be
#'     merged until the number of clusters equals cluster.num. This parameter
#'     could control the upper bound of similarity among sources. The default
#'     is 0.95.
#' @param cluster.num The lower bound of cluster number, which should be much
#'     larger than K. The default is 50.
#' @param MG.num.thres The clusters with the gene number smaller than
#'     MG.num.thres will be treated as outliers.
#'     The default is 20.
#' @param sample.weight Vector of sample weights. If NULL, all samples have
#'     the same weights. The length should be the same as sample numbers.
#'     All values should be positive.
#' @param fast.mode Use fast mode of greedy search or not. The normal mode may
#'     give more accurate results, but computation time is much longer. The
#'     default is TRUE.
#' @param generalNMF If TRUE, the decomposed proportion matrix has no sum-to-one
#'     constraint for each row. The default is FALSE.
#'     TRUE value brings two changes: (1) Without assuming samples are
#'     normalized, the first principal component will not be forced to be along
#'     c(1,1,..,1) but a standard PCA will be applied during preprocessing.
#'     (2) Without sum-to-one constraint for each row, the scale ambiguity of
#'     each column vector in proportion matrix will not be removed.
#' @details This function includes three necessary steps to decompose a matrix
#' of mixture expression profiles: data preprocessing, marker gene cluster
#' search, and matrix decomposition. They are implemented in
#' \code{\link{CAM3Prep}}, \code{\link{CAM3MGCluster}} and
#' \code{\link{CAM3ASest}}, separately.
#' More details can be found in the help document of each function.
#'
#' For this function, you need to specify the range of possible
#' subpopulation numbers and the percentage of low/high-expressed genes to
#' be removed. Typically, 30\% ~ 50\% low-expressed genes can be removed from
#' gene expression data. The removal of high-expressed genes has much less
#' impact on results, and usually set to be 0\% ~ 10\%.
#'
#' This function can also analyze other molecular expression data, such as
#' proteomics data. Far fewer low-expressed proteins need to be removed,
#' e.g. 0\% ~ 10\%, due to a limited number of proteins without missing values.
#'
#' @return An object of class "\code{\link[debCAM]{CAMObj}}" containing the
#' following components:
#' \item{PrepResult}{An object of class "\code{\link[debCAM]{CAMPrepObj}}"
#' containing data preprocessing results from \code{\link[debCAM]{CAMPrep}}
#' function.}
#' \item{MGResult}{A list of "\code{\link[debCAM]{CAMMGObj}}" objects containing
#' marker gene detection results from \code{\link[debCAM]{CAMMGCluster}}
#' function for each K value.}
#' \item{ASestResult}{A list of "\code{\link[debCAM]{CAMASObj}}" objects
#' containing estimated proportions, subpopulation-specific expressions and mdl
#' values from \code{\link[debCAM]{CAMASest}} function for each K value.}
#' @export
#' @examples
#' #obtain data
#' data(ratMix3)
#' data <- ratMix3$X
#'
#' #CAM3 with known subpopulation number
#' rCAM3 <- CAM3Run(data, K = 3, dim.rdc = 3, thres.low = 0.30, thres.high = 0.95)
#' #Larger dim.rdc can improve performance but increase time complexity
#'
#' \dontrun{
#' #CAM with a range of subpopulation number
#' rCAM3 <- CAM3Run(data, K = 2:5, dim.rdc = 10, thres.low = 0.30,
#' thres.high = 0.95)
#' }
CAM3Run <- function(data, K=NULL, dim.rdc=10, thres.low=0.05, thres.high=1,
                cluster.method=c("Fixed-Radius" ,"K-Means"), radius.thres=0.95,
                sim.thres = 0.95, cluster.num = 50, MG.num.thres = 20,
                sample.weight = NULL, fast.mode = TRUE, generalNMF = FALSE){
    message('### CAM3 starting...\n')

    ################ Argument validation ######################################
    if (is.null(K)) {
        stop("K is missing")
    }
    if (!is.numeric(K)) {
        stop("K is not numeric")
    }
    # Resolve the clustering choice to a single string so that the caller's
    # selection (not the full vector of candidates) reaches CAM3Prep below.
    cluster.method <- match.arg(cluster.method)

    # Coerce every supported container to a plain matrix.
    if (is(data, "data.frame")) {
        data <- as.matrix(data)
    } else if (is(data, "SummarizedExperiment")) {
        data <- SummarizedExperiment::assay(data)
    } else if (is(data, "ExpressionSet")) {
        data <- Biobase::exprs(data)
    } else if (!is.matrix(data)) {
        stop("Only matrix, data frame, SummarizedExperiment and ExpressionSet
            object are supported for expression data!")
    }
    if (anyNA(data)) {
        stop("Data with missing values are not supported!")
    }
    if (any(data < 0)) {
        stop("Only non-negative data are supported!")
    }
    # Name the rows before filtering so the names keep referring to the row
    # positions of the input as supplied by the caller.
    if (is.null(rownames(data))) {
        rownames(data) <- seq_len(nrow(data))
    }
    if (dim.rdc < max(K)) {
        warning("dim.rdc is less than max(K)!")
    }

    # Remove all-zero rows; drop = FALSE keeps the result a matrix even when
    # a single row survives or the input has a single sample column.
    data <- data[rowSums(data) > 0, , drop = FALSE]

    ################ Preprocessing ############################################
    message('## Preprocessing...\n')

    PrepResult <- CAM3Prep(data, dim.rdc, thres.low, thres.high,
                           cluster.method = cluster.method,
                           radius.thres, sim.thres, cluster.num,
                           MG.num.thres, sample.weight)


    ################ Marker Gene Selection ####################################
    message('## Marker Gene Selection...\n')

    MGResult <- CAM3MGCluster(PrepResult, fast.mode)
    # Keep only the entries at positions K and label them by K.
    # NOTE(review): this assumes the list returned by CAM3MGCluster is indexed
    # by subpopulation number -- confirm against CAM3MGCluster.
    MGResult <- MGResult[K]
    names(MGResult) <- as.character(K)


    ################ A and S Matrix Estimation ################################
    message('## A and S Matrix Estimation...\n')

    # CAM3ASest is run twice per K, differing only in the argument that
    # follows `data` (1 vs 2). NOTE(review): presumably two alternative
    # estimation variants -- confirm the meaning against CAM3ASest.
    ASestResultF <- lapply(MGResult, CAM3ASest, PrepResult, data,
                            1, generalNMF)
    ASestResultB <- lapply(MGResult, CAM3ASest, PrepResult, data,
                            2, generalNMF)

    # For each K keep the run with the strictly smaller mdl slot; on a tie the
    # second run (B) is kept.
    ASestResult <- vector("list", length(K))
    for (k in seq_along(K)) {
        if (ASestResultF[[k]]@mdl < ASestResultB[[k]]@mdl) {
            ASestResult[[k]] <- ASestResultF[[k]]
        } else {
            ASestResult[[k]] <- ASestResultB[[k]]
        }
    }
    names(ASestResult) <- as.character(K)


    return(new("CAMObj",PrepResult=PrepResult, MGResult=MGResult,
                ASestResult=ASestResult))
}