#' Convex Analysis of Mixtures Version 3
#'
#' This function performs a fully unsupervised computational deconvolution
#' to identify marker genes that define each of the multiple
#' subpopulations, and estimate the proportions of these subpopulations in
#' the mixture tissues as well as their respective expression profiles.
#' @param data Matrix of mixture expression profiles.
#' Data frame, SummarizedExperiment or ExpressionSet object will be
#' internally coerced into a matrix.
#' Each row is a gene and each column is a sample.
#' Data should be in non-log linear space with non-negative numerical values
#' (i.e. >= 0). Missing values are not supported.
#' All-zero rows will be removed internally.
#' @param K The candidate subpopulation number(s), e.g. K = 2:8.
#' @param dim.rdc Reduced data dimension;
#' should be not less than maximum candidate K.
#' @param thres.low The lower bound of percentage of genes to keep for CAM
#' with ranked norm. The value should be between 0 and 1.
#' The default is 0.05.
#' @param thres.high The higher bound of percentage of genes to keep for CAM
#' with ranked norm. The value should be between 0 and 1.
#' The default is 1.
#' @param cluster.method The method to do clustering.
#' The default "Fixed-Radius" will make all the clusters with the same size.
#' The alternative "K-Means" will use \code{\link{kmeans}}.
#' The selected method is forwarded to \code{\link{CAM3Prep}}.
#' @param radius.thres The "cosine" radius of "Fixed-Radius" clustering. The
#' default is 0.95.
#' @param sim.thres The cosine similarity threshold of cluster centers. For
#' clusters with cosine similarity higher than the threshold, they would be
#' merged until the number of clusters equals cluster.num. This parameter
#' could control the upper bound of similarity among sources. The default
#' is 0.95.
#' @param cluster.num The lower bound of cluster number, which should be much
#' larger than K. The default is 50.
#' @param MG.num.thres The clusters with the gene number smaller than
#' MG.num.thres will be treated as outliers.
#' The default is 20.
#' @param sample.weight Vector of sample weights. If NULL, all samples have
#' the same weights. The length should be the same as sample numbers.
#' All values should be positive.
#' @param fast.mode Use fast mode of greedy search or not. The normal mode may
#' give more accurate results, but computation time is much longer. The
#' default is TRUE.
#' @param generalNMF If TRUE, the decomposed proportion matrix has no sum-to-one
#' constraint for each row. The default is FALSE.
#' TRUE value brings two changes: (1) Without assuming samples are
#' normalized, the first principal component will not be forced to be along
#' c(1,1,..,1) but a standard PCA will be applied during preprocessing.
#' (2) Without sum-to-one constraint for each row, the scale ambiguity of
#' each column vector in proportion matrix will not be removed.
#' @details This function includes three necessary steps to decompose a matrix
#' of mixture expression profiles: data preprocessing, marker gene cluster
#' search, and matrix decomposition. They are implemented in
#' \code{\link{CAM3Prep}}, \code{\link{CAM3MGCluster}} and
#' \code{\link{CAM3ASest}}, separately.
#' More details can be found in the help document of each function.
#'
#' For this function, you need to specify the range of possible
#' subpopulation numbers and the percentage of low/high-expressed genes to
#' be removed. Typically, 30\% ~ 50\% low-expressed genes can be removed from
#' gene expression data. The removal of high-expressed genes has much less
#' impact on results, and usually set to be 0\% ~ 10\%.
#'
#' This function can also analyze other molecular expression data, such as
#' proteomics data. Far fewer low-expressed proteins need to be removed,
#' e.g. 0\% ~ 10\%, due to a limited number of proteins without missing values.
#'
#' For each K, \code{\link{CAM3ASest}} is run twice (with its fourth argument
#' set to 1 and to 2) and the result with the smaller mdl value is kept.
#' @return An object of class "\code{\link[debCAM]{CAMObj}}" containing the
#' following components:
#' \item{PrepResult}{An object of class "\code{\link[debCAM]{CAMPrepObj}}"
#' containing data preprocessing results from \code{\link[debCAM]{CAMPrep}}
#' function.}
#' \item{MGResult}{A list of "\code{\link[debCAM]{CAMMGObj}}" objects containing
#' marker gene detection results from \code{\link[debCAM]{CAMMGCluster}}
#' function for each K value.}
#' \item{ASestResult}{A list of "\code{\link[debCAM]{CAMASObj}}" objects
#' containing estimated proportions, subpopulation-specific expressions and mdl
#' values from \code{\link[debCAM]{CAMASest}} function for each K value.}
#' @export
#' @examples
#' #obtain data
#' data(ratMix3)
#' data <- ratMix3$X
#'
#' #CAM3 with known subpopulation number
#' rCAM3 <- CAM3Run(data, K = 3, dim.rdc = 3, thres.low = 0.30,
#'                  thres.high = 0.95)
#' #Larger dim.rdc can improve performance but increase time complexity
#'
#' \dontrun{
#' #CAM with a range of subpopulation number
#' rCAM3 <- CAM3Run(data, K = 2:5, dim.rdc = 10, thres.low = 0.30,
#'                  thres.high = 0.95)
#' }
CAM3Run <- function(data, K = NULL, dim.rdc = 10, thres.low = 0.05,
                    thres.high = 1,
                    cluster.method = c("Fixed-Radius", "K-Means"),
                    radius.thres = 0.95, sim.thres = 0.95,
                    cluster.num = 50, MG.num.thres = 20,
                    sample.weight = NULL, fast.mode = TRUE,
                    generalNMF = FALSE) {
    message('### CAM3 starting...\n')

    ## ---- Argument validation ------------------------------------------
    if (is.null(K)) {
        stop("K is missing")
    }
    if (!is.numeric(K)) {
        stop("K is not numeric")
    }
    # Resolve the clustering choice to a single validated string so that the
    # caller's selection (rather than the full default vector) is what gets
    # forwarded to CAM3Prep below.
    cluster.method <- match.arg(cluster.method)

    ## ---- Coerce supported containers to a plain matrix ----------------
    if (is(data, "data.frame")) {
        data <- as.matrix(data)
    } else if (is(data, "SummarizedExperiment")) {
        data <- SummarizedExperiment::assay(data)
    } else if (is(data, "ExpressionSet")) {
        data <- Biobase::exprs(data)
    } else if (!is.matrix(data)) {
        stop(paste0(
            "Only matrix, data frame, SummarizedExperiment and ",
            "ExpressionSet\nobject are supported for expression data!"
        ))
    }

    ## ---- Content checks -----------------------------------------------
    if (anyNA(data)) {
        stop("Data with missing values are not supported!")
    }
    if (any(data < 0)) {
        stop("Only non-negative data are supported!")
    }
    # Give every gene an identifier before rows are filtered, so the names
    # keep referring to positions in the input.
    if (is.null(rownames(data))) {
        rownames(data) <- seq_len(nrow(data))
    }
    if (dim.rdc < max(K)) {
        warning("dim.rdc is less than max(K)!")
    }
    # Drop all-zero genes; drop = FALSE keeps a matrix even when only a
    # single row survives the filter.
    data <- data[rowSums(data) > 0, , drop = FALSE]

    ################ Preprocessing ########################################
    message('## Preprocessing...\n')
    PrepResult <- CAM3Prep(data, dim.rdc, thres.low, thres.high,
                           cluster.method = cluster.method,
                           radius.thres, sim.thres, cluster.num,
                           MG.num.thres, sample.weight)

    ################ Marker Gene Selection ################################
    message('## Marker Gene Selection...\n')
    MGResult <- CAM3MGCluster(PrepResult, fast.mode)
    # Keep only the entries at the requested K positions and label them.
    MGResult <- MGResult[K]
    names(MGResult) <- as.character(K)

    ################ A and S Matrix Estimation ############################
    message('## A and S Matrix Estimation...\n')
    # Run the estimation under both settings of CAM3ASest's fourth argument.
    ASestResultF <- lapply(MGResult, CAM3ASest, PrepResult, data,
                           1, generalNMF)
    ASestResultB <- lapply(MGResult, CAM3ASest, PrepResult, data,
                           2, generalNMF)
    # Per K, keep the run with the strictly smaller mdl; ties go to the
    # second run.
    ASestResult <- Map(
        function(resF, resB) {
            if (resF@mdl < resB@mdl) resF else resB
        },
        ASestResultF, ASestResultB
    )
    names(ASestResult) <- as.character(K)

    new("CAMObj", PrepResult = PrepResult, MGResult = MGResult,
        ASestResult = ASestResult)
}
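# NOTE: the two lines below are website boilerplate captured with this file;
# they are not part of the R source and are kept only as comments.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.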