###############################################################################
## Functions to cluster samples in the low-dimensional factor representation ##
###############################################################################
#' @title clusterSamples: K-means clustering on samples based on latent factors
#' @name clusterSamples
#' @description MOFA factors are continuous in nature
#' but they can be used to predict discrete clusters of samples,
#' similar to the iCluster model (Shen, 2009). \cr
#' The clustering can be performed in a single factor, which is equivalent to setting a manual threshold;
#' or using multiple factors, where multiple sources of variation are aggregated. \cr
#' Importantly, this type of clustering is not weighted and
#' does not take into account the different importance of the latent factors.
#' @param object a trained \code{\link{MOFAmodel}} object.
#' @param k number of clusters, passed as \code{centers} to \code{\link{kmeans}}.
#' @param factors character vector with the factor name(s),
#' or numeric vector with the index of the factor(s) to use.
#' Default is \code{"all"}, which uses every factor of the model.
#' @param ... extra arguments passed to \code{\link{kmeans}}
#' @details In some cases, samples can have missing values in the factor space.
#' This occurs when a factor is active in a single view and some samples are missing this data. \cr
#' In such a case, there are several strategies to follow: \cr
#' \itemize{
#' \item Use clustering approaches that deal with NAs (not implemented in MOFA)
#' \item If the factor in question is not important, you can remove it with \code{\link{subsetFactors}}
#' \item If the factor in question is important and just a small number of samples are conflictive,
#' you can manually set them to 0 using \code{object@Expectations$Z[is.na(object@Expectations$Z)] <- 0}
#' }
#' By default, the conflictive samples are ignored in the clustering procedure
#' (with a warning) and NAs are returned for them.
#'
#' @return a vector of cluster assignments (the \code{cluster} component of the
#' \code{\link{kmeans}} output), named by sample and with one entry per sample
#' of the model. Samples with a missing value on at least one of the selected
#' factors are assigned \code{NA}.
#' @importFrom stats kmeans
#' @export
#' @examples
#' # Example on the CLL data
#' filepath <- system.file("extdata", "CLL_model.hdf5", package = "MOFAdata")
#' MOFA_CLL <- loadModel(filepath)
#' # cluster samples into 3 groups based on all factors
#' clusterSamples(MOFA_CLL, k=3, factors="all")
#' # cluster samples into 2 groups based on factor 1
#' clusters <- clusterSamples(MOFA_CLL, k=2, factors=1)
#' # clusters can be visualized for example on the factor values:
#' plotFactorBeeswarm(MOFA_CLL, factor=1, color_by=clusters)
#'
#' # Example on the scMT data
#' filepath <- system.file("extdata", "scMT_model.hdf5", package = "MOFAdata")
#' MOFA_scMT <- loadModel(filepath)
#' # cluster samples into 2 groups based on factors 1 and 2
#' clusters <- clusterSamples(MOFA_scMT, k=2, factors=1:2)
#' # clusters can be visualized for example on the factor values:
#' plotFactorScatter(MOFA_scMT, factors=1:2, color_by=clusters)
clusterSamples <- function(object, k, factors = "all", ...) {

  # Sanity checks
  if (!is(object, "MOFAmodel")) {
    stop("'object' has to be an instance of MOFAmodel")
  }

  # Resolve 'factors' to a character vector of factor names
  all_factors <- factorNames(object)
  if (length(factors) == 1L && identical(as.character(factors), "all")) {
    factors <- all_factors
  } else if (is.numeric(factors)) {
    factors <- all_factors[factors]
    # Out-of-range indices yield NA names; fail here with a clear message
    # instead of passing NA on to getFactors()
    if (anyNA(factors)) {
      stop("'factors' contains indices that do not match any factor of the model")
    }
  } else {
    stopifnot(all(factors %in% all_factors))
  }

  # Collect relevant data
  Z <- getFactors(object, factors = factors)

  # For now remove samples with missing values on factors
  # (TO-DO) incorporate a clustering function that is able to cope with missing values
  haveAllZ <- rowSums(is.na(Z)) == 0
  n_removed <- sum(!haveAllZ)
  if (n_removed > 0) {
    warning(paste("Removing", n_removed, "samples with missing values on at least one factor"))
  }
  if (!any(haveAllZ)) {
    stop("All samples have missing values on at least one of the selected factors")
  }

  # drop = FALSE keeps Z_sub a samples-by-factors matrix even when a single
  # sample remains, so kmeans never mistakes factor values for observations
  Z_sub <- Z[haveAllZ, , drop = FALSE]

  # Perform k-means clustering
  kmeans.out <- kmeans(Z_sub, centers = k, ...)

  # One entry per sample of the model; removed samples stay NA
  samples <- sampleNames(object)
  clusters <- rep(NA_integer_, length(samples))
  names(clusters) <- samples
  clusters[haveAllZ] <- kmeans.out$cluster

  clusters
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.