#' @title Perform clustering analysis from a musica result object
#' @description Proportional sample exposures will be used as input to perform clustering.
#' @param result A \code{\linkS4class{musica_result}} object generated by
#' a mutational discovery or prediction tool.
#' @param nclust Pre-defined number of clusters.
#' @param proportional Logical, indicating if proportional exposure (default) will be used for clustering.
#' @param method Clustering algorithms. Options are "kmeans" (K-means), "hkmeans" (hybrid of hierarchical
#' K-means), "hclust" (hierarchical clustering), "pam" (PAM), and "clara" (Clara).
#' @param dis.method Methods to calculate dissimilarity matrix. Options are "euclidean" (default), "manhattan",
#' "jaccard", "cosine", and "canberra".
#' @param hc.method Methods to perform hierarchical clustering. Options are "ward.D" (default), "ward.D2",
#' "single", "complete", "average", "mcquitty", "median", and "centroid".
#' @param clara.samples Number of samples to be drawn from dataset. Only used when "clara" is selected.
#' Default is 5.
#' @param iter.max Maximum number of iterations for k-means clustering.
#' @param tol Tolerance level for kmeans clustering level iterations
#' @return A one-column data frame with sample IDs as row names and cluster number for each sample.
#' @seealso \link[stats]{kmeans}
#' @examples
#' set.seed(123)
#' data(res_annot)
#' clust_out <- cluster_exposure(res_annot, nclust = 2)
#' @export
cluster_exposure <- function(result, nclust, proportional = TRUE, method = "kmeans", dis.method = "euclidean",
hc.method = "ward.D", clara.samples = 5, iter.max = 10, tol = 1e-15){
method <- match.arg(method, c("kmeans", "hkmeans", "hclust",
"pam", "clara"))
dis.method <- match.arg(dis.method, c("euclidean", "manhattan",
"jaccard", "cosine", "canberra"))
hc.method <- match.arg(hc.method, c("ward.D", "ward.D2", "single", "complete",
"average", "mcquitty", "median", "centroid"))
#read exposure data
expos <- exposures(result = result)
if (isTRUE(proportional)){
expos <- t(sweep(expos, 2, colSums(expos), FUN = "/"))
} else{
expos <- t(expos)
}
#Calculate dissimilarity matrix
diss <- philentropy::distance(x = expos, method = dis.method, use.row.names = TRUE, as.dist.obj = TRUE)
#Perform clustering
if(method == "kmeans"){
res <- cluster::fanny(x = diss, k = nclust, diss = TRUE, maxit = iter.max,
tol = tol)
clust_out <- data.frame(cluster = res$clustering)
} else if(method == "hkmeans"){
if(!dis.method %in% c("euclidean", "manhattan", "canberra")){
stop("For hkmeans clustering, please choose the following methods to calculate dissimilarity matrix:
euclidean, manhattan, canberra")
}
res <- factoextra::hkmeans(x = expos, k = nclust, hc.metric = dis.method,
hc.method = hc.method, iter.max = iter.max)
clust_out <- data.frame(cluster = res$cluster)
} else if(method == "hclust"){
res <- factoextra::hcut(x = diss, k = nclust, isdiss = FALSE, hc_method = hc.method)
clust_out <-data.frame(cluster = res$cluster)
} else if(method == "pam"){
clust_out <- data.frame(cluster = cluster::pam(x = diss, k = nclust,
diss = TRUE,
cluster.only = TRUE))
} else{
if(!dis.method %in% c("euclidean", "manhattan", "jaccard")){
stop("For clara clustering, please choose the following methods to calculate dissimilarity matrix:
euclidean, manhattan, jaccard")
}
res <- cluster::clara(x = expos, k = nclust, metric = dis.method, samples = clara.samples)
clust_out <- data.frame(cluster = res$clustering)
}
return(clust_out)
}
#' @title Visualize clustering results
#' @description The clustering results can be visualized on a UMAP panel.
#' Three different types of plots can be generated using this function: cluster-by-signature
#' plot, cluster-by-annotation plot, and a single UMAP plot.
#' @param result A \code{\linkS4class{musica_result}} object generated by
#' a mutational discovery or prediction tool. A two-dimensional UMAP has to
#' be stored in this object.
#' @param clusters The result generated from cluster_exposure function.
#' @param group A single character string indicating the grouping factor.
#' Possible options are: "signature" (columns are signatures in a grid),
#' "annotation" (columns are sample annotation), and "none" (a single UMAP plot).
#' Default is "signature".
#' @param annotation Column name of annotation.
#' @param plotly If TRUE, the plot will be made interactive using plotly.
#' @return Generate a ggplot or plotly object.
#' @seealso \link{create_umap}
#' @examples
#' set.seed(123)
#' data(res_annot)
#' #Get clustering result
#' clust_out <- cluster_exposure(result = res_annot, nclust = 2)
#' #UMAP
#' create_umap(result = res_annot)
#' #generate cluster X signature plot
#' plot_cluster(result = res_annot, clusters = clust_out, group = "signature")
#' #generate cluster X annotation plot
#' plot_cluster(result = res_annot, clusters = clust_out, group = "annotation",
#' annotation = "Tumor_Subtypes")
#' #generate a single UMAP plot
#' plot_cluster(result = res_annot, clusters = clust_out, group = "none")
#' @export
plot_cluster <- function(result, clusters, group = "signature", annotation = NULL, plotly = TRUE){
group <- match.arg(group, c("signature", "annotation", "none"))
if(length(umap(result)) == 0){
stop("UMAP not found in musica_result object. Run create_umap(", deparse(substitute(result)),") first.")
}
k_toplot <- cbind(result@umap, clusters)
k_toplot$cluster <- factor(k_toplot$cluster)
if(group == "signature"){
expos <- exposures(result = result)
expos <- t(sweep(expos, 2, colSums(expos), FUN = "/"))
clust_by_sigs <- cbind(k_toplot, expos) %>%
tibble::rownames_to_column(var = "sample") %>%
tidyr::pivot_longer(cols = colnames(expos),
names_to = "signature",
values_to = "exposure",
names_repair = "minimal")
p <- ggplot2::ggplot(clust_by_sigs, aes_string(x = "UMAP_1", y = "UMAP_2", colour = "exposure")) +
geom_point() +
facet_grid(cluster ~ signature) +
ggplot2::scale_colour_gradientn(colors = c("blue", "green", "yellow", "orange", "red"), name = "Fraction") +
ggplot2::theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank())
}
else if(group == "annotation"){
if(is.null(annotation) || !annotation %in% colnames(samp_annot(result))){
stop("Sample annotation not found or invalid annotation column name.")
}
else{
annot <- samp_annot(result) %>% tibble::column_to_rownames(var = "Samples")
colnames(annot) <- "annotation"
annot[["annotation"]] <- factor(annot[["annotation"]])
clust_by_annot <- cbind(k_toplot, annot)
p <- ggplot2::ggplot(clust_by_annot, aes_string(x = "UMAP_1", y = "UMAP_2", colour = "cluster")) +
geom_point() +
facet_grid(cluster ~ annotation) +
ggplot2::theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank())
}
}
else{
p <- ggplot2::ggplot(k_toplot, aes_string(x = "UMAP_1", y = "UMAP_2", colour = "cluster")) +
geom_point() +
ggplot2::theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank())
}
if(plotly){
p <- plotly::ggplotly(p)
return(p)
}
else{
return(p)
}
}
#' @title Plots for helping decide number of clusters
#' @description To help decide the number of cluster, three different methods
#' are provided: total within cluster sum of squares, average silhouette coefficient,
#' and gap statistics.
#' @param result A \code{\linkS4class{musica_result}} object generated by
#' a mutational discovery or prediction tool.
#' @param method A single character string indicating which statistic to use for plot.
#' Options are "wss" (total within cluster sum of squares), "silhouette" (average silhouette
#' coefficient), and "gap_stat" (gap statistic). Default is "wss".
#' @param clust.method A character string indicating clustering method. Options are "kmeans"
#' (default), "hclust" (hierarchical clustering), "hkmeans", "pam", and "clara".
#' @param n An integer indicating maximum number of clusters to test. Default is 10.
#' @param proportional Logical, indicating if proportional exposure (default) will be used for clustering.
#' @return A ggplot object.
#' @seealso \link[factoextra]{fviz_nbclust}
#' @examples
#' data(res_annot)
#' set.seed(123)
#' #Make an elbow plot
#' k_select(res_annot, method = "wss", n = 6)
#' #Plot average silhouette coefficient against number of clusters
#' k_select(res_annot, method = "silhouette", n = 6)
#' #Plot gap statistics against number of clusters
#' k_select(res_annot, method = "gap_stat", n = 6)
#' @export
k_select <- function(result, method = "wss", clust.method = "kmeans", n = 10, proportional = TRUE){
method <- match.arg(method, c("wss", "silhouette", "gap_stat"))
clust.method <- match.arg(clust.method, c("kmeans", "hclust", "hkmeans", "pam", "clara"))
expos <- exposures(result = result)
if(isTRUE(proportional)){
expos <- t(sweep(expos, 2, colSums(expos), FUN = "/"))
}
else{
expos <- t(expos)
}
if(clust.method == "kmeans"){
factoextra::fviz_nbclust(expos, stats::kmeans, method = method, k.max = n)
}
else if(clust.method == "hclust"){
factoextra::fviz_nbclust(expos, factoextra::hcut, method = method, k.max = n)
}
else if(clust.method == "hkmeans"){
factoextra::fviz_nbclust(expos, factoextra::hkmeans, method = method, k.max = n)
}
else if(clust.method == "pam"){
factoextra::fviz_nbclust(expos, cluster::pam, method = method, k.max = n)
}
else{
factoextra::fviz_nbclust(expos, cluster::clara, method = method, k.max = n)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.