R/kmeans.R

Defines functions k_select plot_cluster cluster_exposure

Documented in cluster_exposure k_select plot_cluster

#' @title Perform clustering analysis from a musica result object
#' @description Proportional sample exposures will be used as input to perform clustering.
#' @param result A \code{\linkS4class{musica_result}} object generated by
#' a mutational discovery or prediction tool.
#' @param nclust Pre-defined number of clusters.
#' @param proportional Logical, indicating if proportional exposure (default) will be used for clustering.
#' @param method Clustering algorithms. Options are "kmeans" (K-means), "hkmeans" (hybrid of hierarchical 
#' K-means), "hclust" (hierarchical clustering), "pam" (PAM), and "clara" (Clara).
#' @param dis.method Methods to calculate dissimilarity matrix. Options are "euclidean" (default), "manhattan", 
#' "jaccard", "cosine", and "canberra".
#' @param hc.method Methods to perform hierarchical clustering. Options are "ward.D" (default), "ward.D2", 
#' "single", "complete", "average", "mcquitty", "median", and "centroid".
#' @param clara.samples Number of samples to be drawn from dataset. Only used when "clara" is selected. 
#' Default is 5.
#' @param iter.max Maximum number of iterations for k-means clustering.
#' @param tol Tolerance level for kmeans clustering level iterations
#' @return A one-column data frame with sample IDs as row names and cluster number for each sample.
#' @seealso \link[stats]{kmeans}
#' @examples 
#' set.seed(123)
#' data(res_annot)
#' clust_out <- cluster_exposure(res_annot, nclust = 2)
#' @export
cluster_exposure <- function(result, nclust, proportional = TRUE, method = "kmeans", dis.method = "euclidean", 
                             hc.method = "ward.D", clara.samples = 5, iter.max = 10, tol = 1e-15){
  method <- match.arg(method, c("kmeans", "hkmeans", "hclust", 
                                "pam", "clara"))
  dis.method <- match.arg(dis.method, c("euclidean", "manhattan", 
                                        "jaccard", "cosine", "canberra"))
  hc.method <- match.arg(hc.method, c("ward.D", "ward.D2", "single", "complete", 
                                    "average", "mcquitty", "median", "centroid"))
  #read exposure data
  expos <- exposures(result = result)
  if (isTRUE(proportional)){
    expos <- t(sweep(expos, 2, colSums(expos), FUN = "/"))
  } else{
    expos <- t(expos)
  }
  #Calculate dissimilarity matrix
  diss <- philentropy::distance(x = expos, method = dis.method, use.row.names = TRUE, as.dist.obj = TRUE)
  #Perform clustering
  if(method == "kmeans"){
    res <- cluster::fanny(x = diss, k = nclust, diss = TRUE, maxit = iter.max, 
                          tol = tol)
    clust_out <- data.frame(cluster = res$clustering)
  } else if(method == "hkmeans"){
    if(!dis.method %in% c("euclidean", "manhattan", "canberra")){
      stop("For hkmeans clustering, please choose the following methods to calculate dissimilarity matrix: 
           euclidean, manhattan, canberra")
    }
    res <- factoextra::hkmeans(x = expos, k = nclust, hc.metric = dis.method, 
                               hc.method = hc.method, iter.max = iter.max)
    clust_out <- data.frame(cluster = res$cluster)
  } else if(method == "hclust"){
    res <- factoextra::hcut(x = diss, k = nclust, isdiss = FALSE, hc_method = hc.method)
    clust_out <-data.frame(cluster = res$cluster)
  } else if(method == "pam"){
    clust_out <- data.frame(cluster = cluster::pam(x = diss, k = nclust,
                                                   diss = TRUE,
                                                   cluster.only = TRUE))
  } else{
    if(!dis.method %in% c("euclidean", "manhattan", "jaccard")){
      stop("For clara clustering, please choose the following methods to calculate dissimilarity matrix: 
           euclidean, manhattan, jaccard")
    }
    res <- cluster::clara(x = expos, k = nclust, metric = dis.method, samples = clara.samples)
    clust_out <- data.frame(cluster = res$clustering)
  }
  return(clust_out)
}

#' @title Visualize clustering results
#' @description The clustering results can be visualized on a UMAP panel. 
#' Three different types of plots can be generated using this function: cluster-by-signature 
#' plot, cluster-by-annotation plot, and a single UMAP plot.
#' @param result A \code{\linkS4class{musica_result}} object generated by
#' a mutational discovery or prediction tool. A two-dimensional UMAP has to
#' be stored in this object.
#' @param clusters The result generated from cluster_exposure function.
#' @param group A single character string indicating the grouping factor. 
#' Possible options are: "signature" (columns are signatures in a grid), 
#' "annotation" (columns are sample annotation), and "none" (a single UMAP plot). 
#' Default is "signature".
#' @param annotation Column name of annotation.
#' @param plotly If TRUE, the plot will be made interactive using plotly.
#' @return Generate a ggplot or plotly object.
#' @seealso \link{create_umap}
#' @examples 
#' set.seed(123)
#' data(res_annot)
#' #Get clustering result
#' clust_out <- cluster_exposure(result = res_annot, nclust = 2)
#' #UMAP
#' create_umap(result = res_annot)
#' #generate cluster X signature plot
#' plot_cluster(result = res_annot, clusters = clust_out, group = "signature")
#' #generate cluster X annotation plot
#' plot_cluster(result = res_annot, clusters = clust_out, group = "annotation",
#'             annotation = "Tumor_Subtypes")
#' #generate a single UMAP plot
#' plot_cluster(result = res_annot, clusters = clust_out, group = "none")
#' @export

plot_cluster <- function(result, clusters, group = "signature", annotation = NULL, plotly = TRUE){
  group <- match.arg(group, c("signature", "annotation", "none"))
  if(length(umap(result)) == 0){
    stop("UMAP not found in musica_result object. Run create_umap(", deparse(substitute(result)),") first.")
  }
  k_toplot <- cbind(result@umap, clusters)
  k_toplot$cluster <- factor(k_toplot$cluster)
  if(group == "signature"){
    expos <- exposures(result = result)
    expos <- t(sweep(expos, 2, colSums(expos), FUN = "/"))
    clust_by_sigs <- cbind(k_toplot, expos) %>%
      tibble::rownames_to_column(var = "sample") %>%
      tidyr::pivot_longer(cols = colnames(expos),
                          names_to = "signature",
                          values_to = "exposure",
                          names_repair = "minimal")
    p <- ggplot2::ggplot(clust_by_sigs, aes_string(x = "UMAP_1", y = "UMAP_2", colour = "exposure")) +
           geom_point() +
           facet_grid(cluster ~ signature) +
           ggplot2::scale_colour_gradientn(colors = c("blue", "green", "yellow", "orange", "red"), name = "Fraction") +
           ggplot2::theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                    panel.background = element_blank())
  }
  else if(group == "annotation"){
    if(is.null(annotation) || !annotation %in% colnames(samp_annot(result))){
      stop("Sample annotation not found or invalid annotation column name.")
    }
    else{
      annot <- samp_annot(result) %>% tibble::column_to_rownames(var = "Samples")
      colnames(annot) <- "annotation"
      annot[["annotation"]] <- factor(annot[["annotation"]])
      clust_by_annot <- cbind(k_toplot, annot)
      p <- ggplot2::ggplot(clust_by_annot, aes_string(x = "UMAP_1", y = "UMAP_2", colour = "cluster")) +
             geom_point() +
             facet_grid(cluster ~ annotation) +
           ggplot2::theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                          panel.background = element_blank())
    }
  }
  else{
    p <- ggplot2::ggplot(k_toplot, aes_string(x = "UMAP_1", y = "UMAP_2", colour = "cluster")) +
           geom_point() +
         ggplot2::theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                        panel.background = element_blank())
  }
  if(plotly){
    p <- plotly::ggplotly(p)
    return(p)
  }
  else{
    return(p)
  }
}

#' @title Plots for helping decide number of clusters
#' @description To help decide the number of cluster, three different methods 
#' are provided: total within cluster sum of squares, average silhouette coefficient, 
#' and gap statistics.
#' @param result A \code{\linkS4class{musica_result}} object generated by
#' a mutational discovery or prediction tool.
#' @param method A single character string indicating which statistic to use for plot. 
#' Options are "wss" (total within cluster sum of squares), "silhouette" (average silhouette 
#' coefficient), and "gap_stat" (gap statistic). Default is "wss".
#' @param clust.method A character string indicating clustering method. Options are "kmeans" 
#' (default), "hclust" (hierarchical clustering), "hkmeans", "pam", and "clara".
#' @param n An integer indicating maximum number of clusters to test. Default is 10.
#' @param proportional Logical, indicating if proportional exposure (default) will be used for clustering.
#' @return A ggplot object.
#' @seealso \link[factoextra]{fviz_nbclust}
#' @examples 
#' data(res_annot)
#' set.seed(123)
#' #Make an elbow plot
#' k_select(res_annot, method = "wss", n = 6)
#' #Plot average silhouette coefficient against number of clusters
#' k_select(res_annot, method = "silhouette", n = 6)
#' #Plot gap statistics against number of clusters
#' k_select(res_annot, method = "gap_stat", n = 6)
#' @export

k_select <- function(result, method = "wss", clust.method = "kmeans", n = 10, proportional = TRUE){
  method <- match.arg(method, c("wss", "silhouette", "gap_stat"))
  clust.method <- match.arg(clust.method, c("kmeans", "hclust", "hkmeans", "pam", "clara"))
  expos <- exposures(result = result)
  if(isTRUE(proportional)){
    expos <- t(sweep(expos, 2, colSums(expos), FUN = "/"))
  }
  else{
    expos <- t(expos)
  }
  if(clust.method == "kmeans"){
    factoextra::fviz_nbclust(expos, stats::kmeans, method = method, k.max = n)
  }
  else if(clust.method == "hclust"){
    factoextra::fviz_nbclust(expos, factoextra::hcut, method = method, k.max = n)
  }
  else if(clust.method == "hkmeans"){
    factoextra::fviz_nbclust(expos, factoextra::hkmeans, method = method, k.max = n)
  }
  else if(clust.method == "pam"){
    factoextra::fviz_nbclust(expos, cluster::pam, method = method, k.max = n)
  }
  else{
    factoextra::fviz_nbclust(expos, cluster::clara, method = method, k.max = n)
  }
}
campbio/musicatk documentation built on Oct. 22, 2023, 8:28 p.m.