R/clustering.R


#' Generate plots to help decide the optimal number of clusters for k-means
#'
#' Multiple methods are included for assessing optimal number of clusters.
#'
#' This function generates plots that help you evaluate the optimal number of clusters
#' using three different methods:
#' 1. Elbow method: https://www.datanovia.com/en/lessons/determining-the-optimal-number-of-clusters-3-must-know-methods/.
#' Plot the total intra-cluster variation against the number of clusters and pick the k
#' where the curve shows a sharp bend (the elbow).
#'
#' 2. Calinski-Harabasz Index (ch) method:
#' https://www.datasciencecentral.com/profiles/blogs/machine-learning-unsupervised-k-means-clustering-and and
#' https://medium.com/@haataa/how-to-measure-clustering-performances-when-there-are-no-ground-truth-db027e9a871c
#'
#' 3. Average Silhouette Width (asw) method: https://www.datasciencecentral.com/profiles/blogs/machine-learning-unsupervised-k-means-clustering-and
#' and https://medium.com/@haataa/how-to-measure-clustering-performances-when-there-are-no-ground-truth-db027e9a871c
#'
#' Both ch and asw produce a score, and a higher score indicates better-defined clusters:
#' large between-cluster dispersion and small within-cluster dispersion. Plot the score
#' against the number of clusters and pick the k that maximizes it.
#'
#' @param inputted.data A dataframe.
#' @param clustering.columns Names of the columns containing the data to use for k-means clustering.
#'
#' @return A list is returned with two elements:
#' 1. ggplot for elbow plot.
#' 2. ggplot for ch and asw plot.
#'
#'
#' @export
#'
#' @family Clustering functions
#'
#' @examples
#'
#' example.data <- data.frame(x = c(18, 21, 22, 24, 26, 26, 27, 30, 31,
#'                                  35, 39, 40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30),
#'                           y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
#'                                 27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45))
#'
#' #dev.new()
#' plot(example.data$x, example.data$y)
#'
#' #Results should say that 3 clusters is optimal
#' output <- CalcOptimalNumClustersForKMeans(example.data, c("x", "y"))
#'
#' elbow.plot <- output[[1]]
#'
#' ch.and.asw.plot <- output[[2]]
#'
#' #dev.new()
#' elbow.plot
#'
#' #dev.new()
#' ch.and.asw.plot
#'
CalcOptimalNumClustersForKMeans <- function(inputted.data, clustering.columns){


  working.data <- inputted.data


  elbow.plot <- factoextra::fviz_nbclust(working.data[,clustering.columns], stats::kmeans, method = "wss") + ggplot2::labs(subtitle = "Elbow method")

  #grDevices::dev.new()
  #elbow.plot

  # Two other methods: Calinski-Harabasz (ch) and average silhouette width (asw)
  kclust.ch <- fpc::kmeansruns(working.data[,clustering.columns], krange=1:10, criterion='ch')
  kclust.asw <- fpc::kmeansruns(working.data[,clustering.columns], krange=1:10, criterion='asw')
  # kclust.ch$bestk and kclust.asw$bestk hold the best k under each criterion;
  # the scaled criterion curves are plotted below so the two can be compared.
  crit.df <- data.frame(k=1:10, ch=scale(kclust.ch$crit), asw=scale(kclust.asw$crit))
  crit.df <- reshape2::melt(crit.df, id.vars=c('k'), variable.name = "measure", value.name="score")

  #grDevices::dev.new()
  ch.and.asw.plot <- ggplot2::ggplot(crit.df, ggplot2::aes_string(x="k", y="score", color="measure"))+
    ggplot2::geom_point(ggplot2::aes_string(shape="measure"))+
    ggplot2::geom_line(ggplot2::aes_string(linetype="measure"))+
    ggplot2::scale_x_continuous(breaks = 1:10, labels=1:10)

  return(list(elbow.plot, ch.and.asw.plot))

}
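
# A minimal sketch (not part of the package) showing how the two criteria above
# can be computed by hand for a single k (k >= 2), assuming the cluster package
# is installed. manual.ch.and.asw() is a hypothetical helper name.
# CH = (between-cluster SS / (k - 1)) / (within-cluster SS / (n - k));
# ASW = mean silhouette width over all observations. The elbow plot above uses
# the same quantity as km$tot.withinss (total within-cluster sum of squares).
manual.ch.and.asw <- function(dat, k){
  km <- stats::kmeans(dat, centers = k, nstart = 25)
  n <- nrow(dat)
  ch <- (km$betweenss / (k - 1)) / (km$tot.withinss / (n - k))
  sil <- cluster::silhouette(km$cluster, stats::dist(dat))
  asw <- mean(sil[, "sil_width"])
  c(ch = ch, asw = asw)
}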


#' Compare clusters
#'
#' For each cluster as specified by rows.for.each.cluster, box plots are generated
#' for each column as specified in name.of.values.to.compare.
#'
#' @param data.input A dataframe
#' @param rows.for.each.cluster A numerical vector that indicates which cluster each observation (row in data.input) belongs to.
#' @param name.of.values.to.compare A vector of strings that indicates the columns in data.input that should be compared across clusters.
#'
#' @return No objects are returned, but a plot will be outputted.
#'
#'
#' @export
#'
#' @family Clustering functions
#'
#' @examples
#'
#' example.data <- data.frame(x = c(18, 21, 22, 24, 26, 26, 27, 30, 31,
#'                                  35, 39, 40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30),
#'                           y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
#'                                 27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45))
#'
#' #dev.new()
#' plot(example.data$x, example.data$y)
#'
#' km.res <- stats::kmeans(example.data[,c("x", "y")], 3, nstart = 25, iter.max=10)
#'
#' grouped <- km.res$cluster
#'
#' generate.plots.comparing.clusters(example.data, grouped, c("x", "y"))
#'
#'
generate.plots.comparing.clusters <- function(data.input, rows.for.each.cluster, name.of.values.to.compare){

  number.of.clusters.to.use <- length(unique(rows.for.each.cluster))

  #grDevices::dev.new()
  graphics::par(mfrow=c(1,number.of.clusters.to.use))

  running.vals <- NULL

  ##First pass: collect all values so every panel can share the same y-axis range
  for (K in 1:number.of.clusters.to.use) {
    d0 <- data.input[rows.for.each.cluster==K,name.of.values.to.compare]
    running.vals <- c(running.vals, unlist(d0)) #unlist() so range() works when multiple columns are selected
  }
  lmts <- range(running.vals)

  ##Second pass: draw one boxplot panel per cluster on the shared range
  for (K in 1:number.of.clusters.to.use) {
    d0 <- data.input[rows.for.each.cluster==K,name.of.values.to.compare]
    graphics::boxplot(d0, main=paste0("Cluster ", K), ylim=lmts)
  }
}
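
# A minimal sketch (not part of the package): the same comparison can be drawn
# with ggplot2 by reshaping to long format and faceting by cluster, which gives
# a shared y-axis without the two-pass range computation above.
# plot.clusters.long() is a hypothetical helper name.
plot.clusters.long <- function(data.input, rows.for.each.cluster, name.of.values.to.compare){
  long <- reshape2::melt(cbind(data.input[, name.of.values.to.compare, drop = FALSE],
                               cluster = rows.for.each.cluster),
                         id.vars = "cluster", variable.name = "feature", value.name = "value")
  ggplot2::ggplot(long, ggplot2::aes_string(x = "feature", y = "value")) +
    ggplot2::geom_boxplot() +
    ggplot2::facet_wrap(~cluster)
}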


#' Generate a parallel coordinates plot showing each observation and the cluster it belongs to
#'
#' @param inputted.data A dataframe where there are numerical columns for clustering.
#' @param cluster.assignment.column A string that specifies the name of the column in inputted.data that indicates which cluster each observation belongs to.
#' @param x_axis_features A vector of strings that specify which features from inputted.data should be displayed on the x-axis of the parallel plot.
#'
#' @return No object is returned but a plot will be created.
#'
#'
#' @export
#'
#' @family Clustering functions
#'
#' @examples
#'
#' example.data <- data.frame(x = c(18, 21, 22, 24, 26, 26, 27, 30, 31,
#'                                  35, 39, 40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30),
#'                           y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
#'                                 27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45))
#'
#' #dev.new()
#' plot(example.data$x, example.data$y)
#'
#' km.res <- stats::kmeans(example.data[,c("x", "y")], 3, nstart = 25, iter.max=10)
#'
#' grouped <- km.res$cluster
#'
#' example.data <- cbind(example.data, grouped)
#'
#' GenerateParcoordForClusters(example.data, "grouped", c("x", "y"))
#'
#'
GenerateParcoordForClusters <- function(inputted.data, cluster.assignment.column, x_axis_features){

  temp.data <- inputted.data
  grouped <- inputted.data[,cluster.assignment.column]

  #Will be subsetted for only features of interest and will contain avg lines.
  temp.data2 <- inputted.data[,x_axis_features]

  cluster.categories <- sort(unique(temp.data[,cluster.assignment.column]))
  num.clusters <- length(cluster.categories)
  parcoord.col <- c(grouped+1)
  parcoord.lwd <- c(rep(1, times = length(grouped)) )

  legend.lab <- c(sort(unique(grouped)))
  legend.col <- c(sort(unique(grouped+1)) )
  legend.lty <- rep(1, times = length(c(sort(unique(grouped+1)))))
  legend.lwd <- rep(1, times = length(c(sort(unique(grouped+1)))))

  color.num <- 5

  for(cluster.index in cluster.categories)
  {
    cluster.avg.line <- colMeans(subset(temp.data, temp.data[,cluster.assignment.column]==cluster.index)[,x_axis_features])

    temp.data2 <- rbind(temp.data2, cluster.avg.line)
    parcoord.col <- c(parcoord.col, color.num)
    parcoord.lwd <- c(parcoord.lwd, 6)

    label <- paste0("Avg line for ", cluster.index)
    legend.lab <- c(legend.lab, label)
    legend.col <- c(legend.col, color.num)
    legend.lty <- c(legend.lty, 1)
    legend.lwd <- c(legend.lwd, 6)

    color.num <- color.num + 2

  }

  #grDevices::dev.new()
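  #Note: MASS::parcoord rescales each column to the [0, 1] range before drawing,
  #so the appended cluster-average rows share the same scale as the data rows.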
  MASS::parcoord(temp.data2 , col= parcoord.col, lwd = parcoord.lwd)

  graphics::legend("topright", legend=legend.lab,
         col=legend.col, lty=legend.lty, lwd = legend.lwd, cex=0.8)


}


#' Make a 2D scatter plot that shows the data as represented by PC1 and PC2
#'
#' After clustering of a dataset with two or more dimensions, we often want to
#' visualize the result of the clustering on a 2D plot. If there are more than
#' two dimensions, we want to first reduce the data down to two dimensions.
#' This can be done with PCA. After PCA is completed, the data can be plotted
#' with this function.
#'
#' This function plots PC1 vs PC2 as well as PC1 vs PC3. It uses
#' the output of stats::prcomp(). The input into prcomp() needs to have
#' at least 3 dimensions. Points are colored by the cluster input and
#' labeled by the subgroup input.
#' 
#' Additionally, this function runs a chi-square test to assess whether
#' cluster.labels.input and subgroup.labels.input are associated.
#'
#' @param pca.results.input An object outputted by stats::prcomp(). The PCA of all the features used for clustering. There should be at least 3 features.
#' @param cluster.labels.input A vector of integers that specify which cluster each observation belongs to
#' (order of observations must match the data inputted to prcomp() to generate pca.results.input).
#' @param subgroup.labels.input A vector of strings that specify an additional label for each observation.
#'
#' @return A list of 4 objects:
#' 1. ggplot object for PC1 vs PC2.
#' 2. ggplot object for PC1 vs PC3.
#' 3. Chi-square results.
#' 4. Table used for chi-square.
#'
#' @export
#'
#' @family Clustering functions
#'
#' @examples
#' example.data <- data.frame(x = c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35,
#'                                  39, 40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30),
#'                            y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44, 27,
#'                                  29, 20, 28, 21, 30, 31, 23, 24, 40, 45),
#'                            z = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#'                                  1, 1, 1, 1, 1, 1, 1, 1, 1))
#'
#' #dev.new()
#' plot(example.data$x, example.data$y)
#'
#' km.res <- stats::kmeans(example.data[,c("x", "y", "z")], 3, nstart = 25, iter.max=10)
#'
#' grouped <- km.res$cluster
#'
#' pca.results <- prcomp(example.data[,c("x", "y", "z")], scale=FALSE)
#'
#' actual.group.label <- c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", 
#'                         "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B")
#'
#' results <- generate.2D.clustering.with.labeled.subgroup(pca.results, grouped, actual.group.label)
#'                                              
#' #PC1 vs PC2
#' results[[1]]
#' 
#' #PC1 vs PC3
#' results[[2]]
#' 
#' #Chi-square results
#' results[[3]]
#' 
#' #Table
#' results[[4]]
#'
generate.2D.clustering.with.labeled.subgroup <- function(pca.results.input, cluster.labels.input, subgroup.labels.input){

  #Outputs:
  #tbl
  #chisq.res
  
  tbl <- table(subgroup.labels.input, cluster.labels.input)
  sink("NUL")
  chisq.res <- stats::chisq.test(tbl)
  sink()

  ##Plot
  #print(tbl)
  #utils::str(tbl)

  #grDevices::dev.new()
  #graphics::par(mfrow=c(1,3), xpd=TRUE)

  #Percent of total variance explained by each principal component
  totalvar <- pca.results.input$sdev^2
  variancePer <- round(totalvar/sum(totalvar)*100,1)

  xlab_string <- paste(c("Principal Component 1 (",variancePer[1],"%)"),collapse="")
  ylab_string <- paste(c("Principal Component 2 (",variancePer[2],"%)"),collapse="")
  zlab_string <- paste(c("Principal Component 3 (",variancePer[3],"%)"),collapse="")
  
  xdata <- pca.results.input$x[,c(1)]
  ydata <- pca.results.input$x[,c(2)]
  zdata <- pca.results.input$x[,c(3)]
  cluster_color <- as.factor(cluster.labels.input)
  subgroup_label <- subgroup.labels.input
  
  pca.data <- data.frame(xdata, ydata, zdata, cluster_color, subgroup_label)
  
  PC1vPC2 <- ggplot2::ggplot(pca.data, ggplot2::aes_string(x="xdata", y="ydata")) + 
             ggplot2::geom_point(ggplot2::aes_string(color="cluster_color"), size=7) +
             ggplot2::geom_text(ggplot2::aes_string(label="subgroup_label")) +
             ggplot2::xlab(xlab_string) +
             ggplot2::ylab(ylab_string)
  
  
  PC1vPC3 <- ggplot2::ggplot(pca.data, ggplot2::aes_string(x="xdata", y="zdata")) + 
             ggplot2::geom_point(ggplot2::aes_string(color="cluster_color"), size=7) +
             ggplot2::geom_text(ggplot2::aes_string(label="subgroup_label")) +
             ggplot2::xlab(xlab_string) +
             ggplot2::ylab(zlab_string)
  
  output <- list()
  
  output[[1]] <- PC1vPC2
  output[[2]] <- PC1vPC3
  output[[3]] <- chisq.res
  output[[4]] <- tbl

  return(output)
  
}
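
# A minimal sketch (not part of the package): with small tables the chi-square
# approximation can be unreliable (stats::chisq.test() warns when expected
# counts are low), and Fisher's exact test on the same contingency table is a
# common fallback. check.association.exactly() is a hypothetical helper name.
check.association.exactly <- function(cluster.labels.input, subgroup.labels.input){
  tbl <- table(subgroup.labels.input, cluster.labels.input)
  stats::fisher.test(tbl)
}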

#' Make a 3D scatter plot that shows the data as represented by PC1, PC2, and PC3 and color labels clusters
#'
#' After clustering of a dataset with three or more dimensions, we often want to
#' visualize the result of the clustering on a 3D plot. If there are more than
#' three dimensions, we want to first reduce the data down to three dimensions.
#' This can be done with PCA. After PCA is completed, the data can be plotted
#' with this function.
#'
#' This function outputs values that can be used to plot PC1 vs PC2 vs PC3 using the rgl package.
#' It uses the output of stats::prcomp(). The input into prcomp() needs to have
#' at least 3 dimensions. PC = principal component.
#' 
#' This function also creates a contingency table and runs a chi-square test to
#' assess whether the cluster labels and subgroup labels are significantly associated.
#'
#' @param pca.results.input An object outputted by stats::prcomp(). The PCA of all the features used for clustering. There should be at least 3 features.
#' @param cluster.labels.input A vector of integers that specify which cluster each observation belongs to.
#' @param subgroup.labels.input A vector of strings that specify an additional label for each observation; every point needs a label.
#'
#' @return A list with 8 objects:
#'   1. String specifying the x-axis label with percent variance for PC1.
#'   2. String specifying the y-axis label with percent variance for PC2.
#'   3. String specifying the z-axis label with percent variance for PC3.
#'   4. A vector specifying the x coordinates of points (PC1 values).
#'   5. A vector specifying the y coordinates of points (PC2 values).
#'   6. A vector specifying the z coordinates of points (PC3 values).
#'   7. A chisq.test() result object.
#'   8. The table used for chisq.test().
#'
#' @export
#'
#' @family Clustering functions
#'
#' @examples
#'
#' example.data <- data.frame(x = c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35,
#'                                  39, 40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30),
#'                            y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44,
#'                                  27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45),
#'                            z = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3,
#'                                  3, 3, 3, 3, 3, 3, 3, 3))
#'
#' #dev.new()
#' plot(example.data$x, example.data$y)
#'
#' km.res <- stats::kmeans(example.data[,c("x", "y", "z")], 3, nstart = 25, iter.max=10)
#'
#' grouped <- km.res$cluster
#'
#' pca.results <- prcomp(example.data[,c("x", "y", "z")], scale=FALSE)
#'
#' actual.group.label <- c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", 
#' "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B")
#' 
#' results <- generate.3D.clustering.with.labeled.subgroup(pca.results, grouped, actual.group.label)
#' 
#' xlab.values <- results[[1]]
#' ylab.values <- results[[2]]
#' zlab.values <- results[[3]]
#' xdata.values <- results[[4]]
#' ydata.values <- results[[5]]
#' zdata.values <- results[[6]]
#' 
#' #rgl::rgl.bg(color = "white")
#' 
#' #rgl::plot3d(x= xdata.values, y= ydata.values, z= zdata.values,
#' #xlab = xlab.values, ylab = ylab.values, zlab = zlab.values, col=(grouped+1), pch=20, cex=2)
#' 
#' #rgl::text3d(x= xdata.values, y= ydata.values, z= zdata.values, text= actual.group.label, cex=1)
#'
generate.3D.clustering.with.labeled.subgroup <- function(pca.results.input, cluster.labels.input, subgroup.labels.input){

  tbl <- table(subgroup.labels.input, cluster.labels.input)
  chisq.res <- stats::chisq.test(tbl)

  #Percent of total variance explained by each principal component
  totalvar <- pca.results.input$sdev^2
  variancePer <- round(totalvar/sum(totalvar)*100,1)

  xlab <- paste(c("PC 1 (",variancePer[1],"%)"),collapse="")
  ylab <- paste(c("PC 2 (",variancePer[2],"%)"),collapse="")
  zlab <- paste(c("PC 3 (",variancePer[3],"%)"),collapse="")

  xdata <- pca.results.input$x[,c(1)]
  ydata <- pca.results.input$x[,c(2)]
  zdata <- pca.results.input$x[,c(3)]
  
  output <- list()
  
  output[[1]] <- xlab
  output[[2]] <- ylab
  output[[3]] <- zlab
  output[[4]] <- xdata
  output[[5]] <- ydata
  output[[6]] <- zdata
  output[[7]] <- chisq.res
  output[[8]] <- tbl
  
  return(output)
  
  ##Plot
  #rgl::rgl.open()
  #rgl::rgl.bg(color = "white")
  #rgl::plot3d(x= pca.results.input$x[,c(1)], y= pca.results.input$x[,c(2)], z= pca.results.input$x[,c(3)],
  #       xlab = xlab, ylab = ylab, zlab = zlab, col=(cluster.labels.input+1), pch=20, cex=2, main=main.text)
  
  #rgl::text3d(x= pca.results.input$x[,c(1)], y= pca.results.input$x[,c(2)], z= pca.results.input$x[,c(3)], text= subgroup.labels.input, cex=subgroup.text.size)
  
  #Allow plot to be displayed on html file
  #rgl::rglwidget()
  
  #rgl::pch3d(x= pca.results.input$x[,c(1)], y= pca.results.input$x[,c(2)], z= pca.results.input$x[,c(3)], pch= subgroup.labels.input, cex=0.3)

}
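
# A minimal sketch (not part of the package) of one way to draw the returned
# values with the rgl package, mirroring the commented example above.
# plot.pca.3d() is a hypothetical helper name; it assumes rgl is installed.
plot.pca.3d <- function(results, cluster.labels.input, subgroup.labels.input){
  rgl::plot3d(x = results[[4]], y = results[[5]], z = results[[6]],
              xlab = results[[1]], ylab = results[[2]], zlab = results[[3]],
              col = cluster.labels.input + 1, size = 8)
  rgl::text3d(x = results[[4]], y = results[[5]], z = results[[6]],
              texts = subgroup.labels.input, cex = 1)
}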

#' Automated hierarchical clustering with labeling of observations and groups
#'
#' Versatile hierarchical clustering function that can use correlation or distance
#' clustering. The linkage can also be specified.
#'
#' Correlation type, distance type, linkage type, and coloring of groups can
#' all be specified. The result is a dendrogram of the hierarchical clustering with
#' coloring scheme that shows which observations belong to which cluster. Additionally,
#' coloring of the terminal branches can be done with meta data so that the
#' meta data can be compared to the cluster assignment. Links to resources
#' used to make this function are provided in the code.
#'
#' @param working.data A dataframe of data
#' @param clustering.columns A vector of strings that indicate the names of columns to be used for clustering. The columns should be numerical.
#' @param label.column.name A string that indicates the name of column to be used for labeling the terminal branches of the dendrogram.
#' @param grouping.column.name A string that indicates the name of the column used for coloring terminal branches. The column should be a factor whose levels are integers: a value of 0 results in a black terminal branch, a value of 1 in a red terminal branch.
#' @param number.of.clusters.to.use A numerical value indicating how many clusters (main branches) to be colored.
#' @param distance_method A string that specifies the distance method for clustering. Default option is "euclidean". See documentation for stats::dist() for all available options. This is only used if Use.correlation.for.hclust is FALSE.
#' @param correlation_method A string that specifies the correlation method to be used for clustering. Default option is "spearman". See documentation for stats::cor() for all available options. This is only used if Use.correlation.for.hclust is TRUE.
#' @param linkage_method_type A string that specifies the linkage method to be used for clustering. See documentation for stats::hclust() for all available options. Examples are "ward.D", "complete".
#' @param Use.correlation.for.hclust A boolean specifying if correlation between observations should be used to cluster. If correlation is not used, then distance between observations is used instead.
#' @param terminal.branch.font.size A numeric value specifying font size of the labels for the terminal branches as specified by label.column.name. Default value is 1.
#' @param title.to.use A string that indicates the title of the plot.
#'
#' @return A list with six objects:
#' 1. hclust.res: The object outputted from stats::hclust().
#' 2. dend1: The object outputted from converting hclust.res into a dendrogram.
#' 3. kcboot: The results from fpc::clusterboot(), which evaluates the stability of the clusters.
#' 4. title: Title to use for the dendrogram.
#' 5. cluster.labels.renamed.reordered: Cluster assignment matched with the sample labels.
#' 6. d: Table generated with kcboot to assess the stability of the clusters.
#'
#' @export
#'
#' @family Clustering functions
#'
#' @import dendextend
#'
#' @examples
#'
#' id = c("1a", "1b", "1c", "1d", "1e", "1f", "1g", "2a", "2b", "2c", "2d", "3h", "3i", "3a",
#'        "3b", "3c", "3d", "3e", "3f", "3g", "2g", "2h")
#'
#' x = c(18, 21, 22, 24, 26, 26, 27, 30, 31, 35, 39, 40, 41, 42, 44, 46, 47, 48, 49, 54, 35, 30)
#'
#' y = c(10, 11, 22, 15, 12, 13, 14, 33, 39, 37, 44, 27, 29, 20, 28, 21, 30, 31, 23, 24, 40, 45)
#'
#' color = as.factor(c(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1))
#'
#' example.data <- data.frame(id, x, y, color)
#'
#' #dev.new()
#' plot(example.data$x, example.data$y)
#' text(example.data$x, example.data$y, labels = id, cex=0.9, font=2)
#'
#' results <- HierarchicalClustering(working.data = example.data,
#'                        clustering.columns = c("x", "y"),
#'                        label.column.name = "id",
#'                        grouping.column.name = "color",
#'                        number.of.clusters.to.use = 3,
#'                        distance_method = "euclidean",
#'                        correlation_method = NULL,
#'                        linkage_method_type = "ward.D",
#'                        Use.correlation.for.hclust = FALSE,
#'                        terminal.branch.font.size = 1,
#'                        title.to.use = "Clustering based on x and y data")
#'                        
#' hclust.res <- results[[1]]
#' dend <- results[[2]]
#' kcboot.res <- results[[3]]
#' title.to.use <- results[[4]]
#' labeled.samples <- results[[5]]
#' stability.table <- results[[6]]
#'                      
#' #Plot dendrogram                         
#' plot(dend, main = title.to.use)
#' 
#' #Add legend to dendrogram
#' legend.labels.to.use <- levels(example.data[,"color"])
#' col.to.use <- as.integer(levels(example.data[,"color"])) + 1
#' pch.to.use <- rep(20, times = length(legend.labels.to.use))
#' graphics::legend("topright",
#'       legend = legend.labels.to.use,
#'       col = col.to.use,
#'       pch = pch.to.use, bty = "n",  pt.cex = 1.5, cex = 0.8 ,
#'       text.col = "black", horiz = FALSE, inset = c(0, 0.1),
#'       title = "Color")
#'       
#' #Display sample assignment
#' labeled.samples
#'       
#' #Display stability of clusters
#' gridExtra::grid.table(stability.table)
#' 
#' #Display kcboot full output
#' kcboot.res
#' 
#'
HierarchicalClustering <- function(working.data, clustering.columns, label.column.name,
                                   grouping.column.name, number.of.clusters.to.use,
                                   distance_method = "euclidean", correlation_method = "spearman",
                                   linkage_method_type, Use.correlation.for.hclust,
                                   terminal.branch.font.size = 1, title.to.use){

  #Assumes that data is already normalized.

  #------------------------------------------------------------------------------
  # Clustering with Hierarchical clustering
  #------------------------------------------------------------------------------
  #Resources:
  #https://www.datacamp.com/community/tutorials/hierarchical-clustering-R
  #https://cran.r-project.org/web/packages/dendextend/vignettes/FAQ.html. How to color some labels
  #How to use correlation matrix: https://www.datanovia.com/en/blog/clustering-using-correlation-as-distance-measures-in-r/


  if(Use.correlation.for.hclust == TRUE)
  {
    #Use 1 - correlation as the dissimilarity: highly correlated observations get
    #a small distance. t() is needed because stats::cor() correlates columns.
    data.dist = stats::as.dist(1 - stats::cor(t(working.data[,clustering.columns]), method = correlation_method))
  }else{
    data.dist = stats::dist(working.data[,clustering.columns], method = distance_method)
  }


  #Cluster and convert the result into a dendrogram
  set.seed(1)
  hclust.res <- stats::hclust(data.dist, method = linkage_method_type)
  dend <- stats::as.dendrogram(hclust.res)

  #Get the order that should occur on the dendrogram: https://stackoverflow.com/questions/33611111/how-to-change-dendrogram-labels-in-r
  #and add labels.
  #labels() is from the dendextend package.
  labels(dend) <- working.data[,label.column.name][hclust.res$order]

  #Color by an additional group label
  #labels_colors is from dendextend package.
  labels_colors(dend) <- as.integer(as.character(working.data[,grouping.column.name][hclust.res$order])) + 1 #col = 1 is black. col = 2 is red.
  dend <- dendextend::set(dend, "labels_cex", value = terminal.branch.font.size)
  dend1 <- dendextend::color_branches(dend, k = number.of.clusters.to.use)

  #Calculate the quality of clusters using Dunn's index
  #Dunn's index is the ratio between the minimum inter-cluster distances to the maximum intra-cluster diameter.
  #The diameter of a cluster is the distance between its two furthermost points. In order to have well separated
  #and compact clusters you should aim for a higher Dunn's index.
  cluster <- stats::cutree(hclust.res, k = number.of.clusters.to.use)
  #Caution: the cluster numbers from cutree() do not necessarily match the
  #branch-coloring order produced by color_branches() on the dendrogram.
  dunn.index <- clValid::dunn(data.dist, cluster) #Ranges from 0 to Inf; should be maximized.

  #grDevices::dev.new()
  title <- paste(title.to.use, linkage_method_type, " linkage. ", distance_method, " distance. ", correlation_method, " correlation.\n", "Correlation Used", Use.correlation.for.hclust, ". Dunn's index= ", dunn.index)
  #plot(dend1, main = title)

  #Add legend to dendrogram
  #legend.labels.to.use <- levels(working.data[,grouping.column.name])
  #col.to.use <- as.integer(levels(working.data[,grouping.column.name])) + 1
  #pch.to.use <- rep(20, times = length(legend.labels.to.use))
  #graphics::legend("topright",
  #       legend = legend.labels.to.use,
  #       col = col.to.use,
  #       pch = pch.to.use, bty = "n",  pt.cex = 1.5, cex = 0.8 ,
  #       text.col = "black", horiz = FALSE, inset = c(0, 0.1),
  #       title = grouping.column.name)

  #Get the cluster assignment for each leaf
  ##The name labels of the leaves on the outputted dendrogram reflects the cluster assignment
  cluster.labels.renamed <- cluster
  names(cluster.labels.renamed) <- working.data[,label.column.name]
  cluster.labels.renamed.reordered <- cluster.labels.renamed[hclust.res$order]
  #print(title)
  #print("Cluster assignment")
  #print(cluster.labels.renamed.reordered)

  ##Evaluate stability of clusters
  #https://win-vector.com/2015/09/04/bootstrap-evaluation-of-clusters/
  #disthclustCBI needs to be used because the input matrix is already a distance (dissimilarity)
  #matrix. https://rdrr.io/cran/fpc/src/R/clusterboot.R
  set.seed(1)
  invisible(utils::capture.output(
    kcboot <- fpc::clusterboot(data.dist, distances = TRUE, B = 100,
                               clustermethod = fpc::disthclustCBI,
                               method = linkage_method_type,
                               k = number.of.clusters.to.use, seed = 1)))
  ##Display stability
  #bootmean: cluster-wise mean Jaccard similarity across the bootstrap resamples (higher = more stable).
  #bootbrd: number of resamples in which each cluster dissolved (lower = more stable).
  stability.table <- rbind(kcboot$bootmean, kcboot$bootbrd)
  rownames(stability.table) <- c("bootmean", "bootbrd")
  colnames(stability.table) <- c(1:number.of.clusters.to.use)
  #grDevices::dev.new()
  d <- stability.table
  #gridExtra::grid.table(d)

  #From the stability output, how do you tell which cluster on the dendrogram
  #is stable?
  groups <- kcboot$result$partition
  groups_relabeled <- groups
  names(groups_relabeled) <- working.data[,label.column.name]
  groups_relabeled_reordered <- groups_relabeled[hclust.res$order]

  #Checked that group assignment from hclust and cluster boot match by
  #comparing cluster.labels.renamed.reordered and groups_relabeled_reordered

  output <- list(hclust.res, 
                 dend1, 
                 kcboot, 
                 title,
                 cluster.labels.renamed.reordered,
                 d)

  return(output)

}
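
# A minimal sketch (not part of the package) of the quantity clValid::dunn()
# computes above: the minimum between-cluster distance divided by the maximum
# within-cluster diameter. manual.dunn() is a hypothetical helper name.
manual.dunn <- function(data.dist, cluster){
  dm <- as.matrix(data.dist)
  same.cluster <- outer(cluster, cluster, "==")
  off.diagonal <- row(dm) != col(dm)
  min.separation <- min(dm[!same.cluster])              #closest pair in different clusters
  max.diameter <- max(dm[same.cluster & off.diagonal])  #farthest pair within one cluster
  min.separation / max.diameter
}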