R/silhouette.R

Defines functions maximiseSilhouette plotDunn2s plotDunns plotWidestGap plotSilhouette

Documented in maximiseSilhouette plotDunn2s plotDunns plotSilhouette plotWidestGap

#' Plot silhouette
#'
#' Plot average silhouette for different values of K.
#' @param sil vector of the average silhouette for K from 2 to some value maxK
#' @param chosenK Chosen number of clusters. If specified, a vertical line is
#' plotted in correspondence of the indicated value. Default is NULL.
#' @param fileName Name of the png file.
#' @author Alessandra Cabassi \email{alessandra.cabassi@mrc-bsu.cam.ac.uk}
#' @keywords internal
#'
plotSilhouette <- function(sil, chosenK = NULL, fileName) {
    maxK <- length(sil) + 1

    fileName <- paste(fileName, "_silhouette.png", sep = "")
    grDevices::png(fileName, width = 400, height = 400)
    graphics::plot(2:maxK,
                   sil,
                   xlab = "Number of clusters",
                   ylab = "Average silhouette",
                   type = "o")
    if (!is.null(chosenK))
        graphics::abline(v = chosenK, col = "darkred")
    grDevices::dev.off()

    invisible(0)
}

#' Plot widest gap
#'
#' Plot widest gap for different values of K.
#' @param widestGap Vector of widest gap values for K from 2 to some value maxK.
#' @param chosenK Chosen number of clusters. If specified, a vertical line is
#' plotted in correspondence of the indicated value. Default is NULL.
#' @param fileName Name of the png file.
#' @author Alessandra Cabassi \email{alessandra.cabassi@mrc-bsu.cam.ac.uk}
#' @keywords internal
#'
plotWidestGap <- function(widestGap, chosenK = NULL, fileName) {
    maxK <- length(widestGap) + 1

    fileName <- paste(fileName, "_widestGap.png", sep = "")
    grDevices::png(fileName, width = 400, height = 400)
    graphics::plot(2:maxK,
                   widestGap,
                   xlab = "Number of clusters",
                   ylab = "Widest gap",
                   type = "o")
    if (!is.null(chosenK))
        graphics::abline(v = chosenK, col = "darkred")
    grDevices::dev.off()
}

#' Plot Dunn's index
#'
#' Plot Dunn's index (minimum separation / maximum diameter) for different
#' values of K.
#' @param dunns Vector of widest gap values for K from 2 to some value maxK.
#' @param chosenK Chosen number of clusters. If specified, a vertical line is
#' plotted in correspondence of the indicated value. Default is NULL.
#' @param fileName Name of the png file.
#' @author Alessandra Cabassi \email{alessandra.cabassi@mrc-bsu.cam.ac.uk}
#' @keywords internal
#'
plotDunns <- function(dunns, chosenK = NULL, fileName) {
    maxK <- length(dunns) + 1

    fileName <- paste(fileName, "_dunns.png", sep = "")
    grDevices::png(fileName, width = 400, height = 400)
    graphics::plot(2:maxK,
                   dunns,
                   xlab = "Number of clusters",
                   ylab = "Dunn's index",
                   type = "o")
    if (!is.null(chosenK))
        graphics::abline(v = chosenK, col = "darkred")
    grDevices::dev.off()

}

#' Plot Dunn's alternative index
#'
#' Plot Dunn's alternative index (minimum average dissimilarity between two
#' cluster divided by maximum average within cluster dissimilarity) for
#' different values of K.
#' @param dunns Vector of widest gap values for K from 2 to some value maxK.
#' @param chosenK Chosen number of clusters. If specified, a vertical line is
#' plotted in correspondence of the indicated value. Default is NULL.
#' @param fileName Name of the png file.
#' @author Alessandra Cabassi \email{alessandra.cabassi@mrc-bsu.cam.ac.uk}
#' @keywords internal
#'
plotDunn2s <- function(dunns, chosenK = NULL, fileName) {
    maxK <- length(dunns) + 1

    fileName <- paste(fileName, "_dunn2s.png", sep = "")
    grDevices::png(fileName, width = 400, height = 400)
    graphics::plot(2:maxK,
                   dunns,
                   xlab = "Number of clusters",
                   ylab = "Dunn's alternative index",
                   type = "o")
    if (!is.null(chosenK))
        graphics::abline(v = chosenK, col = "darkred")
    grDevices::dev.off()

    invisible(0)
}

#' Choose K that maximises the silhouette from a set of kernel matrices and
#' clusterings
#'
#' Choose the number of clusters K that maximises the silhouette, starting from
#' a set of kernel matrices each corresponding to a different choice of K and
#' the corresponding clusterings of the data for each of those values of K.
#'
#' @param kernelMatrix N X N X (maxK-1) array of kernel matrices.
#' @param clLabels (maxK-1) X N matrix containing the clusterings obtained for
#' different values of K.
#' @param maxK Maximum number of clusters considered.
#' @param savePNG If TRUE, a plot of the silhouette is saved in the working
#' folder. Defaults to FALSE.
#' @param fileName If \code{savePNG} is TRUE, this is the name of the png file.
#' @param isDistance Boolean. If TRUE, the kernel matrices are interpreted as
#' matrices of distances, otherwise as matrices of similarities.
#' @param widestGap Boolean. If TRUE, also computes widest gap index (and plots
#' it if \code{savePNG} is TRUE).
#' @param dunns Boolean. If TRUE, also computes Dunn's index: minimum separation
#' / maximum diameter (and plots it if \code{savePNG} is TRUE).
#' @param dunn2s Boolean. If TRUE, also computes an alternative version
#' of Dunn's index: minimum average dissimilarity between two cluster / maximum
#' average within cluster dissimilarity (and plots it if \code{savePNG} is
#' TRUE).
#' @return The function returns a list containing:
#' \item{silh}{a vector of length \code{maxK-1} such that \code{silh[i]} is the
#' silhouette for \code{K = i+1}}
#' \item{K}{the lowest number of clusters for which the silhouette is
#' maximised.}
#' @author Alessandra Cabassi \email{alessandra.cabassi@mrc-bsu.cam.ac.uk}
#' @export
#'
maximiseSilhouette <-
    function(kernelMatrix,
             clLabels,
             maxK,
             savePNG = FALSE,
             fileName = "silhouette",
             isDistance = FALSE,
             widestGap = FALSE,
             dunns = FALSE,
             dunn2s = FALSE) {

    # Initialise vector of average silhouette
    sil <- rep(NA, maxK - 1)
    if (widestGap)
        widestGap_ <- rep(NA, maxK - 1)
    if (dunns)
        dunns_ <- rep(NA, maxK - 1)
    if (dunn2s)
        dunn2s_ <- rep(NA, maxK - 1)

    for (i in 2:maxK) {
        if (isDistance) {
            DM <-
                stats::as.dist(kernelMatrix[, , i - 1], diag = FALSE,
                               upper = FALSE)
        } else {
            # Use the kernel matrix as distance matrix
            DM <-
                stats::as.dist(1 - kernelMatrix[, , i - 1], diag = FALSE,
                               upper = FALSE)
        }

        # Calculate the average silhouette over all the points
        if (widestGap | dunns | dunn2s)
            cluster_stats <- fpc::cluster.stats(DM, clLabels[i - 1, ])
        if (widestGap)
            widestGap_[i - 1] <- cluster_stats$widestgap
        if (dunns)
            dunns_[i - 1] <- cluster_stats$dunn
        if (dunn2s)
            dunn2s_[i - 1] <- cluster_stats$dunn2
        sil[i - 1] <-
            mean(cluster::silhouette(clLabels[i - 1, ], DM)[, "sil_width"])

    }

    K <- which.max(sil)[1] + 1
    # The "+1" is there because the silhouette is stored in a vector where
    # element i corresponds to number of clusters equal to i+1
    if (savePNG) {
        plotSilhouette(sil, K, fileName)
        if (widestGap)
            plotWidestGap(widestGap_, K, fileName)
        if (dunns)
            plotDunns(dunns_, K, fileName)
        if (dunn2s)
            plotDunn2s(dunn2s_, K, fileName)

    }

    return(list(silhouette = sil, K = K))
}
acabassi/coca documentation built on July 14, 2020, 8:31 p.m.