R/internal_cluster_validity.R

Defines functions clusterValidityPerK getK internalClusterValidity avgSilhouette

# ---------------------------------------------------------------------------- #
#
# Functions which implement internal cluster validation metrics
#
# ---------------------------------------------------------------------------- #


# avgSilhouette
#
# For a clustering solution for any k, compute the average silhouette
# width as an internal measure of cluster validity
#
# @param clusters Vector where values are cluster assignments for samples
# @param dmatrix Dissimilarity matrix as generated by the
# \code{dist()} function
#
# @return Numeric value
avgSilhouette <- function(clusters, dmatrix) {

    stats <- summary(cluster::silhouette(clusters, dmatrix))
    return(stats[["avg.width"]])
}


# clusterValidity
#
# For a given value of k, an hclust object and the corresponding dissimilarity
# matrix, cut the tree at height \code{k} and compute internal measures of
# cluster validity. For now, we compute only the Average Silhouette width
# of the partition.
#
# @param k Numeric value giving number of clusters, used to cut the dendrogram
# from an \code{hclust} object
# @param hclust_obj hclust object
# @param dmatrix Dissimilarity matrix as produced by the \code{dist} function
#
# @return Data frame with one row
internalClusterValidity <- function(k, hclust_obj, dmatrix) {

    clusters <- cutree(hclust_obj, k)
    data.frame("k" = k,
                "Average_Silhouette" = avgSilhouette(clusters, dmatrix))
}


getK <- function(ft_mat, optimal_clusters) {

    if (optimal_clusters) {
        ft_mat %>%
            clusterValidityPerK() %>%
            dplyr::slice(which.max(Average_Silhouette))
    } else {
        ft_mat %>%
            clusterValidityPerK() %>%
            dplyr::filter(k == 2)
    }

}


# clusterValidityPerK
#
# Given an hclust object and the corresponding dissimilarity
# matrix, cut the tree at every possible height \code{k} and compute internal
# measures of cluster validity. For now, we compute only the Average Silhouette
# width of the partition.
#
# @param ft_mat Data frame or matrix, feature matrix as generated by
# \code{\link{summarizePeaks}} or \code{\link{binarizePeaks}}
#
# @return Data frame with one row per value of k
clusterValidityPerK <- function(ft_mat) {

    dmatrix <- dist(ft_mat)
    hclust_obj <- hclust(dmatrix, method = "complete")

    k <- seq(2, (nrow(ft_mat) - 1))
    lapply(k, internalClusterValidity, hclust_obj, dmatrix) %>%
        dplyr::bind_rows()

}
selinj/chromswitch documentation built on Jan. 27, 2024, 12:36 p.m.