knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
set.seed(1001)
# install and load the packages used in this vignette, if needed
is_e1071 <- require("e1071", quietly = TRUE)
if (!is_e1071) {
  install.packages("e1071", repos = "https://cloud.r-project.org")
  library(e1071)
}

is_dbscan <- require("dbscan", quietly = TRUE)
if (!is_dbscan) {
  install.packages("dbscan", repos = "https://cloud.r-project.org")
  library(dbscan)
}

is_dendextend <- require("dendextend", quietly = TRUE)
if (!is_dendextend) {
  install.packages("dendextend", repos = "https://cloud.r-project.org")
  library(dendextend)
}

# harmony is only installed, not loaded, here; make the install conditional
# so it is not re-downloaded on every run
if (!requireNamespace("harmony", quietly = TRUE)) {
  install.packages("harmony", repos = "https://cloud.r-project.org")
}
In this vignette we illustrate how element-centric similarity (ECS) can be used to compare different kinds of clustering results: flat disjoint clusterings, flat overlapping (soft) clusterings, and hierarchical clusterings.
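To make the per-element nature of ECS concrete before working with real data, here is a small toy example of our own (not taken from the package documentation) comparing two partitions of six points:

library(ClustAssess)

# two partitions that disagree only on where point 3 belongs
part.a <- c(1, 1, 1, 2, 2, 2)
part.b <- c(1, 1, 2, 2, 2, 2)

# per-element scores: points affected by the disagreement score lower
element_sim_elscore(part.a, part.b)

# element_sim() averages the per-element scores into a single similarity
element_sim(part.a, part.b)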
library(ClustAssess)
library(ggplot2)
suppressPackageStartupMessages(library(dendextend))
theme_set(theme_classic())

# we will use the Iris dataset for this vignette
data <- as.matrix(iris[, 1:4])
df.iris <- as.data.frame(prcomp(data)$x)
df.iris$species <- iris$Species

ggplot(df.iris, aes(x = PC1, y = PC2, color = species)) +
  geom_point() +
  labs(title = "Iris PCA")
Next, we cluster the data using three different approaches: DBSCAN for a flat disjoint clustering, fuzzy c-means for a flat overlapping clustering, and complete-linkage hierarchical clustering.
# a flat, disjoint clustering with DBSCAN
db.res <- dbscan(data, eps = 1)$cluster
df.iris$db.cluster <- as.factor(db.res)

ggplot(df.iris, aes(x = PC1, y = PC2, color = db.cluster)) +
  geom_point() +
  labs(title = "DBSCAN clustering")

# an overlapping clustering with fuzzy c-means (soft k-means)
cmeans.res <- cmeans(data, centers = 6)$membership

# get the strongest cluster assignment for each observation to plot,
# but we will still use the soft cluster assignments to calculate ECS
df.iris$cmeans.cluster <- as.factor(apply(cmeans.res, 1, which.max))

ggplot(df.iris, aes(x = PC1, y = PC2, color = cmeans.cluster)) +
  geom_point() +
  labs(title = "c-means clustering")

# a hierarchical clustering with complete linkage
distances <- dist(data, method = "euclidean")
hc.res <- hclust(distances, method = "complete")

# plot the resulting dendrogram; leaf colors are assigned in dendrogram
# order, so reorder the species labels to match the leaves
node.colors <- c("blue", "red", "green")
dend <- as.dendrogram(hc.res)
leaf.species <- df.iris$species[order.dendrogram(dend)]
dend %>%
  set("leaves_pch", 19) %>%
  set("leaves_cex", 0.5) %>%
  set("leaves_col", node.colors[leaf.species]) %>%
  plot(main = "Complete linkage hierarchical clustering", leaflab = "none")
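Before computing similarities, it can help to inspect the soft assignments directly. This quick check is our addition; e1071's cmeans returns a row-normalized membership matrix:

# each row of the membership matrix holds one observation's weights
# across the six clusters and sums to 1
head(round(cmeans.res, 3))
rowSums(head(cmeans.res))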
Now we compare the three clustering results using element-centric similarity. ECS can compare clusterings of different kinds, including the overlapping clustering and the hierarchical clustering we just computed. Each observation receives a score between 0 and 1 that reflects how similarly it was clustered by the two methods: the higher the ECS, the more similarly that observation was clustered.
# which observations are clustered more similarly?
# first compare the flat disjoint and flat soft clusterings
df.iris$dbscan.cmeans.ecs <- element_sim_elscore(
  db.res,
  cmeans.res
)

ggplot(df.iris, aes(x = PC1, y = PC2, color = dbscan.cmeans.ecs)) +
  geom_point() +
  labs(title = "DBSCAN vs c-means similarity") +
  scale_colour_viridis_c()

mean(df.iris$dbscan.cmeans.ecs)

# next compare the flat disjoint and hierarchical disjoint clusterings
df.iris$dbscan.hc.ecs <- element_sim_elscore(
  db.res,
  hc.res
)

ggplot(df.iris, aes(x = PC1, y = PC2, color = dbscan.hc.ecs)) +
  geom_point() +
  labs(title = "DBSCAN vs hclust similarity") +
  scale_colour_viridis_c()

mean(df.iris$dbscan.hc.ecs)

# finally compare the flat soft and hierarchical disjoint clusterings
df.iris$cmeans.hc.ecs <- element_sim_elscore(
  cmeans.res,
  hc.res
)

ggplot(df.iris, aes(x = PC1, y = PC2, color = cmeans.hc.ecs)) +
  geom_point() +
  labs(title = "c-means vs hclust similarity") +
  scale_colour_viridis_c()

mean(df.iris$cmeans.hc.ecs)
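As an optional follow-up (our own sketch, not part of the original analysis), the per-element scores can be averaged within each species to see which species the three methods agree on most:

# average each pairwise ECS within species; higher means the two methods
# agree more on how that species is clustered
aggregate(
  cbind(dbscan.cmeans.ecs, dbscan.hc.ecs, cmeans.hc.ecs) ~ species,
  data = df.iris,
  FUN = mean
)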
sessionInfo()