Using dplyr, tidyr and other packages
library(dplyr)
library(tidyr)
library(cluster)
library(tibble)
library(magrittr)
library(factoextra)
library(ggplot2)
library(Hmisc)
library(GGally)
library(gridExtra)
library(reshape2)
library(clValid)
library(NbClust)
library(fpc)
library(ggpubr)
library(tidyverse)
First, we will practice with one of R's built-in datasets, USArrests.
data("USArrests")
arrests_df = USArrests %>% as_tibble()
arrests_df
Describing the data
describe(arrests_df)
Summarizing:
summary(arrests_df)
Viewing histograms and a multivariate plot:
# one histogram per variable, arranged in a grid
arr.colnames = arrests_df %>% colnames()
graphs = list()
for (j in (1:ncol(arrests_df))){
  graphs[[j]] = arrests_df %>%
    ggplot(aes_string(x = arr.colnames[j])) +
    geom_histogram(bins = 18, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9)
}
hist.arr = do.call(grid.arrange, graphs)
mult.arr = ggpairs(arrests_df, diag = list(continuous = "barDiag")) + theme_bw()
mult.arr
Creating some boxplots:
# melting the data frame into long format for plotting
melted_df = melt(arrests_df)
melted_df %>%
  ggplot(aes(x = variable, y = value, fill = variable)) +
  geom_boxplot() +
  xlab("variables")
Plotting the correlation heatmap:
ggcorr(arrests_df, method = c("everything", "pearson"))
Next we apply hierarchical clustering and k-means methods to the data. But first, we compute some dissimilarity matrices and inspect them.
# scaling the data
arrests_df = arrests_df %>% scale()
arr.eucl_dist = get_dist(arrests_df, method = "euclidean")
arr.pearson_dist = get_dist(arrests_df, method = "pearson")
fviz_dist(arr.pearson_dist)
Now we compute all the clustering methods using eclust (which is convenient for producing further graphs).
# computing diana
arr.diana = arrests_df %>% eclust(FUNcluster = "diana", hc_metric = "euclidean")
# computing agglomerative hclust with average linkage and ward linkage
arr.av_hc = arrests_df %>% eclust(FUNcluster = "hclust", hc_method = "average")
arr.wd_hc = arrests_df %>% eclust(FUNcluster = "hclust", hc_method = "ward.D2")
# computing agnes (agglomerative nesting, hierarchical clustering) with ward linkage
arr.agnes = arrests_df %>% eclust(FUNcluster = "agnes", hc_metric = "euclidean")
# computing k-means
arr.km = arrests_df %>% eclust(FUNcluster = "kmeans", nstart = 25)
hc_list = list('Average linkage' = arr.av_hc,
               'Ward linkage' = arr.wd_hc,
               'Agnes' = arr.agnes,
               'Diana' = arr.diana)
Assessing the optimal number of clusters in k-means according to the Gap statistic, the silhouette plot and the elbow method.
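For reference (not part of the original code), the Gap statistic of Tibshirani, Walther and Hastie compares the observed within-cluster dispersion with its expectation under a reference null distribution:

$$\mathrm{Gap}_n(k) = E_n^{*}\{\log W_k\} - \log W_k,$$

where $W_k$ is the total within-cluster sum of squared distances for $k$ clusters. A common rule is to pick the smallest $k$ such that $\mathrm{Gap}(k) \geq \mathrm{Gap}(k+1) - s_{k+1}$, where $s_{k+1}$ is the simulation standard error.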
fviz_gap_stat(arr.km$gap_stat)
So, the optimal number of clusters is $k = 4$ according to the Gap statistic. This is confirmed by the following result:
arr.km$nbclust
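As a minimal sketch (assuming the arr.km object created by eclust above), the same choice can be traced back to the clusGap table and cluster::maxSE(); the method argument used here is an assumption and may differ from the rule factoextra applies internally.

# inspect the Gap statistic table stored by eclust/clusGap
gap_tab = arr.km$gap_stat$Tab
head(gap_tab)
# re-derive the chosen k from the gap values and their simulation standard errors
maxSE(f = gap_tab[, "gap"], SE.f = gap_tab[, "SE.sim"], method = "firstSEmax")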
Assessing the silhouette measure of K-means with 4 clusters:
fviz_silhouette(arr.km)
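For reference, the silhouette width plotted above is defined, for each observation $i$, as

$$s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}},$$

where $a(i)$ is the average distance from $i$ to the other members of its own cluster and $b(i)$ is the smallest average distance from $i$ to the members of any other cluster; values close to 1 indicate well-clustered observations, while negative values suggest a possible misassignment.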
However, we can investigate the problem of determining the optimal number of clusters further, plotting several scores for each number of clusters between 1 and 10. Initially, we analyse the optimal number of clusters according to the silhouette method:
fviz_nbclust(arrests_df, kmeans, method = "silhouette") + labs(subtitle = "Silhouette method")
Assessing the number of clusters using the elbow method:
fviz_nbclust(arrests_df, kmeans, method = "wss") + labs(subtitle = "Elbow method")
Viewing the corresponding Dunn indices and average silhouette width for $k = 4$:
km.stats = cluster.stats(arr.eucl_dist, arr.km$cluster)
print(km.stats$dunn)
print(km.stats$dunn2)
print(km.stats$avg.silwidth)
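For reference, the Dunn index reported by cluster.stats is, roughly,

$$D = \frac{\displaystyle\min_{k \neq l} d(C_k, C_l)}{\displaystyle\max_{m} \operatorname{diam}(C_m)},$$

the ratio of the smallest separation between two clusters to the largest cluster diameter, so larger values are better; dunn2 is an analogous ratio based on average between-cluster and within-cluster dissimilarities.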
And the distribution of indices, which shows how many indices recommend each $k$ as the optimal number of clusters.
km.nbclust = NbClust(data = arrests_df, distance = "euclidean",
                     min.nc = 2, max.nc = 10, method = "kmeans")
km.index = data.frame(km.nbclust$All.index)
km.index$index = row.names(km.index)
plot1 <- km.index %>%
  ggplot(aes(x = index, y = Hubert, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point()
plot2 <- km.index %>%
  ggplot(aes(x = index, y = Dindex, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point()
plot3 <- km.index %>%
  ggplot(aes(x = index, y = SDindex, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point()
ggarrange(plot1,
          ggarrange(plot2, plot3, labels = c("Dindex", "SDindex"),
                    hjust = c(0, 0), vjust = c(0.5, 0.5)),
          labels = "Hubert", nrow = 2, hjust = 0, vjust = 1.25)
This frequency distribution can be seen in the graph below:
fviz_nbclust(km.nbclust, ggtheme = theme_minimal())
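As a minimal sketch (assuming the km.nbclust object from above), the raw vote counts behind this bar chart can also be tabulated directly:

# how many indices recommend each number of clusters
votes = km.nbclust$Best.nc["Number_clusters", ]
table(votes)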
Plotting K-means clustering with $k = 2$:
arr.km2 = eclust(arrests_df, FUNcluster = "kmeans", k = 2)
Computing the average silhouette and Dunn measures for $k = 2$:
km.stats = cluster.stats(arr.eucl_dist, arr.km2$cluster)
print(km.stats$dunn)
print(km.stats$dunn2)
print(km.stats$avg.silwidth)
Thus, for K-means clustering, the best choices for $k$ are $k = 2$ or $k = 4$; $k = 4$ is particularly interesting given the Gap statistic results and the number of other indices that point to it as the optimal number of clusters.
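To put the two candidates side by side, here is a minimal sketch (assuming the scaled arrests_df and arr.eucl_dist objects above; the seed and nstart values are arbitrary choices):

set.seed(123)
km2 = kmeans(arrests_df, centers = 2, nstart = 25)
km4 = kmeans(arrests_df, centers = 4, nstart = 25)
stats2 = cluster.stats(arr.eucl_dist, km2$cluster)
stats4 = cluster.stats(arr.eucl_dist, km4$cluster)
# average silhouette width and Dunn index for each candidate k
data.frame(k = c(2, 4),
           avg.silwidth = c(stats2$avg.silwidth, stats4$avg.silwidth),
           dunn = c(stats2$dunn, stats4$dunn))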
We can also work with hierarchical clustering methods and assess them. The following dendrograms are generated by each hierarchical method chosen.
plot_list = list()
for (i in (1:length(hc_list))){
  plot_list[[i]] = fviz_dend(hc_list[[i]], rect = TRUE, main = names(hc_list)[i])
}
hc_dends = do.call(grid.arrange, plot_list)
hc_dends
With the corresponding silhouette plot and average silhouette width for $k = 3$:
plot_list = list()
for (i in (1:length(hc_list))){
  plot_list[[i]] = fviz_silhouette(hc_list[[i]], main = names(hc_list)[i])
}
hc_sill = do.call(grid.arrange, plot_list)
And the average silhouette width of each hierarchical clustering method:
sil_list = list()
for (i in (1:length(hc_list))){
  silinfo = hc_list[[i]]$silinfo
  sil_list[names(hc_list)[i]] = silinfo$avg.width
}
print(sil_list)
Now, concerning the optimal number of clusters for each method, we first repeat all the plots made above for the hierarchical methods, starting with the Gap statistic:
fviz_gap_stat(hc_list[[1]]$gap_stat) + labs(subtitle = "Gap statistics for hierarchical clustering")
Silhouette method
fviz_nbclust(arrests_df, FUNcluster = hcut, method = "silhouette") + labs(subtitle = "silhouette method")
Elbow method
fviz_nbclust(arrests_df, FUNcluster = hcut, method = "wss") + labs(subtitle = "Elbow method")
This suggests that $k = 2$ is an interesting choice for the number of clusters in the hierarchical methods. A quick check of the $k = 2$ hierarchical partition is sketched below; after that, we analyse the frequency with which each $k$ is chosen as the optimum.
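A minimal sketch (assuming the scaled arrests_df and arr.eucl_dist from above; hcut with Ward linkage is one of several reasonable choices here):

# cut a Ward-linkage tree at k = 2 and compute the same statistics used for K-means
hc2 = hcut(arrests_df, k = 2, hc_method = "ward.D2")
hc2.stats = cluster.stats(arr.eucl_dist, hc2$cluster)
print(hc2.stats$avg.silwidth)
print(hc2.stats$dunn)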
hc.nbclust = NbClust(data = arrests_df, distance = "euclidean",
                     min.nc = 2, max.nc = 10, method = "ward.D2")
hc.index = data.frame(hc.nbclust$All.index)
hc.index$index = row.names(hc.index)
plot1 <- hc.index %>%
  ggplot(aes(x = index, y = Hubert, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point()
plot2 <- hc.index %>%
  ggplot(aes(x = index, y = Dindex, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point()
plot3 <- hc.index %>%
  ggplot(aes(x = index, y = SDindex, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point()
ggarrange(plot1,
          ggarrange(plot2, plot3, labels = c("Dindex", "SDindex"),
                    hjust = c(0, 0), vjust = c(0.5, 0.5)),
          labels = "Hubert", nrow = 2, hjust = 0, vjust = 1.25)
fviz_nbclust(hc.nbclust, ggtheme = theme_minimal())
This gives the same result as shown above for K-means. Beyond assessing the number of clusters, we will now perform cluster validation, seeking the best clustering method according to each measure of each type of validation. We start by graphically analysing stability measures: APN (average proportion of non-overlap), AD (average distance), ADM (average distance between means) and FOM (figure of merit), all of which should be minimized.
cl_methods = c("hierarchical", "agnes", "diana", "kmeans")
n = 2:10
better_stability = clValid(arrests_df, n, cl_methods,
                           validation = "stability",
                           metric = "euclidean", method = "ward")
optim.stab = optimalScores(better_stability)
# trying to plot using ggplot:
# first converting the measures to a data frame
str(better_stability)
measures = better_stability@measures
for (i in 1:length(cl_methods)){
  local_measure = measures[, , cl_methods[i]]
  names.row = rownames(local_measure)
  names.col = colnames(local_measure)
  for (j in 1:nrow(local_measure)){
    rep.clustering = rep(cl_methods[i], ncol(local_measure))
    rep.row = rep(names.row[j], ncol(local_measure))
    row = local_measure[j, 1:length(n)]
    col.names = as.integer(names(row))
    if (j == 1 && i == 1){
      stab.data = data.frame("clustering" = rep.clustering, "type" = rep.row,
                             "nclust" = col.names, "measurement" = row)
    } else{
      new_data = data.frame("clustering" = rep.clustering, "type" = rep.row,
                            "nclust" = col.names, "measurement" = row)
      stab.data = bind_rows(stab.data, new_data)
    }
  }
}
plot1 <- stab.data %>% filter(type == "APN") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
plot2 <- stab.data %>% filter(type == "AD") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
plot3 <- stab.data %>% filter(type == "ADM") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
plot4 <- stab.data %>% filter(type == "FOM") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
ggarrange(plot1, plot2, plot3, plot4,
          labels = c("APN", "AD", "ADM", "FOM"),
          ncol = 2, nrow = 2,
          hjust = c(-0.1, -0.5, -0.1, -0.5),
          vjust = c(1.5, 1.5, 0.25, 0.25),
          common.legend = TRUE)
With the respective optimum statistics:
print(optim.stab)
Next, the internal measures: connectivity (to be minimized), and the Dunn index and average silhouette width (both to be maximized).
better_internal_measures = clValid(arrests_df, n, cl_methods,
                                   validation = "internal",
                                   metric = "euclidean", method = "ward")
optim.intern = optimalScores(better_internal_measures)
str(better_internal_measures)
measures = better_internal_measures@measures
for (i in 1:length(cl_methods)){
  local_measure = measures[, , cl_methods[i]]
  names.row = rownames(local_measure)
  names.col = colnames(local_measure)
  for (j in 1:nrow(local_measure)){
    rep.clustering = rep(cl_methods[i], ncol(local_measure))
    rep.row = rep(names.row[j], ncol(local_measure))
    row = local_measure[j, 1:length(n)]
    col.names = as.integer(names(row))
    if (j == 1 && i == 1){
      intern.data = data.frame("clustering" = rep.clustering, "type" = rep.row,
                               "nclust" = col.names, "measurement" = row)
    } else{
      new_data = data.frame("clustering" = rep.clustering, "type" = rep.row,
                            "nclust" = col.names, "measurement" = row)
      intern.data = bind_rows(intern.data, new_data)
    }
  }
}
plot1 <- intern.data %>% filter(type == "Connectivity") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
plot2 <- intern.data %>% filter(type == "Dunn") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
plot3 <- intern.data %>% filter(type == "Silhouette") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
ggarrange(plot1,
          ggarrange(plot2, plot3, ncol = 2,
                    labels = c("Dunn", "Silhouette"),
                    vjust = c(0.25, 0.25)),
          nrow = 2, labels = "Connectivity",
          vjust = 0.15, hjust = -0.25, common.legend = TRUE)
With the respective optimum statistics:
print(optim.intern)
So, according to the graphs and the above summary, we conclude that diana with $k = 2$ is the best clustering method for this dataset. The sketch below refits diana with $k = 2$ to visualize this final partition.
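A minimal sketch (assuming the scaled arrests_df from above; the plotting options are arbitrary choices):

# refit diana with the chosen number of clusters and plot the partition
arr.diana2 = eclust(arrests_df, FUNcluster = "diana", k = 2, hc_metric = "euclidean")
fviz_cluster(arr.diana2, ellipse.type = "convex", repel = TRUE)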
Now that we have analysed this built-in data, let's move on to a real dataset: the Indian Premier League dataset.

league_df = read.csv("datasets/total_data_na.csv") %>% as_tibble()
league_df
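Before anything else, a quick structural look at the raw columns can help (a minimal sketch; glimpse() comes from dplyr, which is loaded above):

glimpse(league_df)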
Since we don't have any information about the variables, and there are many, we will skip the descriptive/graphical part and go straight to the clustering analysis. Before moving forward, let's make a correlation heatmap, and after that apply feature scaling (except for the variables PLAYER, BBI, X5w and y).
ggcorr(league_df %>% select(-c(PLAYER)), method = c("everything", "pearson"))
Removing the variables y and BBI:
league_df = league_df %>% select(-c(y, BBI))
# substituting missing values by the column mean (zoo::na.aggregate) and scaling
league_df = league_df %>%
  mutate(Avg.x = as.numeric(as.character(Avg.x)),
         Avg.y = as.numeric(as.character(Avg.y)),
         SR.y = as.numeric(as.character(SR.y))) %>%
  mutate(across(where(is.numeric), zoo::na.aggregate)) %>%
  mutate(across(-c(PLAYER), scale))
league_df
Now we can analyse the clustering tendency, i.e. how clusterable the data is (according to the Hopkins statistic).
gradient.color = list(low = "steelblue", high = "white")
clust_tend.league = league_df[, -1] %>%
  get_clust_tendency(n = 100, gradient = gradient.color)
with the following Hopkins statistic:
clust_tend.league$hopkins_stat
and with the respective ordered dissimilarity plot:
clust_tend.league$plot
The Hopkins statistic and the plot show that the data is highly clusterable. So, we can move forward with a cluster analysis similar to the previous one:
# keeping the player names aside and dropping the PLAYER column before clustering
league.names = league_df[1]
league_df = league_df %>% select(-c(PLAYER))
# computing diana
league.diana = league_df %>% eclust(FUNcluster = "diana", hc_metric = "euclidean")
# computing agglomerative hclust with ward linkage
league.wd_hc = league_df %>% eclust(FUNcluster = "hclust", hc_method = "ward.D2")
# computing agnes (agglomerative nesting, hierarchical clustering) with ward linkage
league.agnes = league_df %>% eclust(FUNcluster = "agnes", hc_metric = "euclidean")
# computing k-means
league.km = league_df %>% eclust(FUNcluster = "kmeans", nstart = 25)
clust_list = list('K means' = league.km,
                  'Ward linkage' = league.wd_hc,
                  'Agnes' = league.agnes,
                  'Diana' = league.diana)
Assessing the optimal number of clusters for all the chosen clustering techniques according to: Gap statistics
km.gap = fviz_gap_stat(league.km$gap_stat)
hc.gap = fviz_gap_stat(league.wd_hc$gap_stat)
ggarrange(km.gap, hc.gap, labels = c("km", "hclust"), hjust = c(-0.75, 0.25))
Silhouette
sil.km = fviz_nbclust(league_df, FUNcluster = kmeans, method = "silhouette") +
  labs(subtitle = "silhouette method")
sil.hc = fviz_nbclust(league_df, FUNcluster = hcut, method = "silhouette") +
  labs(subtitle = "silhouette method")
ggarrange(sil.km, sil.hc, labels = c("km", "hclust"),
          hjust = c(-0.5, 0), nrow = 2, vjust = c(1.5, 0.5))
Elbow method
hc.elbow = fviz_nbclust(league_df, FUNcluster = hcut, method = "wss") +
  labs(subtitle = "Elbow method")
km.elbow = fviz_nbclust(league_df, FUNcluster = kmeans, method = "wss") +
  labs(subtitle = "Elbow method")
ggarrange(km.elbow, hc.elbow, labels = c("km", "hclust"),
          hjust = c(-0.75, 0), nrow = 2, vjust = c(1, 0.5))
Silhouette plot
plot_list = list()
for (i in (1:length(clust_list))){
  plot_list[[i]] = fviz_silhouette(clust_list[[i]], main = names(clust_list)[i])
}
ggarrange(plotlist = plot_list, ncol = 2, nrow = 2)
Before we continue with the determination of the best number of clusters and of the best method among those chosen, let's look at the dendrograms produced by each hierarchical method:
Ward agglomerative
fviz_dend(clust_list[[2]], rect = TRUE, main = names(clust_list)[2])
Agnes
fviz_dend(clust_list[[3]], rect = TRUE, main = names(clust_list)[3])
Diana
fviz_dend(clust_list[[4]], rect = TRUE, main = names(clust_list)[4])
And now, after displaying all the dendrograms, let's analyse the distribution of the number of clusters for the k-means and hierarchical methods:
hc.nbclust = NbClust(data = league_df, distance = "euclidean",
                     min.nc = 2, max.nc = 10, method = "ward.D2")
km.nbclust = NbClust(data = league_df, distance = "euclidean",
                     min.nc = 2, max.nc = 10, method = "kmeans")
hc.nbgraph = fviz_nbclust(hc.nbclust, ggtheme = theme_minimal()) + ylim(0, 8)
km.nbgraph = fviz_nbclust(km.nbclust, ggtheme = theme_minimal()) + ylim(0, 8)
ggarrange(hc.nbgraph, km.nbgraph, nrow = 2, labels = c("hc", "km"), hjust = c(-25, -20))
And now, using stability and internal measures for cluster validation, we will assess all the methods and choose the best one for each measure.
Stability measures
cl_methods = c("hierarchical", "agnes", "diana", "kmeans")
n = 2:10
better_stability = clValid(as.matrix(league_df), n, cl_methods,
                           validation = "stability",
                           metric = "euclidean", method = "ward")
optim.stab = optimalScores(better_stability)
# copying the code above: trying to plot using ggplot,
# first converting the measures to a data frame
str(better_stability)
measures = better_stability@measures
for (i in 1:length(cl_methods)){
  local_measure = measures[, , cl_methods[i]]
  names.row = rownames(local_measure)
  names.col = colnames(local_measure)
  for (j in 1:nrow(local_measure)){
    rep.clustering = rep(cl_methods[i], ncol(local_measure))
    rep.row = rep(names.row[j], ncol(local_measure))
    row = local_measure[j, 1:length(n)]
    col.names = as.integer(names(row))
    if (j == 1 && i == 1){
      stab.data = data.frame("clustering" = rep.clustering, "type" = rep.row,
                             "nclust" = col.names, "measurement" = row)
    } else{
      new_data = data.frame("clustering" = rep.clustering, "type" = rep.row,
                            "nclust" = col.names, "measurement" = row)
      stab.data = bind_rows(stab.data, new_data)
    }
  }
}
plot1 <- stab.data %>% filter(type == "APN") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
plot2 <- stab.data %>% filter(type == "AD") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
plot3 <- stab.data %>% filter(type == "ADM") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
plot4 <- stab.data %>% filter(type == "FOM") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
ggarrange(plot1, plot2, plot3, plot4,
          labels = c("APN", "AD", "ADM", "FOM"),
          ncol = 2, nrow = 2,
          hjust = c(-0.1, -0.5, -0.1, -0.5),
          vjust = c(1.5, 1.5, 0.25, 0.25),
          common.legend = TRUE)
print(optim.stab)
Internal measures
better_internal_measures = clValid(as.matrix(league_df), n, cl_methods,
                                   validation = "internal",
                                   metric = "euclidean", method = "ward")
optim.intern = optimalScores(better_internal_measures)
str(better_internal_measures)
measures = better_internal_measures@measures
for (i in 1:length(cl_methods)){
  local_measure = measures[, , cl_methods[i]]
  names.row = rownames(local_measure)
  names.col = colnames(local_measure)
  for (j in 1:nrow(local_measure)){
    rep.clustering = rep(cl_methods[i], ncol(local_measure))
    rep.row = rep(names.row[j], ncol(local_measure))
    row = local_measure[j, 1:length(n)]
    col.names = as.integer(names(row))
    if (j == 1 && i == 1){
      intern.data = data.frame("clustering" = rep.clustering, "type" = rep.row,
                               "nclust" = col.names, "measurement" = row)
    } else{
      new_data = data.frame("clustering" = rep.clustering, "type" = rep.row,
                            "nclust" = col.names, "measurement" = row)
      intern.data = bind_rows(intern.data, new_data)
    }
  }
}
plot1 <- intern.data %>% filter(type == "Connectivity") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
plot2 <- intern.data %>% filter(type == "Dunn") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
plot3 <- intern.data %>% filter(type == "Silhouette") %>%
  ggplot(aes(x = nclust, y = measurement, group = clustering)) +
  geom_line(aes(linetype = clustering, color = clustering)) +
  geom_point(aes(shape = clustering))
ggarrange(plot1,
          ggarrange(plot2, plot3, ncol = 2,
                    labels = c("Dunn", "Silhouette"),
                    vjust = c(0.25, 0.25)),
          nrow = 2, labels = "Connectivity",
          vjust = 0.15, hjust = -0.25, common.legend = TRUE)
With the respective optimum statistics:
print(optim.intern)
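As a hypothetical follow-up (a minimal sketch assuming the optim.intern object above; the focus on the Silhouette row and the method-name translation are assumptions, not conclusions drawn from the output), the preferred configuration could be read off and refit with eclust for further inspection:

# pick the method and number of clusters preferred by the Silhouette measure
best_method = as.character(optim.intern["Silhouette", "Method"])
best_k = as.numeric(as.character(optim.intern["Silhouette", "Clusters"]))
# clValid labels hierarchical clustering "hierarchical", while eclust expects "hclust"
refit_method = ifelse(best_method == "hierarchical", "hclust", best_method)
league.best = eclust(league_df, FUNcluster = refit_method, k = best_k)
fviz_silhouette(league.best)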