#' @title Get silhouette score for a clustering result
#'
#' @description The silhouette score is a cluster qualiy index that assessses both cluster cohesion
#' (how close points within the same cluster are to each other) and cluster separation
#' (how well different clusters are separated) for a given clustering result (encoded in the group membership vector).
#' The silhouette score is bounded by -1 (worst value) and 1 (best value). The silhouette score s of a point i is defined as:
#' s(i)=b(i)-a(i)/max(b(i),max(i)), where a(i) is the mean distance of this point to all other points within the same cluster
#' and b(i) is the smallest mean distance to data points in the nearest neighboring cluster.
#' The silhouette score of the entire clustering is computed as the mean of silhouette scores across all points.
#' This function wraps vegdist to compute distances/dissimilarities.
#'
#' @param abundances a matrix with taxa as rows and samples as columns
#' @param groups vector with group memberships
#' @param method name of dissimilarity or distance to use. See vegdist for options
#' @export
#'
silhouette <- function(abundances, groups, method='bray'){
if (length(groups) == length(colnames(abundances))){
abundances <- t(abundances)
} else if (length(groups) != length(rownames(abundances))){
stop('Group vector is not the same length as the abundance table dimensions.')
}
#print(dim(abundances))
dis <- as.matrix(vegdist(abundances, method=method))
# for every sample in a cluster, calculate silhoutte coefficient
scores <- c()
empty=FALSE
for (i in 1:nrow(dis)){
row <- dis[i,]
cluster <- groups[i]
intra_cluster_distance <- 0
smallest_cluster_distance <- 1
for (clus in unique(groups)){
ids <- which(groups == clus)
if (clus == cluster){
ids <- ids[ids != i] # distance to self should not be included
# Manual merge of treatment of zero-member clusters with alternative
# <<<<<<< HEAD
# if(length(ids)==0){
# warning(paste("Cluster",cluster," of",length(unique(groups)),"clusters is empty! Silhouette is set to NA"))
# empty=TRUE
# }else{
# #print(paste("ids=",ids))
# intra_cluster_distance <- mean(row[ids])
# }
# # fix: replaced meandist below by intra_cluster_distance
# } else if (intra_cluster_distance < smallest_cluster_distance){
# smallest_cluster_distance <- mean(row[ids])
# =======
# only done if there are members in the cluster: intra_cluster_distance <- mean(row[ids])
if(length(ids)==0){
warning(paste("Cluster",cluster," of",length(unique(groups)),"clusters is empty! Silhouette is set to NA"))
empty=TRUE
}else{
#print(paste("ids=",ids))
intra_cluster_distance <- mean(row[ids])
}
} else {
inter_cluster_distance <- mean(row[ids])
if (inter_cluster_distance < smallest_cluster_distance){
smallest_cluster_distance <- inter_cluster_distance
}
# >>>>>>> 0d6c5931d0584caae022c3a1319236ef0c91ed4c
}
} # loop clusters
if (intra_cluster_distance < smallest_cluster_distance){
silhouette <- 1-(intra_cluster_distance / smallest_cluster_distance)
} else if (intra_cluster_distance > smallest_cluster_distance){
silhouette <- (smallest_cluster_distance / intra_cluster_distance) - 1
} else {
silhouette <- 0
}
if(empty){
silhouette=NA
}
#print(paste("silhouette",silhouette))
scores <- c(scores, silhouette)
} # loop rows of dissimilarity matrix
#print(scores)
return(mean(scores))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.