#' Clustering Validation Index with Manifold-Valued Data
#'
#' CVI
#'
#'
#' @export
rclust.index <- function(input, label,
index=c("CCC","CH","Cindex","DB","Dunn","Gamma","Gplus","McClain","Ptbiserial","Ratkowsky","SDbw","SDindex","Silhouette","Tau")){
allindices = c("ch","silhouette","cindex","ptbiserial","db","mcclain","dunn","gamma","gplus","tau","ratkowsky","ccc","sdbw","sdindex")
#-------------------------------------------------------
if (is.matrix(input)){
input = RiemBase::riemfactory(t(input), name="euclidean")
}
# must be of 'riemdata' class
if ((class(input))!="riemdata"){
stop("* rclust.index : the input must be of 'riemdata' class. Use 'riemfactory' first to manage your data.")
}
if ((length(input$data)!=length(label))||(!is.vector(label))){
stop("* rclust.index : length of a class label vector must correspond to that of provided data.")
}
# take care of label
label = as.integer(label)
# index type matching
index = match.arg(tolower(index), allindices)
#-------------------------------------------------------
# computation
# convert into data cube and extract mfdname
dcube = rclust_stack3d(input)
mfdname = input$name
memberlist = rclust_membership(label)
if (index=="silhouette"){
score = index_silhouette(dcube, mfdname, label, memberlist)
} else if (index=="ch"){
score = index_ch(dcube, mfdname, label, memberlist)
} else if (index=="cindex"){
score = index_cindex(dcube, mfdname, label, memberlist)
} else if (index=="ptbiserial"){
score = index_ptbiserial(dcube, mfdname, label, memberlist)
} else if (index=="db"){
score = index_db(dcube, mfdname, label, memberlist)
} else if (index=="mcclain"){
score = index_mcclain(dcube, mfdname, label, memberlist)
} else if (index=="dunn"){
score = index_dunn(dcube, mfdname, label, memberlist)
} else if (index=="gamma"){
score = index_gamma(dcube, mfdname, label, memberlist)
} else if (index=="gplus"){
score = index_gplus(dcube, mfdname, label, memberlist)
} else if (index=="tau"){
score = index_tau(dcube, mfdname, label, memberlist)
} else if (index=="ratkowsky"){
score = index_ratkowsky(dcube, mfdname, label, memberlist)
} else if (index=="ccc"){
score = index_ccc(dcube, mfdname, label, memberlist)
} else if (index=="sdbw"){
score = index_sdbw(dcube, mfdname, label, memberlist)
} else if (index=="sdindex"){
score = index_sdindex(dcube, mfdname, label, memberlist)
}
#-------------------------------------------------------
# return score
return(score)
}
############################################################################
# (01) Silhouette Index / Max / Rousseeux (1987)
#' @keywords internal
#' @noRd
index_silhouette <- function(dcube, mfdname, label, memberlist){
# compute 'pdist'
distmat = rclust_pdist_cube(dcube, mfdname, type="intrinsic")
# labeling care
ulabel = unique(label)
k = length(ulabel)
if (k==1){
stop("* rclust.index : for a single-label clustering, Silhouette index is not defined.")
}
# let's iterate, take a lot of time !
n = length(label)
vec.a = rep(0,n)
vec.b = rep(0,n)
for (i in 1:n){ # for each data object
# label of i-th element
idlabel = which(ulabel==label[i])
# compute a(i)
idx.samelabel = setdiff(memberlist[[idlabel]],i)
vec.a[i] = base::mean(as.vector(distmat[i,idx.samelabel]))
# compute b(j)
otherlabels = setdiff(ulabel, label[i])
dics = rep(0,k-1)
for (j in 1:(k-1)){
jlabel = which(ulabel==otherlabels[j])
idx.jlabel = memberlist[[jlabel]]
dics[j] = base::mean(as.vector(distmat[i,idx.jlabel]))
}
vec.b[i] = min(dics)
}
# now compute using vectorized operations
vec.s = ((vec.b-vec.a)/(base::pmax(vec.a, vec.b)))
# return
return(mean(vec.s))
}
############################################################################
# (02) CH / Max / Calinski and Harabasz (1974)
#' @keywords internal
#' @noRd
index_ch <- function(dcube, mfdname, label, memberlist){
# basic parameters
n = length(label)
q = length(unique(label))
if (q==1){
stop("* rclust.index : for a single-label clustering, CH index is not defined.")
}
## Previous Computation : Locally at the Tangent Space
# Bq = rclust_index_Bq(dcube, label, mfdname, memberlist)
# Wq = rclust_index_Wq(dcube, label, mfdname, memberlist)
# term.num = sum(diag(Bq))/(q-1)
# term.den = sum(diag(Wq))/(n-1)
# # compute preliminary ones
BGSS = rclust_index_BGSS(dcube, label, mfdname, memberlist)
WGSS = rclust_index_WGSS(dcube, label, mfdname, memberlist)
# compute the score
term.num = BGSS/(q-1)
term.den = WGSS/(n-q)
score = term.num/term.den
# return
return(score)
}
############################################################################
# (03) Cindex / Min / Hubert and Levin (1976)
#' @keywords internal
#' @noRd
index_cindex <- function(dcube, mfdname, label, memberlist){
# basic parameters
n = length(label)
q = length(unique(label))
Ns = rclust_index_Ns(label, memberlist) # Nt, Ns, Nw at once
Nw = Ns$Nw
# compute pairwise distance matrix
dmat = rclust_pdist_cube(dcube, mfdname, type="intrinsic")
# compute Sw
Sw = rclust_index_Sw(dmat, label, mfdname, memberlist)
# Smin and Smax
vecd = sort(as.vector(dmat[lower.tri(dmat)])) # sort the distances in an increasing order
Smin = sum(head(vecd, Nw))
Smax = sum(tail(vecd, Nw))
# compute score and return output
Cindex = ((Sw-Smin)/(Smax-Smin))
return(Cindex)
}
############################################################################
# (04) Ptbiserial / Max / Milligan 1980
#' @keywords internal
#' @noRd
index_ptbiserial <- function(dcube, mfdname, label, memberlist){
# basic parameters
n = length(label)
# info : counts
Ns = rclust_index_Ns(label, memberlist)
Nb = as.double(Ns$Nb)
Nw = as.double(Ns$Nw)
Nt = as.double(Ns$Nt)
# info : Sw- and Sb-corresponding bar matrices
dmat = rclust_pdist_cube(dcube, mfdname, type="intrinsic")
Sw.bar = rclust_index_Sw(dmat, label, mfdname, memberlist)/Nw
Sb.bar = rclust_index_Sb(dmat, label, mfdname, memberlist)/Nb
# compute
Sd = stats::sd(as.vector(dmat[lower.tri(dmat)])) # standard deviation
score = (((Sb.bar-Sw.bar)*sqrt(Nw*Nb/(Nt^2)))/Sd)
return(score)
}
############################################################################
# (05) DB / Min / Daviews and Bouldin 1979
#' @keywords internal
#' @noRd
index_db <- function(dcube, mfdname, label, memberlist){
# label
ulabel = unique(label)
q = length(ulabel)
dnrow = nrow(dcube)
dncol = ncol(dcube)
# compute class-wise mean (cube)
mean.class = rclust_classmean(dcube, label, mfdname, memberlist)
# compute distance between centroids
dmat.centroids = rclust_pdist_cube(mean.class, mfdname, type="intrinsic")
# compute in-class distance measures
dvec.inclass = rep(0,q)
for (i in 1:q){
idlabel = memberlist[[which(ulabel==ulabel[i])]]
dcube1 = array(0,c(dnrow,dncol,1)); dcube1[,,1] = mean.class[,,i]
dcube2 = rclust_cubesubset(dcube, idlabel)
dvec.inclass[i] = stats::sd(as.vector(rclust_pdist2_cube(dcube1, dcube2, mfdname, type="intrinsic")))
}
# now, compute the score
vec.db = rep(0,q)
for (k in 1:q){
idrest = setdiff(1:q,k)
del.k = dvec.inclass[k]
del.l = dvec.inclass[-k] # vector
dvecs = as.vector(dmat.centroids[k,idrest])
vec.db[k] = max((rep(del.k,(q-1))+del.l)/dvecs)
}
return(mean(vec.db))
}
############################################################################
# (07) McClain / Min / McClain and Rao 1975
#' @keywords internal
#' @noRd
index_mcclain <- function(dcube, mfdname, label, memberlist){
# info : counts
Ns = rclust_index_Ns(label, memberlist)
Nb = as.double(Ns$Nb)
Nw = as.double(Ns$Nw)
Nt = as.double(Ns$Nt)
# info : Sw- and Sb-corresponding bar matrices
dmat = rclust_pdist_cube(dcube, mfdname, type="intrinsic")
Sw.bar = rclust_index_Sw(dmat, label, mfdname, memberlist)/Nw
Sb.bar = rclust_index_Sb(dmat, label, mfdname, memberlist)/Nb
# output
output = Sw.bar/Sb.bar
return(output)
}
############################################################################
# (08) Dunn / Max / Dunn 1974
#' @keywords internal
#' @noRd
index_dunn <- function(dcube, mfdname, label, memberlist){
# basic information
q = length(memberlist) # number of unique clusters
dmat = rclust_pdist_cube(dcube, mfdname, type="intrinsic") # distance matrix
ulabel = unique(label)
# compute 1 : diameter
vec.diam = rep(0,q)
for (i in 1:q){
tgt = memberlist[[which(ulabel==ulabel[i])]]
vec.diam[i] = max(as.vector(dmat[tgt,tgt]))
}
# compute 2 : inter-cluster distance
mat.dist = array(0,c(q,q))
for (i in 1:(q-1)){
id1 = memberlist[[which(ulabel==ulabel[i])]]
for (j in (i+1):q){
id2 = memberlist[[which(ulabel==ulabel[j])]]
mat.dist[i,j] = min(as.vector(dmat[id1,id2]))
}
}
# compute : final step
term.den = max(vec.diam)
term.num = min(setdiff(unique(as.vector(mat.dist)),0))
output = (term.num/term.den)
return(output)
}
############################################################################
# (09) Gamma / Max / Baker and Hubert 1975
#' @keywords internal
#' @noRd
index_gamma <- function(dcube, mfdname, label, memberlist){
# compute distance matrix
dmat = rclust_pdist_cube(dcube, mfdname, type="intrinsic") # distance matrix
# compute concordance information
concordance = rclust_concordant(dmat, label, memberlist)
sp = as.double(concordance$con)
sm = as.double(concordance$dis)
# score
output = (sp-sm)/(sp+sm)
return(output)
}
############################################################################
# (10) Gplus / Min / Rohlf 1974
#' @keywords internal
#' @noRd
index_gplus <- function(dcube, mfdname, label, memberlist){
# compute distance matrix
dmat = rclust_pdist_cube(dcube, mfdname, type="intrinsic") # distance matrix
# compute concordance information
concordance = rclust_concordant(dmat, label, memberlist)
sm = as.double(concordance$dis)
# compute Nt
n = length(label)
Nt = as.double(n*(n-1)/2)
# compute score
output = (2*sm/(Nt*(Nt-1)))
return(output)
}
############################################################################
# (11) Tau / Max / Rohlf 1974
#' @keywords internal
#' @noRd
index_tau <- function(dcube, mfdname, label, memberlist){
# compute distance matrix
dmat = rclust_pdist_cube(dcube, mfdname, type="intrinsic") # distance matrix
# compute concordance information
concordance = rclust_concordant(dmat, label, memberlist)
sp = as.double(concordance$con)
sm = as.double(concordance$dis)
# compute Nt, Nb, Nw
info.n = rclust_index_Ns(label, memberlist)
Nt = as.double(info.n$Nt)
Nb = as.double(info.n$Nb)
Nw = as.double(info.n$Nw)
# compute score
term.num = (sp-sm)
term.den = sqrt(Nb*Nw*Nt*(Nt-1)/2)
output = term.num/term.den
return(output)
}
############################################################################
# (12) Ratkowsky / Max / Ratkowsky and Lance 1978
#' @keywords internal
#' @noRd
index_ratkowsky <- function(dcube, mfdname, label, memberlist){
# ulabel
ulabel = unique(label)
q = length(memberlist)
# equivariant embedding first.
X = rclust_equivariant_cube(dcube, mfdname)
p = ncol(X)
# compute n_k vector
vec.Nk = rep(0,q)
for (i in 1:q){
vec.Nk[i] = length(memberlist[[which(ulabel==ulabel[i])]])
}
# compute class-wise and global mean
mean.class = array(0,c(q,p))
for (i in 1:q){
mean.class[i,] = as.vector(colMeans(X[memberlist[[which(ulabel==ulabel[i])]],]))
}
mean.global = as.vector(colMeans(X))
# compute 1 : BGSS
BGSS = rep(0,p)
for (j in 1:p){
BGSS[j] = sum(((as.vector(mean.class[,j])-mean.global[j])^2)*vec.Nk)
}
# compute 2 : TSS
TSS = rep(0,p)
for (i in 1:p){
TSS[i] = sum((as.vector(X[,i])-mean.global[i])^2)
}
# compute the scoer
return((sqrt((1/p)*sum(BGSS/TSS)))/sqrt(q))
}
############################################################################
# (13) CCC / Max / Sarle 1983
#' @keywords internal
#' @noRd
index_ccc <- function(dcube, mfdname, label, memberlist){
# parameters and equivariant embedding
ulabel = unique(label)
X = rclust_equivariant_cube(dcube, mfdname) # extrinsic !
n = nrow(X)
p = ncol(X)
q = length(memberlist)
# preliminary computation
XtX = t(X)%*%X
vec.s = eigen(XtX/(n-1))$values # squared-root eigenvalues from empirical gram matrix
print(vec.s)
vec.s[(vec.s<=0)]=sqrt(.Machine$double.eps); vec.s = sqrt(vec.s);
val.v = prod(vec.s)
val.c = (val.v/q)^(1/p)
vec.u = (vec.s/val.c)
val.p = max(pmin(which(vec.u>=1),(q-1)))
Z = rclust_Z(label, memberlist)
X.bar = base::solve((t(Z)%*%Z),(t(Z)%*%X))
# main part 1 : R2 and ER2
R2 = 1-(sum(diag(XtX - (t(X.bar)%*%t(Z)%*%Z%*%X.bar)))/sum(diag(XtX)))
ER2 = 1-((((sum(1/(n+vec.u[1:val.p]))) + (sum((as.vector(vec.u[(val.p+1):p])^2)/(n+vec.u[(val.p+1):p]))))/(sum(vec.u^2)))*(((n-q)^2)/n)*(1+(4/n)))
# main part 2 : truly main
output = log((1-ER2)/(1-R2))*sqrt(n*val.p/2)/((0.001+ER2)^(1.2))
return(output)
}
############################################################################
# (14) SDindex / Min / Halkidi et al 2000
#' @keywords internal
#' @noRd
index_sdindex <- function(dcube, mfdname, label, memberlist){
# parameters and equivariant embedding
ulabel = unique(label)
X = rclust_equivariant_cube(dcube, mfdname) # extrinsic !
q = length(memberlist)
# compute 1 : Scat(q)
sig.local = list()
for (i in 1:q){
sig.local[[i]] = apply(X[memberlist[[which(ulabel==ulabel[i])]],], 2, var)
}
sig.global = apply(X, 2, var)
Scat = 0
for (i in 1:q){
tgt = sig.local[[i]]
Scat = Scat + sqrt(sum(tgt^2))
}
Scat = Scat/(sqrt(sum(as.vector(sig.global)^2))*q)
# compute 2 : Dis
classmean = rclust_classmean(dcube, label, mfdname, memberlist)
distmat = rclust_pdist_cube(classmean, mfdname, type="intrinsic")
halfD = as.vector(distmat[upper.tri(distmat)])
Dmax = max(halfD)
Dmin = min(halfD)
Dis = 0.0
for (k in 1:q){
Dis = Dis + (1/sum(as.vector(distmat[k,])))
}
Dis = Dis*Dmax/Dmin
# output
output = Dis*(1+Scat)
return(output)
}
############################################################################
# (15) SDbw / Min / Halkidi and Vazirgiannis 2001
#' @keywords internal
#' @noRd
index_sdbw <- function(dcube, mfdname, label, memberlist){
# parameters and equivariant embedding
ulabel = unique(label)
X = rclust_equivariant_cube(dcube, mfdname) # extrinsic !
q = length(memberlist)
# compute 1 : Scat(q)
sig.local = list()
for (i in 1:q){
sig.local[[i]] = apply(X[memberlist[[which(ulabel==ulabel[i])]],], 2, var)
}
sig.global = apply(X, 2, var)
Scat = 0
for (i in 1:q){
tgt = sig.local[[i]]
Scat = Scat + sqrt(sum(tgt^2))
}
Scat = Scat/(sqrt(sum(as.vector(sig.global)^2))*q)
# compute 2 : Density.bw with Stdev : refer to clusterCrit
# 2-1. threshold
Stdev = 0
for (k in 1:q){
Stdev = Stdev + sqrt(sum(sig.local[[k]]^2))
}
Stdev = sqrt(Stdev)/q
# 2-2. classwise mean in extrinsic manner
classmean = rclust_equivariant_cube(rclust_classmean(dcube, label, mfdname, memberlist), mfdname)
# 2-3. iterate
density.bw = 0
for (i in 1:(q-1)){
id1 = memberlist[[which(ulabel==ulabel[i])]]
for (j in (i+1):q){
id2 = memberlist[[which(ulabel==ulabel[j])]]
idunion = c(id1,id2) # join two groups
cluster1 = as.vector(classmean[i,])
cluster2 = as.vector(classmean[j,])
centroid = as.vector((cluster1+cluster2)/2)
val.top = rclust_index_density(centroid, X[idunion,], Stdev)
val.bot1 = rclust_index_density(cluster1, X[idunion,], Stdev)
val.bot2 = rclust_index_density(cluster2, X[idunion,], Stdev)
if (max(c(val.bot1,val.bot2))>0){
density.bw = density.bw + (val.top/max(c(val.bot1,val.bot2)))
}
}
}
density.bw = density.bw*2/(q*(q-1))
# return output
return(Scat+density.bw)
}
############################################################################
############################################################################
## Et Cetera
# * Scott / Maximal Incremental / Scott and Symons 1971
#' @keywords internal
#' @noRd
index_scott <- function(dcube, mfdname, label, memberlist){
# compute Wq
Wq = rclust_index_Wq(dcube, label, mfdname)
# comptue T (as Tmat)
meanfunc = utils::getFromNamespace("rbase.mean.cube","RiemBase")
mean.global = meanfunc(dcube, mfdname)$x
log.pulled = cpp_dispersion(dcube, mean.global, mfdname)
Tmat = (log.pulled%*%t(log.pulled))
# compute the score
n = length(label)
score = log(det(Tmat)/det(Wq))*n
return(score)
}
# for IRIS dataset [,2:4], McClain and CCC are reporting weird results.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.