PPCI: Projection Pursuit for Cluster Identification

Documented in cluster_performance success_ratio

### function cluster_performance() computes external cluster performance measures: Purity, V-Measure, Normalised Mutual Information and Adjusted Rand Index
## arguments:
# assigned = vector of cluster assignments
# labels = true class labels (same length as assigned)
# beta = weight parameter used in calculation of V-measure. Higher values apply higher weight to homogeneity over completeness.

cluster_performance = function(assigned, labels, beta = 1){
	n <- length(labels)
	T <- table(assigned, labels)
	RS <- rowSums(T)
	CS <- colSums(T)

	## V-measure

	CK <- - sum(apply(T, 1, function(x) return(sum(x[which(x>0)]*log(x[which(x>0)]/sum(x))))))/n
	KC <- - sum(apply(T, 2, function(x) return(sum(x[which(x>0)]*log(x[which(x>0)]/sum(x))))))/n
	K <- - sum(apply(T, 1, function(x) return(sum(x)*log(sum(x)/n))))/n
	C <- - sum(apply(T, 2, function(x) return(sum(x)*log(sum(x)/n))))/n
	if(C!=0){
		h <- 1 - CK/C
	}
	else{
		h <- 0
	}
	if(K!=0){
		c <- 1 - KC/K
	}
	else{
		c <- 0
	}
	if(h==0 && c==0) v.measure <- 0
	else v.measure <- (1+beta)*h*c/(beta*h+c)

	## Purity

	purity <- sum(apply(T, 1, function(x) return(max(x))))/n

	## Adjusted Rand Index

	O <- sum(sapply(T, function(t) choose(t, 2)))
	E <- (sum(sapply(RS, function(t) choose(t, 2)))*sum(sapply(CS, function(t) choose(t, 2))))/choose(n, 2)
	M <- (sum(sapply(RS, function(t) choose(t, 2))) + sum(sapply(CS, function(t) choose(t, 2))))/2
	adj.rand <- (O-E)/(M-E)


	## Normalised Mutual Information

	prod <- RS%*%t(CS)
	Tp <- T
	Tp[which(T==0)] <- 1e-10
	IXY <- sum(T*log(Tp*n/prod))
	HX <- sum(RS*log(RS/n))
	HY <- sum(CS*log(CS/n))
	NMI <- IXY/sqrt(HX*HY)

	c(adj.rand = adj.rand, purity = purity, v.measure = v.measure, nmi = NMI)
}



### function success_ratio() computes the success ratio (Pavlidis et al. 2016) of a binary partition
## arguments:
# assigned = vector of cluster assignments (takes at most two distinct values)
# labels = true class labels (same length as assigned)



success_ratio = function(assigned, labels){
  n = length(labels)

  #### Find the sizes of the overlaps between assigned and labels
  T = table(assigned, labels)

  #### determine which of the 2 aggregated classes to assign the classes
  split = apply(T, 2, which.max)

  #### if an aggregated class is empty, exit. failure to split any classes
  if(length(unique(split))==1){
    return(0)
  }

  #### calculate success and error
  success = min(sum(T[1,split==1]), sum(T[2,split==2]))
  error = sum(apply(T, 2, min))

  success/(success+error)
}