R/ICLUST.cluster.R
In psych: Procedures for Psychological, Psychometric, and Personality Research

ICLUST.cluster <- function (r.mat,ICLUST.options,smc.items) {#should allow for raw data, correlation or covariances

#options:  alpha =1  (minimum alpha)  2 (average alpha)    3 (maximum alpha)
#          beta =1   (minimum beta)   2 (average beta)     3 (maximum beta)
#          correct  for reliability
#          reverse score items if negative correlations
#          stop clustering if beta for new clusters < beta.min
#          output =1 (short)    2  (show steps)     3 show rejects as we go
#          
#initialize  various arrays and get ready for the first pass
output <- ICLUST.options$output

num.var <- nrow(r.mat)
keep.clustering <- TRUE          #used to determine when we are finished clustering
results <- data.frame(matrix(rep(0,18*(num.var-1)),ncol=18))   #create the data frame for the results
#results <-  matrix(rep(0,18*(num.var-1)),ncol=18)   #use a matrix for speed but we can not address it by name
names(results) <- c("Item/Cluster", "Item/Clust","similarity","correlation","alpha1","alpha2",
"beta1","beta2","size1","size2","rbar1","rbar2","r1","r2","alpha","beta","rbar","size")
rownames(results) <- paste("C",1:(num.var-1),sep="")
digits <- ICLUST.options$digits
clusters <- diag(1,nrow =nrow(r.mat))    #original cluster structure is 1 item clusters
if(is.null(rownames(r.mat))) {rownames(r.mat) <-  paste("V",1:num.var,sep="") }
rownames(clusters) <- rownames(r.mat)
colnames(clusters) <- paste("V",1:num.var,sep="")
diag(r.mat) <- 0
row.range <- apply(r.mat,1,range,na.rm=TRUE)     
item.max<- pmax(abs(row.range[1,]),abs(row.range[2,]))  #find the largest absolute similarity
diag(r.mat) <- 1

count=1



#master loop 

while (keep.clustering) {   #loop until we figure out we should stop
#find similiarities
 #we will do most of the work on a copy of the r.mat
#cluster.stats <- cluster.cor(clusters,r.mat,FALSE,SMC=ICLUST.options$SMC)   #deleted 30/12/13
cluster.stats <- cluster.cor(clusters,r.mat,FALSE,SMC=ICLUST.options$SMC,item.smc=smc.items)


sim.mat <- cluster.stats$cor    #the correlation matrix
diag(sim.mat) <- 0   #we don't want 1's on the diagonal to mess up the maximum 

#two ways to estimate reliability -- for 1 item clusters, max correlation, for >1, alpha
#this use of initial max should be an option
if (ICLUST.options$correct) {   #find the largest and smallest similarities for each variable      
	row.range <- apply(sim.mat,1,range,na.rm=TRUE)     
	row.max <- pmax(abs(row.range[1,]),abs(row.range[2,]))  #find the largest absolute similarity
   } else {row.max <- rep(1, nrow(sim.mat)) }         #don't correct for largest similarity
   
item.rel <- cluster.stats$alpha  
for (i in 1: length(item.rel)) { if (cluster.stats$size[i]<2) {
	item.rel[i] <- row.max[i]
	#figure out item betas here?
	}}
	
	
if(output>3)  print(sim.mat,digits=digits)


 #this is the corrected for maximum r similarities
if (ICLUST.options$correct) {
  sq.max <- diag(1/sqrt(item.rel))     #used to correct for reliabilities
  sim <- sq.max %*% sim.mat %*% sq.max   #this corrects for reliabilities but messes up the correlations of two item clusters with items
     } else {sim <- sim.mat}
     


diag(sim) <- NA                  #we need to not consider the diagonal when looking for maxima
#find the most similar pair    and apply tests if we should combine
test.alpha <- FALSE
test.beta <- FALSE

while(!(test.alpha&test.beta)){

max.cell <- which.max(sim)             #global maximum
if (length(max.cell) < 1) {
   keep.clustering <- FALSE
   break}   #there are no non-NA values left
sign.max <- 1
if ( ICLUST.options$reverse ) {            #normal case is to reflect if necessary
	min.cell <- which.min(sim)             #location of global minimum
	if (sim[max.cell] <  abs(sim[min.cell] )) { 
 	  	sign.max <- -1
		max.cell <- min.cell }
	if (sim[max.cell] < 0.0) {sign.max <- -1 }}  
	               #this is a weird case where all the similarities are negative  -- happens towards the end of clustering
	
max.col <- trunc(max.cell/nrow(sim))+1    #is in which row and column?
max.row <- max.cell - (max.col-1)*nrow(sim) #need to fix the case of first column
if (max.row < 1) {max.row <- nrow(sim)
 max.col <- max.col-1 }




size1 <- cluster.stats$size[max.row]
if(size1 < 2) {V1 <- 1
	beta1 <-  item.rel[max.row]
	alpha1 <- item.rel[max.row]
	rbar1 <-  item.rel[max.row]
	
    
	} else {
	rbar1 <- results[cluster.names[max.row],"rbar"]
	beta1 <- results[cluster.names[max.row],"beta"]
	alpha1 <- results[cluster.names[max.row],"alpha"]
	V1 <- size1 + size1*(size1-1) * rbar1
	}
	 
size2 <- cluster.stats$size[max.col]
if(size2 < 2) {V2 <- 1 
	beta2 <-  item.rel[max.col] 
	alpha2 <- item.rel[max.col] 
	rbar2 <-  item.rel[max.col]
	
	
	} else {
	
	rbar2 <- results[cluster.names[max.col],"rbar"]
	beta2 <- results[cluster.names[max.col],"beta"]
	alpha2 <- results[cluster.names[max.col],"alpha"] 
	V2 <- size2 + size2 * (size2-1) * rbar2}




Cov12 <- sign.max * sim.mat[max.cell] * sqrt(V1*V2) #this flips the sign of the correlation for negative correlations
r12 <- Cov12/(size1*size2)   #average between cluster r
V12 <- V1 + V2 + 2 * Cov12  #the variance of the new cluster

size12 <- size1 + size2
V12c <- (V12 - size12)*(size12/(size12-1))  #true variance (using the average r on the diagonal)
rbar <- V12c/(size12^2)
alpha <- V12c/V12

rmin <- min(r12,rbar2,rbar1)
#combine these two rows if the various criterion are passed
#beta.weighted <- size12^2 * sign.max *r12/V12   #this was added June, 2009 but can produce negative betas
beta.weighted <- size12^2 *r12/V12               #corrected July 28, 2009  
beta.unweighted <-  2* sign.max*sim.mat[max.cell]/(1+sign.max* sim.mat[max.cell])
beta.minimum <- size12^2 * rmin/V12     #added 6/12/24
# if(ICLUST.options$weighted){beta.combined <- beta.weighted} else {beta.combined <- beta.unweighted} 
 if(ICLUST.options$weighted=="min") ICLUST.options$weighted <- 2 
switch(ICLUST.options$weighted+1,
   {beta.combined <- beta.unweighted},
  {beta.combined <- beta.weighted},
 {beta.combined <-   beta.minimum }
   )

    
    #what is the correlation of this new cluster with the two subclusters?
#this considers item overlap problems
#There are actually two alternative solutions
#a) (cor.gen=TRUE)   finds the correlation due to a shared general factor 
#b) (cor.gen=FALSE)  finds the correlation for the general + group but remove the item overlap problem
#neither seems optimal, in that a) will correctly identify non-correlated clusters, but b) is less affected by small clusters.

if(ICLUST.options$cor.gen) {


c1 <-  r12 * size1 * size1 +  Cov12    #corrected covariance  
c2 <-  sign.max*(r12 * size2 * size2 +   Cov12) } else {

c1 <-  size1^2* rbar1 +   Cov12  
c2 <-  sign.max*(size2^2 *rbar2 +   Cov12) }


if((size1 < 2) && (size2  < 2)) { #r2 should be flipped if necessary -- r2 is always flipped (if necessary) when forming clusters 
       r1 <- sqrt(abs(rbar1))   #this  corrects for reliability in a two item cluster
       r2 <- sign.max* r1       #flips the sign if two are negatively correlated  -- in the case of two items 
                             } else {    #this next part corrects for item overlap as well as reliability of the subcluster
                                        
                                 if (ICLUST.options$correct.cluster) {   #correct is the default option
                                 if(TRUE)  {r1 <- c1/sqrt((V1 - size1  +size1 * rbar1) * V12) 
												       if (size2 < 2)  {
												       r2 <- c2/sqrt(abs(rbar2)*V12)}  else  {

												      # r2 <- sign.max * c2/sqrt((V2-size2 + size2 * rbar2)*V12c)}    #changed yet again on 6/10/10
												       r2 <-  c2/sqrt((V2-size2 + size2 * rbar2)*V12c)}
												    } else {
												      if(size1 < 2 )   {
												     
												      r1 <-  c1/sqrt(abs(rbar1)*V12)} else  { 
												       
												      r1 <- c1/sqrt((V1-size1 + size1 * rbar1)*V12c)  }  
                                     #flip the smaller of the two clusters  -- no, flip r2
                                                         if (size2 < 2) {r2 <- c2/sqrt(abs(rbar2)*V12)} else { r2 <-  c2/sqrt((V2-size2 + size2*rbar2)*V12c)}
												          #  r2 <-  c2/sqrt((V2-size2+size2*rbar2)*V12c) 
												          }
							         
							} else {if(TRUE) {r1 <- c1/sqrt(V1*V12)   #do not correct
												   r2 <- sign.max* c2/sqrt(V2*V12)
												    } else {
												        r1 <-sign.max* c1/sqrt(V1*V12)  } 
                                     #flip the smaller of the two clusters  - flip r2
												            r2 <-  c2/sqrt(V2*V12) }
							
						}
#test if we should combine these two clusters  
#first, does alpha increase?
test.alpha <- TRUE

if (ICLUST.options$alpha>0) { #should we apply the alpha test?
if (ICLUST.options$alpha.size < min(size1,size2)) {
  switch(ICLUST.options$alpha, {if (alpha < min(alpha1,alpha2)) {if (output>2) {print(
  paste ('do not combine ', cluster.names[max.row],"with", cluster.names[max.col],
  'new alpha =', alpha,'old alpha1 =', alpha1,"old alpha2 =",alpha2))}
												test.alpha <- FALSE }},
  {if (alpha < mean(alpha1,alpha2)) {if (output>2) {print(paste ('do not combine ',
  cluster.names[max.row],"with", cluster.names[max.col],'new alpha =',alpha,
  'old alpha1 =',alpha1,"old alpha2 =",alpha2))}
												test.alpha <- FALSE  }},
  {if (alpha < max(alpha1,alpha2)) {if (output>2) {print(paste ('do not combine ',
  cluster.names[max.row],"with", cluster.names[max.col],'new alpha =', alpha,
  'old alpha1 =',alpha1,"old alpha2 =",alpha2))}
												test.alpha <- FALSE  }}) #end switch  
  }   #end if options$alpha.size
  }

#second, does beta increase ?
test.beta <- TRUE          
if (ICLUST.options$beta>0) { #should we apply the beta test?
if (ICLUST.options$beta.size < min(size1,size2)) {
  switch(ICLUST.options$beta, {if (beta.combined < min(beta1,beta2)) {if (output>2) {print(
  paste ('do not combine ', cluster.names[max.row],"with", cluster.names[max.col],'new beta =',
  round (beta.combined,digits),'old beta1 =',round( beta1,digits),"old beta2 =",round(beta2,digits)))}
												test.beta <- FALSE }},
  {if (beta.combined < mean(beta1,beta2)) {if (output>2) {print(paste ('do not combine ', 
  cluster.names[max.row],"with", cluster.names[max.col],'new beta =', round (beta.combined,digits),
  'old beta1 =',round( beta1,digits),"old beta2 =",round(beta2,digits)))}
												test.beta <- FALSE  }},
  {if (beta.combined < max(beta1,beta2)) {if (output>2) {print(paste ('do not combine ',
  cluster.names[max.row],"with", cluster.names[max.col],'new beta =', round (beta.combined,digits),
  'old beta1 =',round( beta1,digits),"old beta2 =",round(beta2,digits)))}
												test.beta <- FALSE  }}) #end switch  
												
  }   #end if options$beta.size

}




if(test.beta & test.alpha) { break} else { #we have failed the combining criteria 

            if((ICLUST.options$n.clus > 0) & ((num.var - count ) >= ICLUST.options$n.clus) ) {warning ("Clusters formed as requested do not meet the alpha and beta criteria. Perhaps you should rethink the number of cluster settings.")
			                break } else {   
			if (beta.combined < ICLUST.options$beta.min) {
			
			keep.clustering <- FALSE       #the most similiar pair is not very similar, we should quit
			break}  else { 
			
			sim[max.row,max.col] <- NA 
		sim[max.col,max.row] <- NA }}
    }   #end of test.beta & test.alpha
    
}    #end of while test.alpha & test.beta.loop

#combine and summarize
if (keep.clustering) 
   {          # we have passed the alpha and beta tests, now combine these two variables
clusters[,max.row] <- clusters[,max.row] + sign.max *  clusters[,max.col] 

 
cluster.names <- colnames(clusters)

#summarize the results
results[count,1] <- cluster.names[max.row]
results[count,2] <- cluster.names[max.col]
results[count,"similarity"] <- sim[max.cell]
results[count,"correlation"] <- sim.mat[max.cell]
results[count,"alpha1"] <- item.rel[max.row]
results[count,"alpha2"] <- item.rel[max.col]
size1 <- cluster.stats$size[max.row]
size2 <- cluster.stats$size[max.col]
results[count,"size1"] <- size1
results[count,"size2"] <- size2
results[count,"beta1"] <-  beta1
results[count,"beta2"] <-  beta2
results[count,"rbar1"] <-  rbar1
results[count,"rbar2"] <-  rbar2
results[count,"r1"] <- r1
results[count,"r2"] <- r2


results[count,"beta"] <- beta.combined

results[count,'alpha'] <- alpha
results[count,'rbar'] <- rbar
results[count,"size"] <- size12

#update
cluster.names[max.row] <- paste("C",count,sep="")
colnames(clusters) <- cluster.names
clusters <- clusters[,-max.col,drop=FALSE]
cluster.names <- colnames(clusters)

#row.max <- row.max[-max.col]    



	}  #end of combine section

if(output > 1) print(results[count,],digits=digits)
count=count+1 
if ((num.var - count) < ICLUST.options$n.clus) {keep.clustering <- FALSE}
if(num.var - count < 1) {keep.clustering <- FALSE}   #only one cluster left
	
}  #end of keep clustering loop

#make clusters in the direction of the majority of the items
#direct <- -(colSums(clusters) < 0 ) 
#clusters <- t(diag(direct) %*% t(clusters))
#colnames(clusters) <- cluster.names

ICLUST.cluster <- list(results=results,clusters=clusters,number <- num.var - count)
}   # end ICLUST.cluster
#modified June 12, 2008 to  calculate the item-cluster correlation for cluster of size 2
#modified June 14, 2009 to find weighted or unweighted beta
#unweighted had been the default option before but it would seem that weighted makes more sense
#modified June 5, 2010 to correct the graphic tree paths