R/ICLUST.cluster.R

ICLUST.cluster <- function (r.mat,ICLUST.options,smc.items) {#should allow for raw data, correlation or covariances

#options:  alpha =1  (minimum alpha)  2 (average alpha)    3 (maximum alpha)
#          beta =1   (minimum beta)   2 (average beta)     3 (maximum beta)
#          correct  for reliability
#          reverse score items if negative correlations
#          stop clustering if beta for new clusters < beta.min
#          output =1 (short)    2  (show steps)     3 show rejects as we go
#          
#initialize  various arrays and get ready for the first pass
output <- ICLUST.options$output

num.var <- nrow(r.mat)
keep.clustering <- TRUE          #used to determine when we are finished clustering
results <- data.frame(matrix(rep(0,18*(num.var-1)),ncol=18))   #create the data frame for the results
#results <-  matrix(rep(0,18*(num.var-1)),ncol=18)   #use a matrix for speed but we can not address it by name
names(results) <- c("Item/Cluster", "Item/Clust","similarity","correlation","alpha1","alpha2",
"beta1","beta2","size1","size2","rbar1","rbar2","r1","r2","alpha","beta","rbar","size")
rownames(results) <- paste("C",1:(num.var-1),sep="")
digits <- ICLUST.options$digits
clusters <- diag(1,nrow =nrow(r.mat))    #original cluster structure is 1 item clusters
if(is.null(rownames(r.mat))) {rownames(r.mat) <-  paste("V",1:num.var,sep="") }
rownames(clusters) <- rownames(r.mat)
colnames(clusters) <- paste("V",1:num.var,sep="")
diag(r.mat) <- 0
row.range <- apply(r.mat,1,range,na.rm=TRUE)     
item.max<- pmax(abs(row.range[1,]),abs(row.range[2,]))  #find the largest absolute similarity
diag(r.mat) <- 1

count=1



#master loop 

while (keep.clustering) {   #loop until we figure out we should stop
#find similiarities
 #we will do most of the work on a copy of the r.mat
#cluster.stats <- cluster.cor(clusters,r.mat,FALSE,SMC=ICLUST.options$SMC)   #deleted 30/12/13
cluster.stats <- cluster.cor(clusters,r.mat,FALSE,SMC=ICLUST.options$SMC,item.smc=smc.items)


sim.mat <- cluster.stats$cor    #the correlation matrix
diag(sim.mat) <- 0   #we don't want 1's on the diagonal to mess up the maximum 

#two ways to estimate reliability -- for 1 item clusters, max correlation, for >1, alpha
#this use of initial max should be an option
if (ICLUST.options$correct) {   #find the largest and smallest similarities for each variable      
	row.range <- apply(sim.mat,1,range,na.rm=TRUE)     
	row.max <- pmax(abs(row.range[1,]),abs(row.range[2,]))  #find the largest absolute similarity
   } else {row.max <- rep(1, nrow(sim.mat)) }         #don't correct for largest similarity
   
item.rel <- cluster.stats$alpha  
for (i in 1: length(item.rel)) { if (cluster.stats$size[i]<2) {
	item.rel[i] <- row.max[i]
	#figure out item betas here?
	}}
	
	
if(output>3)  print(sim.mat,digits=digits)


 #this is the corrected for maximum r similarities
if (ICLUST.options$correct) {
  sq.max <- diag(1/sqrt(item.rel))     #used to correct for reliabilities
  sim <- sq.max %*% sim.mat %*% sq.max   #this corrects for reliabilities but messes up the correlations of two item clusters with items
     } else {sim <- sim.mat}
     


diag(sim) <- NA                  #we need to not consider the diagonal when looking for maxima
#find the most similar pair    and apply tests if we should combine
test.alpha <- FALSE
test.beta <- FALSE

while(!(test.alpha&test.beta)){

max.cell <- which.max(sim)             #global maximum
if (length(max.cell) < 1) {
   keep.clustering <- FALSE
   break}   #there are no non-NA values left
sign.max <- 1
if ( ICLUST.options$reverse ) {            #normal case is to reflect if necessary
	min.cell <- which.min(sim)             #location of global minimum
	if (sim[max.cell] <  abs(sim[min.cell] )) { 
 	  	sign.max <- -1
		max.cell <- min.cell }
	if (sim[max.cell] < 0.0) {sign.max <- -1 }}  
	               #this is a weird case where all the similarities are negative  -- happens towards the end of clustering
	
max.col <- trunc(max.cell/nrow(sim))+1    #is in which row and column?
max.row <- max.cell - (max.col-1)*nrow(sim) #need to fix the case of first column
if (max.row < 1) {max.row <- nrow(sim)
 max.col <- max.col-1 }




size1 <- cluster.stats$size[max.row]
if(size1 < 2) {V1 <- 1
	beta1 <-  item.rel[max.row]
	alpha1 <- item.rel[max.row]
	rbar1 <-  item.rel[max.row]
	
    
	} else {
	rbar1 <- results[cluster.names[max.row],"rbar"]
	beta1 <- results[cluster.names[max.row],"beta"]
	alpha1 <- results[cluster.names[max.row],"alpha"]
	V1 <- size1 + size1*(size1-1) * rbar1
	}
	 
size2 <- cluster.stats$size[max.col]
if(size2 < 2) {V2 <- 1 
	beta2 <-  item.rel[max.col] 
	alpha2 <- item.rel[max.col] 
	rbar2 <-  item.rel[max.col]
	
	
	} else {
	
	rbar2 <- results[cluster.names[max.col],"rbar"]
	beta2 <- results[cluster.names[max.col],"beta"]
	alpha2 <- results[cluster.names[max.col],"alpha"] 
	V2 <- size2 + size2 * (size2-1) * rbar2}




Cov12 <- sign.max * sim.mat[max.cell] * sqrt(V1*V2) #this flips the sign of the correlation for negative correlations
r12 <- Cov12/(size1*size2)   #average between cluster r
V12 <- V1 + V2 + 2 * Cov12  #the variance of the new cluster

size12 <- size1 + size2
V12c <- (V12 - size12)*(size12/(size12-1))  #true variance (using the average r on the diagonal)
rbar <- V12c/(size12^2)
alpha <- V12c/V12


#combine these two rows if the various criterion are passed
#beta.weighted <- size12^2 * sign.max *r12/V12   #this was added June, 2009 but can produce negative betas
beta.weighted <- size12^2 *r12/V12               #corrected July 28, 2009  
beta.unweighted <-  2* sign.max*sim.mat[max.cell]/(1+sign.max* sim.mat[max.cell])
if(ICLUST.options$weighted) {beta.combined <- beta.weighted} else {beta.combined <- beta.unweighted}  



#what is the correlation of this new cluster with the two subclusters?
#this considers item overlap problems
#There are actually two alternative solutions
#a) (cor.gen=TRUE)   finds the correlation due to a shared general factor 
#b) (cor.gen=FALSE)  finds the correlation for the general + group but remove the item overlap problem
#neither seems optimal, in that a) will correctly identify non-correlated clusters, but b) is less affected by small clusters.

if(ICLUST.options$cor.gen) {


c1 <-  r12 * size1 * size1 +  Cov12    #corrected covariance  
c2 <-  sign.max*(r12 * size2 * size2 +   Cov12) } else {

c1 <-  size1^2* rbar1 +   Cov12  
c2 <-  sign.max*(size2^2 *rbar2 +   Cov12) }


if((size1 < 2) && (size2  < 2)) { #r2 should be flipped if necessary -- r2 is always flipped (if necessary) when forming clusters 
       r1 <- sqrt(abs(rbar1))   #this  corrects for reliability in a two item cluster
       r2 <- sign.max* r1       #flips the sign if two are negatively correlated  -- in the case of two items 
                             } else {    #this next part corrects for item overlap as well as reliability of the subcluster
                                        
                                 if (ICLUST.options$correct.cluster) {   #correct is the default option
                                 if(TRUE)  {r1 <- c1/sqrt((V1 - size1  +size1 * rbar1) * V12) 
												       if (size2 < 2)  {
												       r2 <- c2/sqrt(abs(rbar2)*V12)}  else  {

												      # r2 <- sign.max * c2/sqrt((V2-size2 + size2 * rbar2)*V12c)}    #changed yet again on 6/10/10
												       r2 <-  c2/sqrt((V2-size2 + size2 * rbar2)*V12c)}
												    } else {
												      if(size1 < 2 )   {
												     
												      r1 <-  c1/sqrt(abs(rbar1)*V12)} else  { 
												       
												      r1 <- c1/sqrt((V1-size1 + size1 * rbar1)*V12c)  }  
                                     #flip the smaller of the two clusters  -- no, flip r2
                                                         if (size2 < 2) {r2 <- c2/sqrt(abs(rbar2)*V12)} else { r2 <-  c2/sqrt((V2-size2 + size2*rbar2)*V12c)}
												          #  r2 <-  c2/sqrt((V2-size2+size2*rbar2)*V12c) 
												          }
							         
							} else {if(TRUE) {r1 <- c1/sqrt(V1*V12)   #do not correct
												   r2 <- sign.max* c2/sqrt(V2*V12)
												    } else {
												        r1 <-sign.max* c1/sqrt(V1*V12)  } 
                                     #flip the smaller of the two clusters  - flip r2
												            r2 <-  c2/sqrt(V2*V12) }
							
						}
#test if we should combine these two clusters  
#first, does alpha increase?
test.alpha <- TRUE

if (ICLUST.options$alpha>0) { #should we apply the alpha test?
if (ICLUST.options$alpha.size < min(size1,size2)) {
  switch(ICLUST.options$alpha, {if (alpha < min(alpha1,alpha2)) {if (output>2) {print(
  paste ('do not combine ', cluster.names[max.row],"with", cluster.names[max.col],
  'new alpha =', alpha,'old alpha1 =', alpha1,"old alpha2 =",alpha2))}
												test.alpha <- FALSE }},
  {if (alpha < mean(alpha1,alpha2)) {if (output>2) {print(paste ('do not combine ',
  cluster.names[max.row],"with", cluster.names[max.col],'new alpha =',alpha,
  'old alpha1 =',alpha1,"old alpha2 =",alpha2))}
												test.alpha <- FALSE  }},
  {if (alpha < max(alpha1,alpha2)) {if (output>2) {print(paste ('do not combine ',
  cluster.names[max.row],"with", cluster.names[max.col],'new alpha =', alpha,
  'old alpha1 =',alpha1,"old alpha2 =",alpha2))}
												test.alpha <- FALSE  }}) #end switch  
  }   #end if options$alpha.size
  }

#second, does beta increase ?
test.beta <- TRUE          
if (ICLUST.options$beta>0) { #should we apply the beta test?
if (ICLUST.options$beta.size < min(size1,size2)) {
  switch(ICLUST.options$beta, {if (beta.combined < min(beta1,beta2)) {if (output>2) {print(
  paste ('do not combine ', cluster.names[max.row],"with", cluster.names[max.col],'new beta =',
  round (beta.combined,digits),'old beta1 =',round( beta1,digits),"old beta2 =",round(beta2,digits)))}
												test.beta <- FALSE }},
  {if (beta.combined < mean(beta1,beta2)) {if (output>2) {print(paste ('do not combine ', 
  cluster.names[max.row],"with", cluster.names[max.col],'new beta =', round (beta.combined,digits),
  'old beta1 =',round( beta1,digits),"old beta2 =",round(beta2,digits)))}
												test.beta <- FALSE  }},
  {if (beta.combined < max(beta1,beta2)) {if (output>2) {print(paste ('do not combine ',
  cluster.names[max.row],"with", cluster.names[max.col],'new beta =', round (beta.combined,digits),
  'old beta1 =',round( beta1,digits),"old beta2 =",round(beta2,digits)))}
												test.beta <- FALSE  }}) #end switch  
												
  }   #end if options$beta.size

}




if(test.beta & test.alpha) { break} else { #we have failed the combining criteria 

            if((ICLUST.options$n.clus > 0) & ((num.var - count ) >= ICLUST.options$n.clus) ) {warning ("Clusters formed as requested do not meet the alpha and beta criteria. Perhaps you should rethink the number of cluster settings.")
			                break } else {   
			if (beta.combined < ICLUST.options$beta.min) {
			
			keep.clustering <- FALSE       #the most similiar pair is not very similar, we should quit
			break}  else { 
			
			sim[max.row,max.col] <- NA 
		sim[max.col,max.row] <- NA }}
    }   #end of test.beta & test.alpha
    
}    #end of while test.alpha & test.beta.loop

#combine and summarize
if (keep.clustering) 
   {          # we have passed the alpha and beta tests, now combine these two variables
clusters[,max.row] <- clusters[,max.row] + sign.max *  clusters[,max.col] 

 
cluster.names <- colnames(clusters)

#summarize the results
results[count,1] <- cluster.names[max.row]
results[count,2] <- cluster.names[max.col]
results[count,"similarity"] <- sim[max.cell]
results[count,"correlation"] <- sim.mat[max.cell]
results[count,"alpha1"] <- item.rel[max.row]
results[count,"alpha2"] <- item.rel[max.col]
size1 <- cluster.stats$size[max.row]
size2 <- cluster.stats$size[max.col]
results[count,"size1"] <- size1
results[count,"size2"] <- size2
results[count,"beta1"] <-  beta1
results[count,"beta2"] <-  beta2
results[count,"rbar1"] <-  rbar1
results[count,"rbar2"] <-  rbar2
results[count,"r1"] <- r1
results[count,"r2"] <- r2


results[count,"beta"] <- beta.combined

results[count,'alpha'] <- alpha
results[count,'rbar'] <- rbar
results[count,"size"] <- size12

#update
cluster.names[max.row] <- paste("C",count,sep="")
colnames(clusters) <- cluster.names
clusters <- clusters[,-max.col]
cluster.names <- colnames(clusters)

#row.max <- row.max[-max.col]    



	}  #end of combine section

if(output > 1) print(results[count,],digits=digits)
count=count+1 
if ((num.var - count) < ICLUST.options$n.clus) {keep.clustering <- FALSE}
if(num.var - count < 1) {keep.clustering <- FALSE}   #only one cluster left
	
}  #end of keep clustering loop

#make clusters in the direction of the majority of the items
#direct <- -(colSums(clusters) < 0 ) 
#clusters <- t(diag(direct) %*% t(clusters))
#colnames(clusters) <- cluster.names

ICLUST.cluster <- list(results=results,clusters=clusters,number <- num.var - count)
}   # end ICLUST.cluster
#modified June 12, 2008 to  calculate the item-cluster correlation for cluster of size 2
#modified June 14, 2009 to find weighted or unweighted beta
#unweighted had been the default option before but it would seem that weighted makes more sense
#modified June 5, 2010 to correct the graphic tree paths

Try the psych package in your browser

Any scripts or data that you put into this service are public.

psych documentation built on Sept. 26, 2023, 1:06 a.m.