R/cluster.cor.R
In psych: Procedures for Psychological, Psychometric, and Personality Research

"cluster.cor" <-
#fixed 6/14/24 to allow the use of a keys list
#added  smc.items   30.12/13 to reduce the number of calls to SMC to once
function(keys,r.mat,correct=TRUE,SMC=TRUE,item.smc=NULL,impute=TRUE) { #function to extract clusters according to the key vector
				#default is to correct for attenuation and show this above the diagonal
				#find the correlation matrix of scales made up of items defined in a keys matrix (e.g., extracted by factor2cluster) 
                #takes as input the keys matrix as well as a correlation matrix of all the items
 tol=sqrt(.Machine$double.eps)    #machine accuracy
 cl <- match.call()
 if(!is.matrix(keys)) keys <- as.matrix(keys)  #keys are sometimes a data frame - must be a matrix
 if(any(is.na(r.mat))) {SMC=FALSE
                 warning("Missing values in the correlation matrix do not allow for SMC's to be found")}
 r.mat[is.na(r.mat)] <- -9999999    #changes missing values to obviously incorrect values
 if(SMC && is.null(item.smc)) {item.smc <- smc(r.mat)} else {item.smc <- rep(1,dim(r.mat)[1])}   #now done once in the main function
 
 covar <- t(keys) %*% r.mat %*% keys    #matrix algebra is our friend but slow if we are doing this in iclust
   #probably better to just modify the matrix for those two rows and columns that are changing if doing an iclust
 var <- diag(covar)    #these are the scale variances
 sd.inv <- 1/sqrt(var)
 ident.sd <- diag(sd.inv,ncol = length(sd.inv))
 cluster.correl <- ident.sd %*% covar  %*% ident.sd
 cluster.correl[abs(cluster.correl)  > (1+tol)] <- NA    #happens only if item correlations were missing -- we use 1+a little to avoid rounding problem
 key.var <- diag(t(keys) %*% keys)
 key.smc <- t(keys) %*% item.smc  
 key.alpha <- ((var-key.var)/var)*(key.var/(key.var-1))
 key.lambda6 <-  (var - key.var + key.smc)/var
 key.alpha[is.nan(key.alpha)] <- 1           #if only 1 variable to the cluster, then alpha is undefined
 key.alpha[!is.finite(key.alpha)] <- 1   
 key.av.r <- key.alpha/(key.var - key.alpha*(key.var-1))  #alpha 1 = average r
 colnames(cluster.correl) <- colnames(keys)
 rownames(cluster.correl) <- colnames(keys)
 names(key.lambda6) <- colnames(keys)
 key.lambda6 <- drop(key.lambda6)
 
# diag(r.mat) <- 0  
# row.range <- apply(r.mat,1,range,na.rm=TRUE)     
# row.max <- pmax(abs(row.range[1,]),abs(row.range[2,]))  #find the largest absolute similarity
#now, try to figure out the imputed correlation for the case of NAs.
if(any(is.na(cluster.correl)) && impute) { #find the missing values based upon average covariances rather than totals
#first, change those bad values back to NA
  warning('Some of the correlations were NA and were imputed')
  r.mat[r.mat < -1] <- NA
  n.keys <- ncol(keys)
  keys[keys==0] <- NA  #this will allow us to find average responses
  for (i in 1:n.keys) { 
     #first find which variables are screwed up
     if(any(is.na(cluster.correl[i,]))) {#fix them
        for (j in 1:n.keys) {if(is.na(cluster.correl[i,j])) {#fix it
        temp <- mean(colMeans((keys[,i] * r.mat),na.rm=TRUE) * keys[,j],na.rm=TRUE)*key.var[i]*key.var[j]  #this is the  average covariance times the number of items scored
        adjusted.r <- temp * ident.sd[i,i]* ident.sd[j,j]
        cluster.correl[i,j] <- adjusted.r
        }     
         }
  }
  } 
 }
 sn <- key.av.r * key.var/(1-key.av.r)
 if (correct) {cluster.corrected <- correct.cor(cluster.correl,t(key.alpha))
 result <- list(cor=cluster.correl,sd=sqrt(var),corrected= cluster.corrected,alpha=key.alpha,av.r = key.av.r,size=key.var,sn=sn,G6 =key.lambda6,Call=cl)
 }  #correct for attenuation
 else {
result <- list(cor=cluster.correl,sd=sqrt(var),alpha=key.alpha,av.r = key.av.r,size=key.var,sn=sn,G6 =key.lambda6,Call=cl)}
 class(result) <- c ("psych", "cluster.cor")
 return(result)}
#revised August 21, 2007 to add a smidgen to 1.0 in the looking for NAs.
#revised June 14, 2008 to add average.r
#revised August 25, 2009 to add lambda6
#revised December 2011 to remove digits -- this is all handled in the print function
#revised January 2012 to estimate values when we have missing values in the correlations
#74% of the time is spent doing matrix multiplication  70 seconds for a 400 x 400 problem.


"icluster.cor" <-
#added to speed up iclust by just combining rows

function(keys,r.mat,ivar,jvar,correct=TRUE,SMC=TRUE,item.smc=NULL,impute=TRUE) { #function to extract clusters according to the key vector
				#default is to correct for attenuation and show this above the diagonal
				#find the correlation matrix of scales made up of items defined in a keys matrix (e.g., extracted by factor2cluster) 
                #takes as input the keys matrix as well as a correlation matrix of all the items
 tol=sqrt(.Machine$double.eps)    #machine accuracy
 cl <- match.call()
 if(!is.matrix(keys)) keys <- as.matrix(keys)  #keys are sometimes a data frame - must be a matrix
 
 #covar <- t(keys) %*% r.mat %*% keys    #matrix algebra is our friend but slow if we are doing this in iclust
covar <- r.mat[-jvar,-jvar]   #drop the jth column and row
covar[ivar,] <- covar[,ivar] <- r.mat[ivar,] + r.mat[jvar,]
   #probably better to just modify the matrix for those two rows and columns that are changing if doing an iclust
 var <- diag(covar)    #these are the scale variances
 sd.inv <- 1/sqrt(var)
 ident.sd <- diag(sd.inv,ncol = length(sd.inv))
 cluster.correl <- ident.sd %*% covar  %*% ident.sd
 cluster.correl[abs(cluster.correl)  > (1+tol)] <- NA    #happens only if item correlations were missing -- we use 1+a little to avoid rounding problem
 key.var <- diag(t(keys) %*% keys)
 key.smc <- t(keys) %*% item.smc  
 key.alpha <- ((var-key.var)/var)*(key.var/(key.var-1))
 key.lambda6 <-  (var - key.var + key.smc)/var
 key.alpha[is.nan(key.alpha)] <- 1           #if only 1 variable to the cluster, then alpha is undefined
 key.alpha[!is.finite(key.alpha)] <- 1   
 key.av.r <- key.alpha/(key.var - key.alpha*(key.var-1))  #alpha 1 = average r
 colnames(cluster.correl) <- colnames(keys)
 rownames(cluster.correl) <- colnames(keys)
 names(key.lambda6) <- colnames(keys)
 key.lambda6 <- drop(key.lambda6)
 
# diag(r.mat) <- 0  
# row.range <- apply(r.mat,1,range,na.rm=TRUE)     
# row.max <- pmax(abs(row.range[1,]),abs(row.range[2,]))  #find the largest absolute similarity
#now, try to figure out the imputed correlation for the case of NAs.
if(any(is.na(cluster.correl)) && impute) { #find the missing values based upon average covariances rather than totals
#first, change those bad values back to NA
  warning('Some of the correlations were NA and were imputed')
  r.mat[r.mat < -1] <- NA
  n.keys <- ncol(keys)
  keys[keys==0] <- NA  #this will allow us to find average responses
  for (i in 1:n.keys) { 
     #first find which variables are screwed up
     if(any(is.na(cluster.correl[i,]))) {#fix them
        for (j in 1:n.keys) {if(is.na(cluster.correl[i,j])) {#fix it
        temp <- mean(colMeans((keys[,i] * r.mat),na.rm=TRUE) * keys[,j],na.rm=TRUE)*key.var[i]*key.var[j]  #this is the  average covariance times the number of items scored
        adjusted.r <- temp * ident.sd[i,i]* ident.sd[j,j]
        cluster.correl[i,j] <- adjusted.r
        }     
         }
  }
  } 
 }
 if (correct) {cluster.corrected <- correct.cor(cluster.correl,t(key.alpha))
 result <- list(cor=cluster.correl,sd=sqrt(var),corrected= cluster.corrected,alpha=key.alpha,av.r = key.av.r,size=key.var,G6 =key.lambda6,Call=cl)
 }  #correct for attenuation
 else {
result <- list(cor=cluster.correl,sd=sqrt(var),alpha=key.alpha,av.r = key.av.r,size=key.var,G6 =key.lambda6,Call=cl)}
 class(result) <- c ("psych", "cluster.cor")
 return(result)}
#revised August 21, 2007 to add a smidgen to 1.0 in the looking for NAs.
#revised June 14, 2008 to add average.r
#revised August 25, 2009 to add lambda6
#revised December 2011 to remove digits -- this is all handled in the print function
#revised January 2012 to estimate values when we have missing values in the correlations