R/corMatCCA.R

Defines functions corMatCCA

Documented in corMatCCA

#' Perform Canonical Correlation Analysis on a correlation matrix when no raw data is available.
#'
#' This function performs a Canonical Correlation Analysis on a correlation matrix. 
#' It assumes that the correlation matrix contains variable column names.
#' If no 'nObs' is supplied, no significance test will be performed.
#'
#' @param corMat A symmetric correlation matrix.
#' @param p The number of variables in the first set. Used to separate the sets.
#' @param set1name Name of the first set. Defaults to "set1".
#' @param set2name Name of the second set. Defaults to "set2".
#' @param useColumnNames Boolean that indicates whether to use the column names in the supplied correlation matrix. Defaults to True.
#' @param set1VarPrefix Prefix used for the variables in set 1. Ignored if useColumnNames=True and if all columns in the correlation matrix contain variable names. Defaults to "U".
#' @param set2VarPrefix Prefix used for the variables in set 2. Ignored if useColumnNames=True and if all columns in the correlation matrix contain variable names. Defaults to "V".
#' @param nObs Number of observations. Used for testing significance of the canonical variate pairs. 
#' @param alpha Significance level at which the test will be evaluated.
#' @return A list object with CCA information.
#' \itemize{
#'   \item corMat - The correlation matrix used for the analysis (unchanged).
#'   \item RCC - A list containing the raw correlation coefficients for both sets.
#'   \item SCC - A vector containing the sample canonical correlations between the variate pairs.
#'   \item R2 - A vector containing the squared sample canonical correlations between the variate pairs.
#'   \item prExVar - A dataframe containing the proportions of explained variance of the variate pairs.
#'   \item testRes - The result of the large sample test.
#'   \item additionalCorrelations - A list containing additional correlation matrices.
#' }
#' @export


#e10_5<-as.matrix(read.delim("C:/Users/parx/Documents/Statistics Master program/courses/Multivariate analysis/labs/Wichern_data/Wichern_data/E10-5.dat", header = T, sep=""), byrow=F, fill=T)
#e10_5[upper.tri(e10_5)]<-t(e10_5)[upper.tri(e10_5)] #complete the matrix
#corMat<-e10_5
#p<-5
#nObs<-784
#set1name<-"set1"
#set2name<-"set2"
#useColumnNames<-F
#set1VarPrefix<-"U"
#set2VarPrefix<-"V"
#alpha<-0.05

corMatCCA<-function(corMat, p, set1name="set1", set2name="set2", useColumnNames=T, set1VarPrefix="U", set2VarPrefix="V", nObs=NA, alpha=0.05){

  q<-ncol(corMat)-p
  n<-min(p,q)
  
  if (length(colnames(corMat)) != ncol(corMat) | useColumnNames==F) {
    set1varNames<-sprintf(paste(set1VarPrefix, "%d", sep=""), 1:p)
    set2varNames<-sprintf(paste(set2VarPrefix, "%d", sep=""), 1:q)
  }
  else {
    set1varNames<-colnames(corMat[,1:p])
    set2varNames<-colnames(corMat[,(p+1):ncol(corMat)])
  }
  
  R11<-corMat[1:p, 1:p]
  R21<-corMat[(p+1):ncol(corMat), 1:p]
  R12<-corMat[1:p, (p+1):ncol(corMat)]
  R22<-corMat[(p+1):ncol(corMat), (p+1):ncol(corMat)]
  
  eigDec_R11<-eigen(R11)
  eigDec_R22<-eigen(R22)
  R11_nsqrt<-eigDec_R11$vectors %*% diag( 1/ sqrt( eigDec_R11$values) ) %*% t(eigDec_R11$vectors)
  R22_nsqrt<-eigDec_R22$vectors %*% diag( 1/ sqrt( eigDec_R22$values) ) %*% t(eigDec_R22$vectors)
  eigDec1<-eigen(R11_nsqrt%*%R12%*%solve(R22)%*%R21%*%R11_nsqrt)
  eigDec2<-eigen(R22_nsqrt%*%R21%*%solve(R11)%*%R12%*%R22_nsqrt)
  
  A<-t(eigDec1$vectors)%*%R11_nsqrt
  RCC_set1<-t(A[1:n,])
  colnames(RCC_set1)<-sprintf("U%d",1:n)
  rownames(RCC_set1)<-set1varNames
  A_inv<-solve(A)
  
  SCC=sqrt(eigDec1$values[1:n])
  R2<-eigDec1$values[1:n]
  
  prExpU<-c()
  prExpU_cum<-c()
  for(i in 1:n) {
    prExpU<-c(prExpU, round(sum(diag(A_inv[,i]%*%t(A_inv[,i])))/p, 6)) # proportion explained by Un
    prExpU_cum<-c(prExpU_cum, round(sum(prExpU),6)) #cumulative proportion explained by Un
  }
  
  B<-t(eigDec2$vectors)%*%R22_nsqrt
  RCC_set2<-t(B[1:n,])
  colnames(RCC_set2)<-sprintf("V%d",1:n)
  rownames(RCC_set2)<-set2varNames
  B_inv<-solve(B)
  
  prExpV<-c()
  prExpV_cum<-c()
  for(i in 1:n){
    prExpV<-c(prExpV, round(sum(diag(B_inv[,i]%*%t(B_inv[,i])))/q, 6)) # proportion explained by Vn
    prExpV_cum<-c(prExpV_cum, sum(prExpV)) # cumulative proportion explained by Vn
  }
  
  prExVar<-data.frame(c(rep(set1name, n), rep(set2name,n)), c(sprintf("U%d",1:n), sprintf("V%d",1:n)), c(prExpU, prExpV), c(prExpU_cum, prExpV_cum))
  colnames(prExVar)<-c("set", "canonical variate", "proportion explained", "cumulative")
  rownames(prExVar)<-paste(prExVar[,1], "_", prExVar[,2], sep="")
  
  #large sample test:
  #nObs<-500
  if(is.numeric(nObs)){
    testHypothesis<-c()
    testVariate<-c()
    testStat<-c()
    testDF<-c()
    testPval<-c()
    for(i in 1:n){
      H0<-paste(paste(sprintf("p%d = ",i:n), collapse = ""), "0", sep="")
      v <- i
      s <- -(nObs-1-(1/2)*(p+q+1))*log(prod(1-eigDec1$values[i:p]))
      df <- (p-(i-1))*(q-(i-1))
      pVal<-pchisq(q=s, df=df, lower.tail=FALSE)
      testHypothesis<-c(testHypothesis, H0)
      testVariate<-c(testVariate, v)
      testStat<-c(testStat, s)
      testDF<-c(testDF, df)
      testPval<-c(testPval, pVal)
    }
    testRes<-data.frame(testHypothesis, testVariate, SCC, testStat, testDF, testPval, testPval<=alpha)
    colnames(testRes)<-c("H0", "Canonical variate pair", "Sample canonical correlation", "Large sample test statistic", "DF", "P-value", paste("Reject H0 at ", alpha*100, "% level with n=", nObs,  "."))
    
  }
  else {
    testRes<-"Number of observations not applicable!"
  }
  
  R_ux1<-Matrix::t(round(A%*%R11%*%Matrix::Diagonal(p, diag(R11)^(-1/2)),4))[,1:n]
  R_vx2<-Matrix::t(round(B%*%R22%*%Matrix::Diagonal(q, diag(R22)^(-1/2)),4))[,1:n]
  R_ux2<-Matrix::t(round(A%*%R12%*%Matrix::Diagonal(q, diag(R22)^(-1/2)),4))[,1:n]
  R_vx1<-Matrix::t(round(B%*%R21%*%Matrix::Diagonal(p, diag(R11)^(-1/2)),4))[,1:n]
  rownames(R_ux1)<-set1varNames
  colnames(R_ux1)<-sprintf("U%d",1:n)
  rownames(R_vx2)<-set2varNames
  colnames(R_vx2)<-sprintf("V%d",1:n)
  rownames(R_ux2)<-set2varNames
  colnames(R_ux2)<-sprintf("U%d",1:n)
  rownames(R_vx1)<-set1varNames
  colnames(R_vx1)<-sprintf("V%d",1:n)
  
  #sqrt(eigDec1$values[1:n])
  return(list(corMat=corMat, setNames=c(set1name, set2name), RCC=list(RCC_set1=RCC_set1, RCC_set2=RCC_set2), SCC=SCC, R2=R2, prExVar=prExVar, testRes=testRes, additionalCorrelations=list(R_ux1=R_ux1, R_vx2=R_vx2, R_ux2=R_ux2, R_vx1=R_vx1)))
}
parhammarstrom/multivariateAnalysisVT2021 documentation built on March 22, 2021, 4:52 p.m.