R/cfa.r

Defines functions scfa mcfa cfa hcfa print.scfa print.mcfa print.hcfa plot.scfa plot.mcfa plot.hcfa bcfa print.bcfa plot.bcfa

Documented in bcfa cfa hcfa mcfa plot.bcfa plot.hcfa plot.mcfa plot.scfa print.bcfa print.hcfa print.mcfa print.scfa scfa

# cfa 0.9.3 

# Original part by Funke

# Note: If the argument is a dataframe, call as cfa(df[1:3],df[4])
# if it is a matrix, call as cfa(df[,1:3],df[,4])


scfa<-function(cfg,cnt=NA,sorton="chisq",sort.descending=TRUE,
              format.labels=TRUE)
   {
     if(is.na(cnt)[1]) cnt<-as.vector(rep(1,nrow(cfg)))
     if(!is.data.frame(cfg)) cfg<-as.data.frame(cfg)
     if (!is.data.frame(cnt)) cnt<-as.data.frame(cnt) 
     cnt<-unlist(cnt) 
     d<-aggregate(cnt,by=cfg,sum,na.rm=TRUE) 
     counts<-d[,ncol(d)] 
     n<-sum(cnt)
     configs<-ncol(cfg) 
     if (configs<2) stop("At least two configuration variables required!")
     if (format.labels==TRUE)
       {
         for (i in 1:configs) levels(d[,i])<-format(levels(d[,i]))
       }
     sums<-matrix(data=NA,nrow=nrow(d),ncol=configs)
     n.levels<-vector(mode="numeric",length=configs)
     string<-do.call("paste",d[,1:configs]) 
     strings<-strsplit(string,"\" *\"*")
     for (i in 1:configs)
       {
          if (!is.factor(d[,i])) d[,i]<-as.factor(d[,i]) 
          tbl<-aggregate(counts,d[i],sum)  
          n.levels[i]<-nrow(tbl)          
          dimnames(tbl)[1]<-tbl[1] 
#         handle "":          
          if (dimnames(tbl)[1][[1]][1]=="") dimnames(tbl)[1][[1]][1]<-"NA" 
          cidx<-as.character(d[,i])
          cidx[cidx==""]<-"NA" 
          sums[,i]<-tbl[cidx,2]
       }
     expected<-apply(sums,1,prod)/n^(configs-1)
     chisq<-(counts-expected)^2/expected 
     sortidx<-order(chisq)
     if (sorton=="chisq") sortidx<-order(chisq)
     if (sorton=="n") sortidx<-order(counts) 
     if (sorton=="label") sortidx<-order(unlist(strings))
     if (sort.descending==TRUE) sortidx<-rev(sortidx) 
     colnames(sums)<-names(cfg)
     rownames(sums)<-strings 
     reslist<-list(labels=unlist(strings[sortidx]),
                   n.levels=n.levels,
                   sums=sums[sortidx,],
                   counts=counts[sortidx],     
                   expected=expected[sortidx],
                   chisq=chisq[sortidx])
     reslist 
   }                                          


mcfa<-function(cfg,cnts,sorton="chisq",sort.descending=TRUE,
              format.labels=TRUE)
   {
     if(!is.data.frame(cfg)) cfg<-as.data.frame(cfg)
     configs<-ncol(cfg)
     if (configs<2) stop("At least two configuration variables required!")
     if (ncol(cnts)<2) stop("Only one column of counts - use cfa instead!")
     if (ncol(cnts)>2)  {options("warn"=-1)}
     if (format.labels==TRUE)
       {
         for (i in 1:configs) levels(cfg[,i])<-format(levels(cfg[,i]))
       }
     string<-do.call("paste",cfg[,1:configs]) 
     strings<-strsplit(string,"\" *\"*")
     n<-sum(cnts) 
     colsums<-apply(cnts,1,sum)
     rowsums<-apply(cnts,2,sum) 
     nsmat<-matrix(data=rep(rowsums,ncol(cnts)),nrow=nrow(cfg),ncol=ncol(cnts),byrow=TRUE)
     expected<-colsums*nsmat/n
     chisq<-apply((cnts-expected)^2/expected,1,sum)
     sortidx<-order(chisq)
     if (sorton=="chisq") sortidx<-order(chisq)
     if (sorton=="label") sortidx<-order(unlist(strings))
     if (sort.descending==TRUE) sortidx<-rev(sortidx)
     reslist<-list(labels=unlist(strings[sortidx]),
               counts=cnts[sortidx,],     
               expected=expected[sortidx,],
               chisq=chisq[sortidx]) 
     if (ncol(cnts)>2)  {options("warn"=0)}
     reslist 
   }

cfa<-function(cfg,cnts=NA,
              sorton="chisq",
              sort.descending=TRUE,
              format.labels=TRUE,
              casewise.delete.empty=TRUE,
              binom.test=FALSE,
              exact.binom.test=FALSE,
              exact.binom.limit=10,
              perli.correct=FALSE,
              lehmacher=FALSE,
              lehmacher.corr=TRUE,
              alpha=0.05,
              bonferroni=TRUE
              )
  {
     # helper function for binomial coefficients
     bin.coeff<-function(n,k) exp(lgamma(n+1)-lgamma(k+1)-lgamma(n-k+1))
     
     # helper function for binomial test
     bin.test<-function(ni,ep,ntotal) # inexact for large numbers (>10) !
       {
         sum.p<-0
         for (j in ni:ntotal)
           {
             sum.p<-sum.p+((ep^j)*(1.0-ep)^(ntotal-j)*bin.coeff(ntotal,j) )
           }
         sum.p
       }
   
     # helper function for z-test
     z.test<-function(ni,ep,ntotal)
       {
         num<-(ni-(ntotal*ep))
         den<-sqrt(ntotal*ep*(1.0-ep))
         if ((ni[1]*ep[1])<=10.0) num<-num-0.5 # continuity correction
         num/den
       }  
     # The order of the following checks is important
     if (is.null(dim(cfg)))
       stop("cfg is probably part of a matrix! Use cfg[,1:n] rather than cfg[1:n]!") 
     if (is.na(cnts)[1]) cnts<-rep(1,nrow(cfg))
     if (length(cnts)==1 && !is.data.frame(cnts))
       stop("cnts is probably part of a matrix! Use cnts[,n] rather than cnts[n]!")

     if (casewise.delete.empty)  # delete cases with "" and NA in any variable
        {
           cfg<-as.data.frame(cfg)
           idx<-rep(TRUE,nrow(cfg)) 
           for (i in 1:ncol(cfg))
             { 
               interim<-idx & (factor(cfg[,i])!="") & (!is.na(cfg[,i]))
               idx<-interim 
             } 
           cfg<-cfg[idx,]
           if (is.null(dim(cnts))) 
             {
               cnts<-cnts[idx]
               ncnts<-1
             }
           else 
             {
               cnts<-cnts[idx,]
               ncnts<-ncol(cnts) 
             }
        } 
     ncases<-nrow(cfg)
     ntotal<-sum(cnts,na.rm=TRUE)
     if (!is.null(nrow(cnts)) && (nrow(cfg)!=nrow(cnts)))
        stop("Configurations and counts must have the same numer of cases!") 
     nconfigs<-ncol(cfg)
     n<-sum(cnts,na.rm=TRUE)
     multicfa<-FALSE
     if (!is.null(dim(cnts))) multicfa<-TRUE 

     if (multicfa==FALSE)  # ---------- One sample CFA
      {
        res<-scfa(cfg,cnts,sorton,sort.descending,format.labels)
        bivariate<-TRUE        
        if (any(res$n.levels>2)) bivariate<-FALSE
        configs<-length(res$chisq) 
        if (bivariate==TRUE)
          {
            df<-(2^nconfigs-nconfigs-1)    # (KL27&57)
            dfcell<-1                      # (KL 26)
          }
        if (bivariate==FALSE)
         {

           df<-prod(res$n.levels)-sum(res$n.levels)+(nconfigs-1)  # (LW96, Korrektur Gruener)
           dfcell<-prod(res$n.levels-1)                           # richtig?
         }    
       if (bivariate==TRUE)  
              z<-z.test(res$counts,res$expected/n,n)
       else
              z<-sqrt(res$chisq)
       pz<-1.0-pnorm(z)
       nz<-length(z)
       if (bonferroni==FALSE) nz<-1 # no adjustment 
       pzsig<-((1.0-pnorm(z,0,1))<(alpha/nz) | 
                 (pnorm(z,0,1)<(alpha/nz)))  # z: Type
       sig.chisq<-(1.0-pchisq(res$chisq,dfcell))<(alpha/nz)  # Chi^2: Type or antitype
       totalchisq<-sum(res$chisq,na.rm=TRUE)
       p.total.chisq<-1.0-pchisq(totalchisq,dfcell)
       if (binom.test==TRUE)   # LW 43 Approximate test for large n and "not too low" p
         { 
            z.bin<-(res$counts-res$expected)/sqrt(res$expected*(1-res$expected/n))
            low.limit<-qnorm(alpha/2)  # Binom: Type or antitype
            hi.limit<-qnorm(1.0-alpha/2)
            sig.bin<-((z.bin<low.limit) | (z.bin>hi.limit))
         } 
       if (exact.binom.test==TRUE && bivariate==TRUE)
         {
            p.exact.bin<-rep(NA,ncases)
            for (i in 1:ncases)
              {
                if (res$counts[i]<=exact.binom.limit) # Binom exact: Type
                  p.exact.bin[i]<-bin.test(res$counts[i],res$expected[i]/n,n)
              }
         }
       if (perli.correct==TRUE) # LW 42
          {
             numerator.corr<-rep(0,length(res$counts)) 
             idx<-((res$counts>5) & (res$counts<=10) & (res$counts-res$expected>0)) 
             numerator.corr[idx]<-(-0.5) 
             idx<-((res$counts>5) & (res$counts<=10) & (res$counts-res$expected<=0)) 
             numerator.corr[idx]<-0.5 
             x.perli<-(res$counts-res$expected+numerator.corr)/sqrt(res$expected) 
             low.limit<-qnorm(alpha/2)  # Perli z: Type or antitype
             hi.limit<-qnorm(1.0-alpha/2)
             sig.perli<-((x.perli<low.limit) | (x.perli>hi.limit))
          }
       if (lehmacher==TRUE)  # LW 44
         {
           p18<-apply((res$sums-1),1,prod)/((n-1)^nconfigs)
           pn<-apply(res$sums,1,prod)/(n^nconfigs)
           vl<-n*pn*(1-pn-(n-1)*(pn-p18))
           zl<-(res$counts-res$expected)/sqrt(vl) 
           # Kuechhoff's correction
           corr<-rep(0,length(res$counts)) 
           idx<-res$counts>res$expected
           corr[idx]<--0.5
           idx<-res$counts<res$expected
           corr[idx]<-0.5 
           zl.corr<-(res$counts-res$expected+corr)/sqrt(vl)
           low.limit<-qnorm(alpha/2)  # 2 sided test (type OR antitype)
           hi.limit<-qnorm(1.0-alpha/2)
           sig.zl<-((zl<low.limit) | (zl>hi.limit)) 
           sig.zl.corr<-((zl.corr<low.limit) | (zl.corr>hi.limit)) 
         } 
       q<-2*abs(res$counts-res$expected)/(n+abs(2*res$expected-n)) # Q: KL p. 34  
       # Table
       resdataframe<-data.frame(label=res$labels,
                                n=res$counts,
                                expected=res$expected,
                                Q=q, 
                                chisq=res$chisq,
                                p.chisq=1-pchisq(res$chisq,dfcell),
                                sig.chisq=sig.chisq,
                                z=z,
                                p.z=pz,
                                sig.z=pzsig)
      if (perli.correct==TRUE) resdataframe<-cbind(resdataframe,x.perli,sig.perli)                                
      if (lehmacher==TRUE && lehmacher.corr==FALSE) resdataframe<-cbind(resdataframe,zl,sig.zl) 
      if (lehmacher==TRUE && lehmacher.corr==TRUE) resdataframe<-cbind(resdataframe,zl.corr,sig.zl.corr)
      if (binom.test==TRUE) resdataframe<-cbind(resdataframe,z.bin,sig.bin)
      if (exact.binom.test==TRUE && bivariate==TRUE) resdataframe<-cbind(resdataframe,p.exact.bin)
      # Summary stats                                
       resvector<-c(totalchisq, df,p.total.chisq,n)
       names(resvector)<-c("totalchisq","df","p","sum of counts")
       # Levels of all configs
       reslevels<-res$n.levels 
       names(reslevels)<-colnames(cfg)
       res<-list(table=resdataframe,summary.stats=resvector,levels=reslevels)
       class(res)<-"scfa"
     }
      

     if (multicfa==TRUE)   # ----------- Multi sample CFA
      {
        res<-mcfa(cfg,cnts,sorton,sort.descending,format.labels) 
        bivariate<-TRUE        
        if (max(dim(table(cfg)))>2) bivariate<-FALSE 
        configs<-length(res$chisq) 
        counts<-ncol(cnts)
        factorn<-dim(table(cfg)) 
        if (bivariate==TRUE)
          {
            df<-(2^configs-1)*(counts-1) # (KL82)
            dfcell<-1                    # (KL77)
          }
        if (bivariate==FALSE)
          {
            df<-prod(factorn)+sum(-factorn)+(configs-1)*(counts-1) # richtig?
            dfcell<-prod(factorn-1)                                # richtig?
          }
          
       p.chisq<-1.0-pchisq(res$chisq,dfcell) 
       nz<-length(p.chisq)
       if (bonferroni==FALSE) nz<-1 # no adjustment
#      pzsig is Bonferroni-adjusted
       pchisqsig<-(p.chisq<(alpha/nz)) 
       totalchisq<-sum(res$chisq,na.rm=TRUE)
       p.total.chisq<-1.0-pchisq(totalchisq,dfcell) 
#      Table
       resdataframe<-data.frame(label=res$labels,
                                n=res$counts,
                                expected=res$expected,
                                chisq=res$chisq,
                                p.chisq=p.chisq,
                                p.chisq.sig=pchisqsig)
      # Summary stats                                
       resvector<-c(totalchisq, df,p.total.chisq,n)
       names(resvector)<-c("totalchisq","df","p","sum of counts") 
       res<-list(table=resdataframe,summary.stats=resvector)
       class(res)<-"mcfa"
      } 
     res
  }
 
hcfa<-function(configs,cnts)
   {   
      h.cfa.configs<-list()
      h.cfa.chisqs<-vector(mode="numeric") 
      h.cfa.dfs<-vector(mode="numeric")
      # delete subconfigurations and run CFA for the remaining combination
      perm.cfa<-function(configs,cnts)
       { 
         for (i in 1:ncol(configs))
          {
             configs.1<-configs[-i]
             res<-cfa(configs.1,cnts) 
             cfglbl<-paste(names(configs.1),sep=" ")
             h.cfa.configs<<-c(h.cfa.configs,list(cfglbl))
             h.cfa.chisqs<<-c(h.cfa.chisqs,res$summary.stats["totalchisq"])
             h.cfa.dfs<<-c(h.cfa.dfs,res$summary.stats["df"])
             if (ncol(configs.1)>=3)
               {
                 perm.cfa(configs.1,cnts)             
               }
          }
         list(h.cfa.configs,h.cfa.chisqs,h.cfa.dfs)
        } 
     configs<-as.data.frame(configs)
     cnts<-as.data.frame(cnts) 
     lcfg<-nrow(configs) 
     lcnt<-nrow(cnts)
     if (!is.data.frame(configs))
       stop("Configs must be a data frame")
     if (!is.data.frame(cnts)) 
       stop("Counts must be a data frame")   
     if (lcfg!=lcnt)
       stop("Configs and counts must have the same number of rows")
      if (ncol(configs)<3) 
          stop("Less than three config variables make no sense")

      res<-perm.cfa(configs,cnts) 
      lbls<-res[[1]]
      chisqs<-res[[2]]
      dfs<-res[[3]] 
      sortidx<-rev(order(chisqs))
      sorted.lbls<-lbls[sortidx]
      sorted.chisqs<-chisqs[sortidx]
      sorted.dfs<-dfs[sortidx]
      namevec<-""
      for (i in 1:length(sorted.lbls))
        {
          namevec<-c(namevec,paste(unlist(sorted.lbls[i]),collapse=" ")) 
        }
      names(sorted.chisqs)<-namevec[-1]
      orders<- unlist(lapply(strsplit(names(sorted.chisqs)," "),length)) 
      df<- unlist(lapply(strsplit(names(sorted.dfs)," "),length))
      res<-list(chisq=sorted.chisqs,df=sorted.dfs,orders=orders)
      class(res)<-"hcfa"
      res
   }
   
print.scfa<-function(x,...)
   { 
      cat("\n*** Analysis of configuration frequencies (CFA) ***\n\n")
      print(x$table)
      cat("\n\n")
      cat("Summary statistics:\n\n")
      cat("Total Chi squared         = ",x$summary.stats["totalchisq"],"\n") 
      cat("Total degrees of freedom  = ",x$summary.stats["df"],"\n") 
      cat("p                         = ",x$summary.stats["p"],"\n") 
      cat("Sum of counts             = ",x$summary.stats["sum of counts"],"\n")
      cat("\n")
      cat("Levels:\n\n")
      print(x$levels,...)
      cat("\n")
   }
   

print.mcfa<-function(x,...)
   { 
      cat("\n*** Repeated analysis of configuration frequencies (MCFA) ***\n\n")
      print(x$table,...)
      cat("\n\n")
      cat("Summary statistics:\n\n")
      cat("Total Chi squared         = ",x$summary.stats["totalchisq"],"\n") 
      cat("Total degrees of freedom  = ",x$summary.stats["df"],"\n") 
      cat("p                         = ",x$summary.stats["p"],"\n") 
      cat("Sum of counts             = ",x$summary.stats["sum of counts"],"\n")
      cat("\n")
   }

print.hcfa<-function(x,...)
   {
      cat("\n*** Hierarchical CFA ***\n\n")
      restbl<-matrix(NA,nrow=length(x$chisq),ncol=4)
      restbl[,1]<-x$chisq
      restbl[,2]<-x$df
      restbl[,3]<-1.0-pchisq(x$chisq,x$df) 
      restbl[,4]<-x$orders 
      colnames(restbl)<-c("Overall chi squared","df","p","order")
      rownames(restbl)<-names(x$chisq)
      print(restbl,...)
      restbl
   }

plot.scfa<-function(x,...)
   { 
     plot(unlist(x$table["n"]),unlist(x$table["chisq"]),
          main="Chi squared vs. n\n(left click to identify, right click to stop)",
          xlab="n",
          ylab="Chi squared",...
         )
      selected<-identify(unlist(x$table["n"]),unlist(x$table["chisq"]),
         n=nrow(x$table)) 
     if (length(selected)>0) print(x$table[c(selected),"label",drop=FALSE])
   }


plot.mcfa<-function(x,...)
   { 
     s<-colnames(x$table) 
     idx<-grep("n.counts",s)
     nsums<-apply(x$table[,idx],1,sum) 
     plot(nsums,unlist(x$table["chisq"]),
          main="Chi squared vs. sum of n\n(left click to identify, right click to stop)",
          xlab="sum of n",
          ylab="Chi squared",...
         )
      selected<-identify(nsums,unlist(x$table["chisq"]),
         n=nrow(x$table)) 
     if (length(selected)>0) print(x$table[c(selected),"label",drop=FALSE])
   }

plot.hcfa<-function(x,...)
   { 
     dotchart(unlist(x["chisq"]),groups=factor(unlist(x["orders"])),
              main="Chi squared by order",
              xlab="item",
              ylab="Chi squared",...
             )
   }

bcfa<-function(configs,cnts,runs=100,sig.item="sig.z",...)
   {
       if (is.null(dim(cnts))) 
             {
#               incvector<-numeric(len=length(cnts))
                incvector<-numeric(length(cnts)) 
             }
           else 
             { 
#               incvector<-numeric(len=length(cnts[,1]))
               incvector<-numeric(length(cnts[,1]))
             }    
      cnts<-unlist(cnts)       
      n<-sum(cnts) 
      for (i in  1:runs) 
        {
          for (j in 1:(length(cnts)))
           {
             lim<-cnts[j]/n
             rndnums<-runif(cnts[j])
             lres<-(rndnums<=lim)
             inc<-length(lres[lres==TRUE])
             incvector[j]<-incvector[j]+inc
           }
         if (sum(incvector)==0) stop("All counts are zero")
         # must be sorted on label to prevent re-ordering due to different n or chisq
         res<-cfa(configs,as.data.frame(incvector),sorton="label",...) 
         if (!sig.item %in% colnames(res$table)) stop("invalid sig.item specified!")
         boolres<-unlist(res$table[sig.item])
         if (i==1)
           {
             cnttype<-rep(0,length(res$table[,"n"]))
             cntleexpected<-rep(0,length(res$table[,"n"]))
             cntgtexpected<-rep(0,length(res$table[,"n"]))
           }
         idxleexpected<-res$table[,"n"]-res$table[,"expected"]<=0
         idxgtexpected<-res$table[,"n"]-res$table[,"expected"]>0
         cntleexpected[idxleexpected]<-cntleexpected[idxleexpected]+1
         cntgtexpected[idxgtexpected]<-cntgtexpected[idxgtexpected]+1
         cnttype[boolres]<-cnttype[boolres]+1 
      }
    qcnttype<-cnttype/runs
    qtypes<-cntgtexpected/runs
    restbl<-cbind(cntleexpected,
                  cntgtexpected,
                  qtypes*100.0,
                  cnttype,
                  qcnttype*100.0) 
    rownames(restbl)<-as.character(res$table[,"label"])
    colnames(restbl)<-c("cnt.antitype",
                        "cnt.type",
                        "pct.types",
                        "cnt.sig",
                        "pct.cnt.sig")
    class(restbl)<-"bcfa"
    restbl
   } 
   
print.bcfa<-function(x,...)
   {
     print(x[,1:5],...) 
   }

plot.bcfa<-function(x,...)
   { 
      plot(unlist(x[,"cnt.type"]),unlist(x[,"cnt.sig"]),
           main="Count(sig.) by count(type)\n(left click to identify, right click to stop)",
           xlab="Count(type)",
           ylab="Count(sig.)",...
          )
      selected<-identify(unlist(x[,"cnt.type"]),unlist(x[,"cnt.sig"]),
                n=length(unlist(x[,"cnt.sig"]))) 
      names(selected)<-row.names(x)[selected]         
     if (length(selected)>0) print(selected)          
   }

Try the cfa package in your browser

Any scripts or data that you put into this service are public.

cfa documentation built on May 2, 2019, 1:46 p.m.