R/mrCA.R

Defines functions mrCA

Documented in mrCA

#' Multiple-response Correspondence Analysis (MR-CA)
#'
#' @description This functions performs a multiple-response Correspondence Analysis (MR-CA) as defined in Mahieu, Schlich, Visalli, and Cardot (2021)
#'
#' @param data A data.frame of observations in rows whose first column is a factor (the categories) and subsequent columns are binary numeric or integer, each column being a response option
#' @param proj.row Optional. A contingency table with new categories to be projected as supplementary rows within the MR-CA space in rows and the same response options as data in columns
#' @param proj.row.obs A numeric vector whose length equals nrow(proj.row) and giving the number of observations within each projected rows. Useless if proj.row=NULL
#' @param proj.col Optional. A contingency table with new response options to be projected as supplementary columns within the MR-CA space in columns and the same categories as data in rows
#' @param ellipse Logical. Are confidence ellipses for the categories to be computed? Default is FALSE. See details
#' @param nboot Number of virtual datasets used in the total bootsrap procedure. Useless when ellipse=FALSE. See details
#' @param nbaxes.sig The number of significant axes retuned by \code{\link[MultiResponseR]{mr.dimensionality.test}}. By default, all axes are considered significant. Useless when ellipse=FALSE. See details
#'
#' @details
#' \itemize{
#'   \item \strong{ellipse}: When ellipse=TRUE, confidence ellipses for the categories are computed using a total bootstrap procedure (Cadoret & Husson, 2013). \strong{nboot} virtual datasets are generated by randomly sampling with replacement response vectors within each category. A MR-CA is then performed on these virtual dataset and the resulting virtual configurations are adjusted to the actual configuration using Procrustes rotations accounting for \strong{nbaxes.sig} axes (Mahieu, Schlich, Visalli, & Cardot, 2021). Finally, for each category, a confidence ellipse is constructed using the position of its bootstrap replicates. The ellipses are plotted when using \code{\link[MultiResponseR]{plot.mrCA}} Pairwise total bootstrap tests as proposed in Castura et al. (2023) are also performed between the categories.
#' }
#'
#'
#' @return A list with the following elements:
#' \describe{
#'   \item{eigen}{Eigenvalues and their corresponding percentages of inertia}
#'   \item{row.coord}{Rows coordinates}
#'   \item{col.coord}{Columns coordinates}
#'   \item{proj.row.coord}{Projected rows coordinates}
#'   \item{proj.col.coord}{Projected columns coordinates}
#'   \item{svd}{Results of the singular value decomposition}
#'   \item{bootstrap.replicate.coord}{Coordinates of the rotated bootstrap replicates}
#'   \item{total.bootstrap.test.pvalues}{P-values of the pairwise total bootstrap tests}
#' }
#'
#' @references Mahieu, B., Schlich, P., Visalli, M., & Cardot, H. (2021). A multiple-response chi-square framework for the analysis of Free-Comment and Check-All-That-Apply data. Food Quality and Preference, 93.
#' @references Loughin, T. M., & Scherer, P. N. (1998). Testing for Association in Contingency Tables with Multiple Column Responses. Biometrics, 54(2), 630-637.
#' @references Cadoret, M., & Husson, F. (2013). Construction and evaluation of confidence ellipses applied at sensory data. Food Quality and Preference, 28(1), 106-115.
#' @references Castura, J. C., Varela, P., & Næs, T. (2023). Evaluation of complementary numerical and visual approaches for investigating pairwise comparisons after principal component analysis. Food Quality and Preference, 107.
#'
#'
#' @export
#'
#' @import stats
#' @import utils
#'
#' @examples
#' nb.obs=200
#' nb.response=5
#' nb.category=5
#' vec.category=paste("C",1:nb.category,sep="")
#' right=matrix(rbinom(nb.response*nb.obs,1,0.25),nb.obs,nb.response)
#' category=sample(vec.category,nb.obs,replace = TRUE)
#' dset=cbind.data.frame(category,right)
#' dset$category=as.factor(dset$category)
#'
#' res=mrCA(dset)
#'
#' plot(res)
mrCA=function(data,proj.row=NULL,proj.row.obs=NULL,proj.col=NULL,ellipse=FALSE,nboot=2000,nbaxes.sig=Inf){
  classe=class(data)[1]
  if (!classe%in%c("data.frame")){
    stop("data must be a data.frame")
  }
  classe=class(data[,1])[1]
  if (!classe%in%c("factor")){
    stop("The first column of data must be factor")
  }
  if(!colnames(data)[1]%in%c("category","Category","produit","Produit","product","Product")){
    stop("First column name must be category, Category, produit, Produit, product or Product")
  }
  colnames(data)[1]="cat"
  data$cat=as.factor(as.character(data$cat))
  for (j in 2:ncol(data)){
    classe=class(data[,j])
    if (!classe%in%c("numeric","integer")){
      stop("Contingency data must be numeric or integer")
    }
  }
  check.bin=unique(unlist(data[,2:ncol(data)]))
  if (length(check.bin)>2){
    warning("contingency data are not composed of only ones and zeros")
  }else{
    check.un=sum(check.bin==c(0,1))
    check.deux=sum(check.bin==c(1,0))
    if (check.un!=2 & check.deux!=2){
      warning("contingency data are not composed of only ones and zeros")
    }
  }
  verif.col=colSums(data[,2:ncol(data)])
  if (any(verif.col==0)){
    stop("Some columns are only zeros")
  }
  data=data[order(data$cat),]
  rownames(data)=as.character(1:nrow(data))

  cont=aggregate(.~cat,data,sum)
  rownames(cont)=cont$cat
  cont$cat=NULL

  if (nbaxes.sig==1){
    nbaxes.proc=2
  }else if (nbaxes.sig==Inf){
    nbaxes.proc=min((nrow(cont)-1),ncol(cont))
    nbaxes.sig=min((nrow(cont)-1),ncol(cont))
  }else if (nbaxes.sig==0){
    stop("nbaxes.sig is equal to zero")
  }else{
    nbaxes.proc=nbaxes.sig
  }

  row.obs=table(data$cat)
  nom.cat=names(row.obs)
  row.obs=as.numeric(row.obs)
  names(row.obs)=nom.cat

  nplus=row.obs
  nplusplus=sum(nplus)
  redui=cont
  fij=(nplus%o%colSums(redui))/nplusplus
  std=((redui-fij)/sqrt(fij))/sqrt(nplusplus)

  n=nrow(redui)
  p=ncol(redui)
  nb.axe=min(n-1,p)

  udv=svd(std)

  u=udv$u[,1:nb.axe,drop=FALSE]
  vs=udv$d[1:nb.axe]
  eig=vs^2
  v=udv$v[,1:nb.axe,drop=FALSE]
  rownames(u)=rownames(redui)
  rownames(v)=colnames(redui)

  marge.r=nplus/nplusplus
  if (nb.axe==1){
    dvs=vs
  }else{
    dvs=diag(vs)
  }

  row.coord=(u/sqrt(marge.r))%*%dvs
  col.coord=v

  if(!is.null(proj.row)){
    classe=class(proj.row)[1]
    if (!classe%in%c("matrix","data.frame")){
      stop("proj.row must be a matrix or a data.frame")
    }
    if (ncol(proj.row)!=ncol(cont)){
      stop("proj.row must have the same number of response options that data")
    }
    classe=class(proj.row.obs)[1]
    if (!classe%in%c("numeric","integer")){
      stop("proj.row.obs must be integer or numeric")
    }
    if (nrow(proj.row)!=length(proj.row.obs)){
      stop("proj.row and proj.row.obs have not the same size")
    }
    unit.supp=proj.row/proj.row.obs
    fij.sup=(colSums(redui))/nplusplus
    mat.fij.sup=as.matrix(rep(1,nrow(proj.row)))%*%fij.sup
    vec.sup=((unit.supp-mat.fij.sup)/sqrt(mat.fij.sup))
    proj.row.coord=as.matrix(vec.sup)%*%v
    rownames(proj.row.coord)=rownames(proj.row)
  }
  if(!is.null(proj.col)){
    classe=class(proj.col)[1]
    if (!classe%in%c("matrix","data.frame")){
      stop("proj.col must be a matrix or a data.frame")
    }
    if (nrow(proj.col)!=nrow(cont)){
      stop("proj.col must have the same number of categories that data")
    }
    c.supp=colSums(proj.col)/nplusplus
    o.supp=proj.col/nplusplus
    r.supp=nplus/nplusplus
    e.supp=r.supp%o%c.supp
    std.supp=(o.supp-e.supp)/sqrt(e.supp)
    proj.col.coord=t(t(u)%*%as.matrix(std.supp)/vs)
    rownames(proj.col.coord)=colnames(proj.col)
  }

  percent.eig=eig/sum(eig)*100
  cum.percent.eig=cumsum(percent.eig)
  mat.eig=cbind(eig,percent.eig,cum.percent.eig)
  colnames(mat.eig)=c("eigenvalue","percentage of inertia","cumulative percentage of inertia")
  name.dim=paste("Dim.",1:nrow(mat.eig),sep="")
  rownames(mat.eig)=colnames(col.coord)=colnames(row.coord)=name.dim
  if(!is.null(proj.row)){
    colnames(proj.row.coord)=name.dim
  }else{
    proj.row.coord=NULL
  }
  if(!is.null(proj.col)){
    colnames(proj.col.coord)=name.dim
  }else{
    proj.col.coord=NULL
  }
  colnames(u)=name.dim
  names(vs)=name.dim
  colnames(v)=name.dim

  if (ellipse){

    row.coord=row.coord[,1:nbaxes.proc,drop=FALSE]
    col.coord=col.coord[,1:nbaxes.proc,drop=FALSE]
    if(!is.null(proj.row)){
      proj.row.coord=proj.row.coord[,1:nbaxes.proc,drop=FALSE]
    }
    if(!is.null(proj.col)){
      proj.col.coord=proj.col.coord[,1:nbaxes.proc,drop=FALSE]
    }

    row.c=matrix(NA,max(table(data$cat)),nlevels(data$cat))
    colnames(row.c)=levels(data$cat)
    for (c in unique(data$cat)){
      ou.c=which(data$cat==c)
      row.c[1:length(ou.c),c]=ou.c
    }
    mySample=function(vec){
      vec.retour=na.omit(vec)
      if (length(vec.retour)>1){
        vec.retour=sample(vec.retour,length(vec.retour),replace = TRUE)
      }else{
        vec.retour=vec.retour[1]
      }
      return(vec.retour)
    }
    myProcrustes=function(tofit,target,row.weights=rep(1,nrow(target)),scaling=FALSE){
      if (!is.matrix(tofit) & !is.data.frame(tofit)){
        stop("tofit must be a matrix or a data.frame")
      }
      if (!is.matrix(target) & !is.data.frame(target)){
        stop("tofit must be a matrix or a data.frame")
      }
      X=as.matrix(tofit)
      Y=as.matrix(target)
      if (ncol(X)!=ncol(Y) | nrow(X)!=nrow(Y)){
        stop("Dimension of tofit and target are different")
      }
      if (!is.logical(scaling)){
        stop("scaling must be logical")
      }
      if (is.numeric(row.weights) | is.integer(row.weights)){
        if (length(row.weights)!=nrow(target)){
          stop("length(row.weights) must equal nrow(target)")
        }
      }else{
        stop("class(row.weights) must be numeric or integer")
      }
      vecW=row.weights/sum(row.weights)
      mX=t(as.matrix(vecW))%*%X
      X=sweep(X,2,mX,"-")
      mY=t(as.matrix(vecW))%*%Y
      loc=mY
      Y=sweep(Y,2,mY,"-")
      matW=diag(row.weights/sum(row.weights)*nrow(target))
      croise=t(X)%*%matW%*%Y
      udv=svd(croise)
      if (scaling){
        dilat=sum(udv$d)/sum(diag(crossprod(X)))
      }else{
        dilat=1
      }
      H=udv$u%*%t(udv$v)
      aligned=(dilat*X%*%H)
      aligned=sweep(aligned,2,loc,"+")
      return(aligned)
    }

    sortie=data.frame(cat=as.factor(rep(levels(data$cat),nboot)),matrix(0,nlevels(data$cat)*nboot,nbaxes.proc))
    colnames(sortie)[2:ncol(sortie)]=colnames(row.coord)
    vec.boot=seq(1,nrow(sortie),by=nlevels(data$cat))
    pb=txtProgressBar(min=0,max = nboot,style=3)
    for (boot in 1:nboot){

      tirage=unlist(apply(row.c,2,mySample))
      jdd.tirage=data[tirage,]
      jdd.tirage=jdd.tirage[order(jdd.tirage$cat),]
      rownames(jdd.tirage)=as.character(1:nrow(jdd.tirage))

      nplus.tirage=table(jdd.tirage$cat)
      nom.cat=names(nplus.tirage)
      nplus.tirage=as.numeric(nplus.tirage)
      names(nplus.tirage)=nom.cat

      cont.tirage = aggregate(.~cat,jdd.tirage,sum)
      rownames(cont.tirage)=as.character(cont.tirage$cat)
      cont.tirage$cat=NULL
      redui.tirage=cont.tirage

      verif=colSums(cont.tirage)
      vire = which(verif==0)
      if(length(vire)!=0){
        cont.tirage=cont.tirage[,-vire]
        redui.tirage=redui.tirage[,-vire]
        vire.extend=NULL
        for (quel.vire in names(vire)){
          ou.virer=which(colnames(jdd.tirage)==quel.vire)
          vire.extend=c(vire.extend,ou.virer)
        }
        jdd.tirage=jdd.tirage[,-vire.extend]
      }

      nplusplus.tirage=sum(nplus.tirage)
      fij.tirage=(nplus.tirage%o%colSums(redui.tirage))/nplusplus.tirage
      std.tirage=((redui.tirage-fij.tirage)/sqrt(fij.tirage))/sqrt(nplusplus.tirage)

      n.tirage=nrow(redui.tirage)
      p.tirage=ncol(redui.tirage)
      nb.axe.tirage=min(n.tirage-1,p.tirage)

      udv.tirage=svd(std.tirage)

      u.tirage=udv.tirage$u[,1:nb.axe.tirage]
      vs.tirage=udv.tirage$d[1:nb.axe.tirage]
      rownames(u.tirage)=rownames(redui.tirage)

      marge.r.tirage=nplus.tirage/nplusplus.tirage
      row.coord.tirage=(u.tirage/sqrt(marge.r.tirage))%*%diag(vs.tirage)
      row.coord.tirage=row.coord.tirage[,1:nbaxes.proc]

      rot = myProcrustes(row.coord.tirage, row.coord,nplus)
      sortie[vec.boot[boot]:(vec.boot[boot]+nlevels(data$cat)-1),2:ncol(sortie)]=rot
      setTxtProgressBar(pb,boot)
    }

    sortie$cat=as.factor(sortie$cat)
    toellipse=sortie
    rownames(toellipse)=as.character(1:nrow(toellipse))

    coord.boot=toellipse[,1:(nbaxes.sig+1)]
    dim.sig=nbaxes.sig

    les.prod=unique(coord.boot$cat)

    diff.test=as.data.frame(matrix(1,length(les.prod),length(les.prod)))
    colnames(diff.test)=rownames(diff.test)=les.prod

    for(i in 1:(nrow(diff.test)-1)){
      for (j in (i+1):ncol(diff.test)){
        p.1=rownames(diff.test)[i]
        p.2=colnames(diff.test)[j]
        coord.p1=coord.boot[coord.boot$cat==p.1,,drop=FALSE]
        coord.p2=coord.boot[coord.boot$cat==p.2,,drop=FALSE]
        delta=as.matrix(coord.p1[,-1,drop=FALSE]-coord.p2[,-1,drop=FALSE])
        mu=colMeans(delta)
        sigma=cov(delta)
        calc.mal.sq=function(vec){
          mal.bary=t(as.matrix(vec-mu))%*%solve(sigma,tol=1e-300)%*%(as.matrix(vec-mu))
          return(as.numeric(mal.bary))
        }
        mal.sq.cloud=apply(as.matrix(delta),1,calc.mal.sq)
        mal.sq.origin=as.numeric(t(as.matrix(rep(0,length(mu))-mu))%*%solve(sigma)%*%(as.matrix(rep(0,length(mu))-mu)))
        pvalue.test=(sum(mal.sq.cloud>=mal.sq.origin)+1)/(nrow(delta)+1)
        diff.test[p.1,p.2]=pvalue.test
        diff.test[p.2,p.1]=pvalue.test
      }
    }
  }else{
    toellipse=NULL
    diff.test=NULL
  }
  back=list(eigen=mat.eig,row.coord=row.coord,col.coord=col.coord,proj.row.coord=proj.row.coord,proj.col.coord=proj.col.coord,svd=list(u=u,vs=vs,v=v),bootstrap.replicate.coord=toellipse,total.bootstrap.test.pvalues=diff.test)
  class(back)=c("mrCA","list")
  return(back)
}
MahieuB/MultiResponseR documentation built on June 22, 2024, 8:08 a.m.