Multigroup Data Analysis Through Densities

Documented in fpcad

fpcad <-
  function(xf, group.name="group", gaussiand=TRUE, windowh=NULL,
           normed=TRUE, centered=TRUE, data.centered=FALSE, data.scaled=FALSE, 
           common.variance=FALSE, nb.factors=3, nb.values=10, sub.title="",
           plot.eigen=TRUE, plot.score=FALSE, nscore=1:3, filename=NULL)
  {
    #require(e1071)
    
    #---------------
    # Preliminaries
    #---------------
    if (!(is.folder(xf) | is.data.frame(xf))){
      stop("fpcad applies to a data frame or an object of class 'folder'.")
    }
    
    if (is.folder(xf)) {
      # Convert the data folder into a data frame
      x <- as.data.frame(xf, group.name = group.name)
    }
    if (is.data.frame(xf)) {
      # Is group.name the name of a column of xf?
      if (!group.name %in% names(xf))
        stop(paste0("xf has no column named '", group.name, "'."))
      # x will contain the data.frame
      x <- xf[-which(colnames(xf) == group.name)]
      # Last column: the grouping variable
      x[group.name] <- xf[, group.name]
      # Convert the data.frame into a folder
      xf <- as.folder(xf, groups = group.name)
    }

    # p denotes the dimension of the data
    p<-ncol(x)-1;
    
    # Rename the last column of x as group
    last.column.name=colnames(x)[ncol(x)]
    colnames(x)[ncol(x)] <- "group"
    group<-as.factor(x$group);
    nb.groups<-length(levels(group));
    levgroup<-levels(group);
    
    # Control and error message
    # on data
    if (!all(apply(as.data.frame(x[,1:p], stringsAsFactors = TRUE), 2, is.numeric)))
      stop("The variables must be numeric!")
    if (any(is.na(x)))
      stop("There are NAs in the folder")
    # on the window or window parameter
    if (!is.null(windowh))
    {if (is.numeric(windowh))
    {if (length(windowh) > 1)
    {stop("windowh must be either a numeric value, or a list of matrix")
    }
      if (windowh < .Machine$double.eps)
      {stop("windowh must be strictly positive!")
      }
    } else 
    {if (is.list(windowh))
    {if (is.null(names(windowh)))
    {stop("The elements of the windowh list must be named")
    } else 
    {if (min(names(windowh)==levgroup)<1)
    {stop("The names of the windowh list must be the group names")
    }
    }
      if (p >1)
      {if (min(unlist(lapply(windowh, det))) < .Machine$double.eps)
      {stop("All elements of windowh must be positive matrices!")
      }
      } else               
      {if (min(unlist(windowh)) < .Machine$double.eps)
      {stop("All elements of windowh must be strictly positive!")
      }
      }
    }
    }
    }
    
    # For now, the only choice of kernel is the Gaussian kernel.
    if (!gaussiand)
    { kern = "gauss" }  else
      kern = ""
    
    # Control and error message
    if (min(table(group)) <= 1)
      stop("There should be more than one observation in each group")
    
    # Control of nb.factors and nb.values
    if (nb.groups < nb.values)
    {nb.values <- nb.groups ;
    }
    if (nb.groups < nb.factors)
    {nb.factors <- nb.groups
    }
    
    # Mean per group
    moyL<-by(as.data.frame(x[,1:p], stringsAsFactors = TRUE),INDICES=group,FUN=colMeans);
    moyL0<-moyL
    if(data.centered)
    {# Centering data
      for (i in 1:nb.groups)
      {moyL[[i]]<-numeric(p)
      x[x$group==levgroup[i],1:p]=scale(x[x$group==levgroup[i],1:p],scale=F)
      }
    }
    
    # Variance per group
    varL<-by(as.data.frame(x[,1:p], stringsAsFactors = TRUE),INDICES=group,FUN=var);
    varL0<-varL
    
    # Correlation matrix or correlation coefficient per group
    corL=varL
    if (p>1)
    {corL<-by(as.data.frame(x[,1:p], stringsAsFactors = TRUE),INDICES=group,FUN=cor);
    } else
    {for (i in 1:nb.groups)
    {corL[[i]]<-1
    }
    }
    
    if(data.scaled)
    {varL<-corL
    for (i in 1:nb.groups)
    {x[x$group==levgroup[i],1:p]=scale(x[x$group==levgroup[i],1:p])
    }
    }
    
    if(common.variance)
    {for (i in 1:nb.groups)
    {varL[[i]]<-var(x[,1:p])
    }
    }
    
    # Skewness et kurtosis coefficients per group
    #require(e1071)
    skewnessL <- by(as.data.frame(x[, 1:p], stringsAsFactors = TRUE), INDICES=group, FUN=apply, 2, skewness)
    kurtosisL <- by(as.data.frame(x[, 1:p], stringsAsFactors = TRUE), INDICES=group, FUN=apply, 2, kurtosis)
    
    #---------------
    # Calculus of the matrix W to diagonalize
    #---------------
    choix = character(4)
    # Choice of the dimension
    # "m" : multivariate ; "u" : univariate
    if (p > 1)              
    {choix[1] = "m"
    }  else
    {choix[1] = "u"
    }
    # Choice of the distribution type
    # "g" : gaussian distributions; "n" : non gaussian distributions 
    if (gaussiand)           
    {choix[2] = "g"                    
    }  else
    {choix[2] = "n"
    }
    # Choice of the kernel 
    # "g" : gaussian kernel; "." : not applicable
    # This option offers a limited choice as the only available kernel is the
    # gaussian kernel 
    if (kern == "gauss")
    {choix[3] = "g"
    }  else
    {choix[3] = "."
    }
    # Choice of the window 
    # The window is given by the user in the "windowh" parameter as 
    # "l" : list of (definite positive) matrices
    # "n" : positive number (common to all densities) with which the variance 
    #       matrices are multiplied 
    # "a" : NULL, that is the matrice variance of each density is multiplied by the 
    #       AMISE window (internally computed by the "bandwidth.parameter" function).
    # "." : not applicable
    if (gaussiand)        
    {choix[4] = "."
    }  else
    {if (is.null(windowh))
    {choix[4] = "a"
    }  else
    {if (p == 1)
    {if (length(windowh) == 1)
    {choix[4] = "n"
    } else
    {choix[4] = "l"
    }
    }  else
    {if (is.list(windowh))
    {choix[4] = "l"
    }  else
    {choix[4] = "n"
    }
    } 
    }
    }
    
    choix = paste(choix, collapse = "")
    
    # Calculus of the inner products between densities
    W<-matrix(0,ncol=nb.groups,nrow=nb.groups);
    switch(choix,
           # Case: multivariate, gaussian distribution and estimated parameters
           mg.. =
             {for (i in 1:nb.groups)
             {for (j in 1:i)
             {# Test if det(varL[[i]]+varL[[j]]) is different from zero
               if (abs(det(varL[[i]]+varL[[j]])) < .Machine$double.eps)
               {stop(paste(c("Singular covariance matrix (group ", unique(names(varL)[c(i, j)]), ")"), collapse = ""))
               }
               W[i,j]<-l2dpar(moyL[[i]],varL[[i]],moyL[[j]],varL[[j]])
             }
             }
             },
           # Case univariate, gaussian distributions with parametres internally estimed 
           ug.. =  
             {for (i in 1:nb.groups)
             {for (j in 1:i)
             {#Test if varL[[i]]+varL[[j]] is different from zero
               if (abs(varL[[i]]+varL[[j]]) < .Machine$double.eps)
               {stop(paste(c("Singular covariance matrix (group ", unique(names(varL)[c(i, j)]), ")"), collapse = ""))
               }
               W[i,j]<-l2dpar(moyL[[i]],varL[[i]],moyL[[j]],varL[[j]])
             }
             }
             },
           # Case: multivariate, non Gaussian distribution, density estimated using 
           # Gaussian kernel and AMISE window 
           mnga =
             {nbL<-by(x[,1:p],INDICES=group,FUN=nrow);
             wL<-bandwidth.parameter(p,nbL)
             # Multiplication of the variance by the window parameter
             varLwL<-varL
             for (i in 1:nb.groups)
             {varLwL[[i]]<-varL[[i]]*(wL[[i]]^2)}
             for (i in 1:nb.groups)
             {xi<-x[x$group==levgroup[i],1:p];
             for(j in 1:i)
             {#Test if the determinant of 
               #varL[[i]]*(wL[[i]]^2)+varL[[j]]*(wL[[j]]^2) is different from zero
               if (abs(det(varLwL[[i]]+varLwL[[j]])) < .Machine$double.eps)
                 stop("The matrices are not invertible")
               xj<-x[x$group==levgroup[j],1:p];
               W[i,j]<-l2d(xi,xj,method="kern",varw1=varLwL[[i]],varw2=varLwL[[j]]) 
             }
             }
             },
           # Case univariate, non gaussian distributions estimated by gaussian kernel
           # method, and AMISE windows 
           unga =
             {nbL<-by(as.data.frame(x[,1:p], stringsAsFactors = TRUE),INDICES=group,FUN=NROW);
             wL<-bandwidth.parameter(p,nbL)
             # Multiplication of the variance by the window parameter
             varLwL<-varL
             for (i in 1:nb.groups)
             {varLwL[[i]]<-varL[[i]]*(wL[[i]]^2)
             }
             for (i in 1:nb.groups)
             {xi<-x[x$group==levgroup[i],1:p];
             for(j in 1:i)
             {# Test if the variances are different from zero
               if (varLwL[[i]]+varLwL[[j]] < .Machine$double.eps)
               {stop("One variance or more is equal to zero")
               }
               xj<-x[x$group==levgroup[j],1:p];
               W[i,j]<-l2d(xi,xj,method="kern",varw1=varLwL[[i]],varw2=varLwL[[j]])
             }
             };
             },
           # Case: multivariate, non gaussian distributions estimed by gaussian kernel
           # method, and bandwith parameter, common to all densities, given by the user
           mngn =
             {nbL<-by(x[,1:p],INDICES=group,FUN=nrow);
             # Multiplication of the variance by the window parameter
             varLwL<-varL
             for (i in 1:nb.groups)
             {varLwL[[i]]<-varL[[i]]*(windowh^2)}
             for (i in 1:nb.groups)
             {xi<-x[x$group==levgroup[i],1:p];
             for(j in 1:i)
             {#Test if the determinant of 
               #varL[[i]]*(wL[[i]]^2)+varL[[j]]*(wL[[j]]^2) is different from zero
               if (abs(det(varLwL[[i]]+varLwL[[j]])) < .Machine$double.eps)
                 stop("The matrices are not invertible")
               xj<-x[x$group==levgroup[j],1:p];
               W[i,j]<-l2d(xi,xj,method="kern",varw1=varLwL[[i]],varw2=varLwL[[j]])
             }
             }
             },
           # Case univariate, non gaussian distributions estimed by gaussian kernel
           # method, and bandwith parameter, common to all densities, given by the user    
           ungn =
             {nbL<-by(as.data.frame(x[,1:p], stringsAsFactors = TRUE),INDICES=group,FUN=NROW);
             # Multiplication of the variance by the window
             varLwL<-varL
             for (i in 1:nb.groups)
             {varLwL[[i]]<-varL[[i]]*(windowh^2)}
             for (i in 1:nb.groups)
             {xi<-x[x$group==levgroup[i],1:p];
             for(j in 1:i)
             {# Test if the variances are different from zero
               if (varLwL[[i]]+varLwL[[j]] < .Machine$double.eps)
               {stop("One variance or more is equal to zero")
               }
               xj<-x[x$group==levgroup[j],1:p];
               W[i,j]<-l2d(xi,xj,method="kern",varw1=varLwL[[i]],varw2=varLwL[[j]]) }} 
             },
           # Case: multivariate, non gaussian distributions estimated by gaussian kernel
           # method, and windows given as a list of matrices
           mngl =
             {nbL<-by(x[,1:p],INDICES=group,FUN=nrow);
             for (i in 1:nb.groups)
             {gi = levgroup[i]
             xi<-x[x$group==gi,1:p];
             for(j in 1:i)
             {gj = levgroup[j]
             xj<-x[x$group==gj,1:p];
             W[i,j]<-l2d(xi,xj,method="kern",varw1=windowh[[gi]],varw2=windowh[[gj]]) }};
             },
           
           # Case univariate, non gaussian distributions estimated by gaussian kernel
           # method, and windows given as a list of numbers
           ungl =
             {nbL<-by(as.data.frame(x[,1:p], stringsAsFactors = TRUE),INDICES=group,FUN=NROW);
             for (i in 1:nb.groups)
             {gi = levgroup[i]
             xi<-x[x$group==gi,1:p];
             for(j in 1:i)
             {gj = levgroup[j]
             xj<-x[x$group==gj,1:p];
             W[i,j]<-l2d(xi,xj,method="kern",varw1=windowh[[gi]],varw2=windowh[[gj]])
             }
             };
             }
    )
    for (i in 1:(nb.groups-1))
    {for (j in (i+1):nb.groups)                             
    {W[i,j]<-W[j,i] }}
    # End of the computation of the matrix W
    
    norme<-vector("numeric",nb.groups);
    for (i in 1:nb.groups)
    {norme[i]<-sqrt(W[i,i])};
    if(normed)
    {# Calculus of the matrix W of the normed pca
      for (i in 1:nb.groups)
      {for (j in 1:i)
      {W[i,j]<-W[i,j]/(norme[i]*norme[j])}};
      for (i in 1:(nb.groups-1))
      {for (j in (i+1):nb.groups)
      {W[i,j]<-W[j,i]}};
    }
    if(centered)
    {# Calculus of the matrix W of the centred pca
      moyW<-mean(W);
      moycolW<-colMeans(as.data.frame(W, stringsAsFactors = TRUE));
      for (i in 1:nb.groups)
      {for (j in 1:i)
      {W[i,j]<-W[i,j]-moycolW[[i]]-moycolW[[j]]+moyW}};
      for (i in 1:(nb.groups-1))
      {for (j in (i+1):nb.groups)
      {W[i,j]<-W[j,i]}};
    }
    
    #---------------
    # Calculus of the eigenvalues / eigenvectors of the matrix W
    #	Calculus of the scores / contributions / qualities
    #---------------
    ep<-svd(W)
    
    # Eigenvalues to display
    epaff<-ep$d[1:nb.values];
    
    # Scores to display
    valpsqrt<-diag(sqrt(abs(ep$d[1:nb.factors])));
    coor <- ep$u[,1:nb.factors] %*% diag(sign(ep$u[1,1:nb.factors])) %*% valpsqrt ;
    
    # Corresponding qualities
    qual<-coor;
    for (i in 1:nb.groups)
    {qual[i,]<-round(1000*(coor[i,]^2)/W[i,i]) /10};
    
    # Corresponding contributions
    cont<-coor;
    for (j in 1:nb.factors)
    {cont<-round(1000*(ep$u[,1:nb.factors]^2)) /10};
    
    # Change of the names of the grouping variable in the returned results
    names(attributes(moyL)$dimnames) <- last.column.name  
    names(attributes(varL)$dimnames) <- last.column.name
    names(attributes(corL)$dimnames) <- last.column.name
    names(attributes(skewnessL)$dimnames) <- last.column.name
    names(attributes(kurtosisL)$dimnames) <- last.column.name
    
    # Creation of the list of results in an object of class 'fpcad' :
    results <- list( call=match.call(),
                     group=last.column.name,
                     variables=colnames(x)[1:p],
                     inertia=data.frame(eigenvalue=epaff,
                                        inertia=round(1000*epaff/sum(abs(ep$d)))/10, stringsAsFactors = TRUE),
                     contributions=data.frame(levgroup,PC=cont, stringsAsFactors = TRUE),
                     qualities=data.frame(levgroup,PC=qual, stringsAsFactors = TRUE),
                     scores=data.frame(levgroup,PC=coor, stringsAsFactors = TRUE),
                     norm = data.frame(levgroup,norm=norme, stringsAsFactors = TRUE),
                     means=moyL,
                     variances=varL,
                     correlations=corL,
                     skewness=skewnessL,
                     kurtosis=kurtosisL);
    # Change of the name of the grouping variable in the data frames in results
    colnames(results$contributions)[1]=last.column.name
    colnames(results$qualities)[1]=last.column.name
    colnames(results$scores)[1]=last.column.name
    colnames(results$norm)[1]=last.column.name
    class(results) <- "fpcad"
    
    # Save this list in a file (if a filename is given)
    if (! is.null(filename))
    {save(results,file = filename)
    }
    
    # Barplot of the inertia
    if (plot.eigen)
    {inertia=results$inertia$inertia
    barplot(inertia, main="Inertia",
            names.arg=1:length(inertia),
            cex.names=1)
    }
    
    # Plotting the scores on the principal planes (1,2),(1,3) et (2,3)
    # Plane (a,b)
    if (plot.score)
    {plot(results, nscore=nscore, sub.title=sub.title)
    }
    
    # Returning the list of results
    return(results)
  }