R/principal_components.R

Defines functions COMP_POLY_H COMP_MQ WH.plot_multiple_indivs WH.plot_multiple_Spanish.funs WH.MultiplePCA WH.1d.PCA

Documented in WH.1d.PCA WH.MultiplePCA WH.plot_multiple_indivs WH.plot_multiple_Spanish.funs

## 1v PCA for quantiles -------
# Principal components analysis  of histogram variable based on Wasserstein distance----
#' Principal components analysis  of histogram variable based on Wasserstein distance
#' @description The function implements a Principal components analysis of histogram variable 
#' based on Wasserstein distance. It performs a centered (not standardized) PCA on a set of quantiles of a variable.
#' Being a distribution a multivalued description, the analysis performs a dimensional reduction and a visualization of distributions. 
#' It is a 1d (one dimension) becuse it is considered just one histogram variable.
#' @param data A MatH object (a matrix of distributionH).
#' @param var An integer, the variable number.
#' @param quantiles An integer, it is the number of quantiles used in the analysis.
#' @param plots a logical value. Default=TRUE plots are drawn.
#' @param listaxes A vector of integers listing the axis for the 2d factorial reperesntations.
#' @param axisequal A logical value. Default TRUE, the plot have the same scale for the x and the y axes.
#' @param qcut  a number between 0.5 and 1, it is used for the plot of densities, and avoids very peaked densities. 
#' Default=1, all the densities are considered.
#' @param outl a number between 0 (default)  and 0.5. For each distribution, is the amount of mass removed from the tails 
#' of the distribution. For example, if 0.1, from each distribution is cut away a left tail and a right one each containing 
#' the 0.1 of mass.   

#' @return a list with the results of the PCA in the MFA format of package \pkg{FactoMineR} for function MFA
#' @references Verde, R.; Irpino, A.; Balzanella, A., "Dimension Reduction Techniques for Distributional Symbolic Data," Cybernetics, IEEE Transactions on , vol.PP, no.99, pp.1,1
#' doi: 10.1109/TCYB.2015.2389653
#' keywords: {Correlation;Covariance matrices;Distribution functions;Histograms;Measurement;Principal component analysis;Shape;Distributional data;Wasserstein distance;principal components analysis;quantiles},
#' \url{http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=7024099&isnumber=6352949}
#' @details In the framework of symbolic data analysis (SDA), distribution-valued data 
#' are defined as multivalued data, where each unit is described by a distribution 
#' (e.g., a histogram, a density, or a quantile function) of a quantitative variable.
#'  SDA provides different methods for analyzing multivalued data. Among them, the most relevant 
#'  techniques proposed for a dimensional reduction of multivalued quantitative variables is principal 
#'  component analysis (PCA). This paper gives a contribution in this context of analysis. 
#'  Starting from new association measures for distributional variables based on a peculiar metric 
#'  for distributions, the squared Wasserstein distance, a PCA approach is proposed for distribution-valued data,
#'   represented by quantile-variables. 
#' @examples
#' results=WH.1d.PCA(data = BLOOD,var = 1, listaxes=c(1:2))
#' @importFrom FactoMineR PCA
#' @importFrom graphics plot abline axis hist plot.new plot.window polygon segments text title
#' @importFrom grDevices dev.new rgb
#' @importFrom stats density quantile
#' @export
WH.1d.PCA=function(data,var, quantiles=10, plots=TRUE, listaxes=c(1:4),axisequal=FALSE,qcut=1,outl=0){
  if (is(data)[1]!="MatH"){stop("Input data must be a MatH object (A matrix of histograms)")}
  #Build the matrix of Quantiles
  VARS=ncol(data@M)
  INDIV=nrow(data@M)
  if ((qcut<0.5)||(qcut>1)){qcut=1}
  if ((outl<0)||(outl>0.5)){outl=0}
  if ((outl==0.5)){outl=0.49}
  if (missing(var)){
    var=1
    varname=colnames(data@M)[1]
    cat(paste("Var is missing, We do a PCA on variable ", varname, "\n"))
  } else{
    if (length(var)>1){
      varname=colnames(data@M)[var[1]]
      cat(paste("Var has several values, We do a PCA on variable -->", var[1], "\n"))
      var=var[1]
    }
    else{
      if (var>VARS){stop(paste("The variables are less than ",var))}
      else{
        varname=colnames(data@M)[var]
        cat(paste("We do a PCA on variable ---> ", varname, "\n"))
      }
    }
  }
  data=data[,var]
  #check indiv and remove empty rows
  toremove=numeric(0)
  for (i in 1:INDIV){
    if (length(data@M[i,1][[1]]@x)<2){
      toremove=c(toremove,i)
    }
  }
  if (length(toremove)>0){
    data@M=as.matrix(data@M[-toremove,1])
  }
  colnames(data@M)=varname
  INDIV=nrow(data@M)
  if (INDIV<2){stop("At least two individuals are necessary, please check data")}
  
  #Create the quantile matrix
  MATQ=matrix(0,nrow = INDIV,ncol = (quantiles+1))
  if (outl==0){
    p=c(0, 1:quantiles)/quantiles
    namec=list("Min")
    for (j in 2:(quantiles)){
      namec=c(namec,paste0("Q(",format(p[j], digits=2, nsmall=2),")"))
    }
    namec=c(namec, "Max")
  }
  else{
    
    p=c(0, 1:quantiles)/quantiles*(1-2*outl)+outl 
    namec=list(paste0("Q(",format(p[1], digits=2, nsmall=2),")"))
    for (j in 2:length(p)){
      namec=c(namec,paste0("Q(",format(p[j], digits=2, nsmall=2),")"))
    }
  }
  
  for (i in 1:INDIV)
  {
    for (j in 1:(quantiles+1))
    {MATQ[i,j]=compQ(data@M[i,1][[1]],p[j])}
    
  }
  rownames(MATQ)=rownames(data@M)
  colnames(MATQ)=namec
  
  # do a PCA
  vmeans=numeric(0)
  vstds=numeric(0)
  
  for (ind in 1:INDIV){
    vmeans=c(vmeans,data@M[ind,1][[1]]@m)
    vstds=c(vstds,data@M[ind,1][[1]]@s)
  }
  minM=min(vmeans)
  maxM=max(vmeans)
  minS=min(vstds)
  maxS=max(vstds)
  
  MATF=cbind(MATQ/sqrt((quantiles+1)),vmeans,vstds)
  res.pca = PCA(MATF,quanti.sup = c((ncol(MATF)-1),ncol(MATF)), scale.unit=FALSE,  graph=F)
  
  VARW=0#WH.var.covar(data)
  TOTINE=sum(res.pca$eig[,1])
  ## plotting PCA results ----------------
  if (plots){
    #par(mfrow=c(1,1))
    # lt's define the couple of axes
    dev.new(noRStudioGD = FALSE )
    planes=matrix(0,1,2)
    for (cc in 1:floor(length(listaxes)/2)){
      planes=rbind(planes,c(listaxes[(cc*2-1)],listaxes[(cc*2)]))
    }
    planes=as.matrix(planes[-1,])
    dim(planes)=c(floor(length(listaxes)/2),2)
    if ((length(listaxes)%%2)==1){
      planes=rbind(planes, c(listaxes[(length(listaxes)-1)],
                             listaxes[length(listaxes)]))
    }
    
    #plot Spanish-fan plot
    for (pl in 1:nrow(planes)){
      axe1=planes[pl,1];axe2=planes[pl,2]
      
      labX=paste("Axis ",axe1," (", format(res.pca$eig[axe1,2], digits=2, nsmall=2), "%)")
      labY=paste("Axis ",axe2," (", format(res.pca$eig[axe2,2], digits=2, nsmall=2), "%)")
      CVAR=res.pca$var$coord[,c(axe1,axe2)]
      plot.new()
      
      if (axisequal){
        xrange=c(min(0,min(cbind(CVAR[,1],CVAR[,2]))), max(0,max(cbind(CVAR[,1],CVAR[,2]))))
        yrange=xrange
        plot.window(xrange, yrange)
        title(xlab=labX)
        title(ylab=labY)
        
        #         plot(type="n",
        #              xlab=labX,ylab=labY)
        segments(xrange[1],0,xrange[2],0)
        segments(yrange[1],0,yrange[2],0)
      }
      else{
        plot.window(c(min(0,CVAR[,1]),max(0,CVAR[,1])),
                    c(min(0,CVAR[,2]),max(0,CVAR[,2])))
        title(xlab=labX)
        title(ylab=labY)
        #       plot(c(min(0,CVAR[,1]),max(0,CVAR[,1])), c(min(0,CVAR[,2]),max(0,CVAR[,2])),type="n",
        #            xlab=labX,ylab=labY)Y)
        segments(min(0,CVAR[,1]),0,max(0,CVAR[,1]),0)
        segments(0,min(0,CVAR[,2]),0,max(0,CVAR[,2]))}
      title("PCA Variable plot (Spanish-fan plot)")
      if ((nrow(CVAR)%%2)==0){centr=nrow(CVAR)/2} else{centr=(nrow(CVAR)-1)/2}
      centr=nrow(CVAR)/2
      cxl=0.8
      for (tr in 2:nrow(CVAR)){
        centrality=(abs(tr-centr-1)/(centr-1))
        
        #cat(centrality,"\n")
        x=c(0, CVAR[(tr-1),1], CVAR[tr,1], 0)
        y=c(0, CVAR[(tr-1),2], CVAR[tr,2], 0)
        red=1
        green=centrality
        #cat(red,green,"\n")
        polygon(x, y, col=rgb(red, green, 0,0.7), lty = 1, lwd = 1,  border = "black")
      }
      text(x = CVAR[1,1],y= CVAR[1,2],labels = colnames(MATQ)[1],cex=cxl)
      if (nrow(CVAR)<8){
        for (tr in 2:(nrow(CVAR)-1)){
          text(x = CVAR[tr,1],y= CVAR[tr,2],labels = colnames(MATQ)[tr],cex=cxl)
        }}
      else{
        
        text(x = CVAR[ceiling(nrow(CVAR)/8),1],y= CVAR[ceiling(nrow(CVAR)/8),2],
             labels = colnames(MATQ)[ceiling(nrow(CVAR)/8)],cex=cxl)
        text(x = CVAR[ceiling(nrow(CVAR)*2/8),1],y= CVAR[ceiling(nrow(CVAR)*2/8),2],
             labels = colnames(MATQ)[ceiling(nrow(CVAR)*2/8)],cex=cxl)
        text(x = CVAR[ceiling(nrow(CVAR)*3/8),1],y= CVAR[ceiling(nrow(CVAR)*3/8),2],
             labels = colnames(MATQ)[ceiling(nrow(CVAR)*3/8)],cex=cxl)
        text(x = CVAR[centr+1,1],y= CVAR[centr+1,2],labels = colnames(MATQ)[centr+1],cex=cxl)
        text(x = CVAR[ceiling(nrow(CVAR)*5/8),1],y= CVAR[ceiling(nrow(CVAR)*5/8),2],
             labels = colnames(MATQ)[ceiling(nrow(CVAR)*5/8)],cex=cxl)
        text(x = CVAR[ceiling(nrow(CVAR)*6/8),1],y= CVAR[ceiling(nrow(CVAR)*6/8),2],
             labels = colnames(MATQ)[ceiling(nrow(CVAR)*6/8)],cex=cxl)
        text(x = CVAR[ceiling(nrow(CVAR)*7/8),1],y= CVAR[ceiling(nrow(CVAR)*7/8),2],
             labels = colnames(MATQ)[ceiling(nrow(CVAR)*7/8)],cex=cxl)
        
      }
      text(x = CVAR[nrow(CVAR),1],y= CVAR[nrow(CVAR),2],
           labels = colnames(MATQ)[ncol(MATQ)],cex=cxl)
      axis(1, pos = 0, cex.axis=0.7)#, at=xM,labels=labX)
      axis(2, pos = 0, cex.axis=0.7)#, at=yM,labels=labY)
      dev.new(noRStudioGD = FALSE )
      
      #plot individuals
      
      #layout(matrix(c(1,2,3,4), 2, 2, byrow = TRUE))
      
      xm=min(res.pca$ind$coord[,axe1])
      xM=max(res.pca$ind$coord[,axe1])
      ym=min(res.pca$ind$coord[,axe2])
      yM=max(res.pca$ind$coord[,axe2])
      Xlen=xM-xm
      Ylen=yM-ym
      
      #Compute scaling factor for domain
      matX=matrix(0,128,INDIV)
      matD=matrix(0,128,INDIV)
      for (ind in 1:INDIV){
        distr=data@M[ind,1][[1]]
        #generate 200 random points according to the QF
        rn=200
        xn=c(rep(0,rn))
        random_no=c(0:rn)/rn
        
        for (i in 1:rn){
          xn[i]=compQ(distr,random_no[i])
        }
        d <- density(xn,n=128)
        matX[,ind]=d$x
        matD[,ind]=d$y
      }
      MinX=min(matX)
      MaxX=max(matX)
      RX=MaxX-MinX
      q95D=quantile(matD,probs = qcut)
      matD[matD>q95D]=as.numeric(q95D)
      MaxD=max(matD)
      
      meanXRange=diff(range(apply(matX,2,FUN = range)))
      xfact=0.3
      yfact=0.1
      if (axisequal){xm1=min(xm,ym)
      xM1=max(xM,yM)
      ym1=xm1
      yM1=xM1
      Xlen=xM1-xm1
      Ylen=yM1-ym1
      xm=(xM1+xm1)/2-Xlen/2*(1+1.5*xfact)
      xM=(xM1+xm1)/2+Xlen/2*(1+1.5*xfact)
      ym=(yM1+ym1)/2-Ylen/2*(1+0.5*yfact) 
      yM=(yM1+ym1)/2+Ylen/2*(1+1.5*yfact)
      
      }else{
        xm1=xm
        xM1=xM
        ym1=ym
        yM1=yM
        xm=(xM1+xm1)/2-Xlen/2*(1+1.5*xfact)
        xM=(xM1+xm1)/2+Xlen/2*(1+1.5*xfact)
        ym=(yM1+ym1)/2-Ylen/2*(1+0.5*yfact) 
        yM=(yM1+ym1)/2+Ylen/2*(1+1.5*yfact)     
      }
      dev.new(noRStudioGD = FALSE )
      plot.new()
      plot.window(c(xm,xM),c(ym,yM))
      title(main="PCA plot of distributions", sub="Smoothed distributions")
      for (ind in 1:INDIV){
        x=matX[,ind]
        y=matD[,ind]
        x=(x-data@M[ind,1][[1]]@m)/meanXRange*xfact*Xlen+res.pca$ind$coord[ind,axe1]
        x=c(x,x[length(x)],x[1])
        y=c(y,0,0)/q95D*yfact*Ylen+res.pca$ind$coord[ind,axe2]
        polygon(x,y, col=rgb(1, 1,0,0.6))
        
      }
      for (ind in 1:INDIV){
        text(res.pca$ind$coord[ind,axe1],res.pca$ind$coord[ind,axe2],
             label=rownames(MATQ)[ind], pos=1, cex=0.7)
      }
      axis(1, pos = 0, cex.axis=0.7)#, at=xM,labels=labX)
      axis(2, pos = 0, cex.axis=0.7)#, at=yM,labels=labY)
      segments(xm,0,xM,0)
      segments(0,ym,0,yM)
      
      title(xlab=labX)
      title(ylab=labY)
      dev.new(noRStudioGD = FALSE )
      # plot.new()
      plot.new()
      plot.window(c(xm,xM),c(ym,yM))
      title(main="Coloured using the mean values")
      trasp=0.6
      
      for (ind in 1:INDIV){
        x=matX[,ind]
        y=matD[,ind]
        x=(x-data@M[ind,1][[1]]@m)/meanXRange*xfact*Xlen+res.pca$ind$coord[ind,axe1]
        x=c(x,x[length(x)],x[1])
        y=c(y,0,0)/q95D*yfact*Ylen+res.pca$ind$coord[ind,axe2]
        red=(data@M[ind,1][[1]]@m-minM)/(maxM-minM)
        if (red>0.5){
          green=1-red
          blue=0
        }
        else{green=1-red
        red=0
        blue=1-green
        }
        polygon(x,y, col=rgb(red,green,blue,trasp))
        
      }
      for (ind in 1:INDIV){
        text(res.pca$ind$coord[ind,axe1],res.pca$ind$coord[ind,axe2],
             label=rownames(MATQ)[ind], pos=1, cex=0.7)
      }
      axis(1, pos = 0, cex.axis=0.7)#, at=xM,labels=labX)
      axis(2, pos = 0, cex.axis=0.7)#, at=yM,labels=labY)
      segments(xm,0,xM,0)
      segments(0,ym,0,yM)
      
      abline(0,res.pca$quanti.sup$coord[1,2]/res.pca$quanti.sup$coord[1,1],lty=3,lwd=2, col="red")
      title(xlab=labX)
      title(ylab=labY)
      dev.new(noRStudioGD = FALSE )
      plot.new()
      plot.window(c(xm,xM),c(ym,yM))
      title(main="Coloured using std values")
      for (ind in 1:INDIV){
        
        x=matX[,ind]
        y=matD[,ind]
        x=(x-data@M[ind,1][[1]]@s)/meanXRange*xfact*Xlen+res.pca$ind$coord[ind,axe1]
        x=c(x,x[length(x)],x[1])
        y=c(y,0,0)/q95D*yfact*Ylen+res.pca$ind$coord[ind,axe2]
        red=(data@M[ind,1][[1]]@s-minS)/(maxS-minS)
        if (red>0.5){
          green=1-red
          blue=0
        }
        else{green=1-red
        red=0
        blue=1-green
        }
        polygon(x,y, col=rgb(red,green,blue,trasp))
        
      }
      for (ind in 1:INDIV){
        text(res.pca$ind$coord[ind,axe1],res.pca$ind$coord[ind,axe2],
             label=rownames(MATQ)[ind], pos=1, cex=0.7)
      }
      axis(1, pos = 0, cex.axis=0.7)#, at=xM,labels=labX)
      axis(2, pos = 0, cex.axis=0.7)#, at=yM,labels=labY)
      segments(xm,0,xM,0)
      segments(0,ym,0,yM)
      
      abline(0,res.pca$quanti.sup$coord[2,2]/res.pca$quanti.sup$coord[2,1],lty=3,lwd=2, col="red")
      
      title(xlab=labX)
      title(ylab=labY)
    }
  }
  return(list(PCAout=res.pca, WASSVARIANCE=VARW, INERTIA=TOTINE))
}
# Principal components analysis  of a set of histogram variables based on Wasserstein distance----
#' Principal components analysis  of a set of histogram variable based on Wasserstein distance
#' @description (Beta version) The function implements a Principal components analysis of a set of histogram variables 
#' based on Wasserstein distance. It performs a centered (not standardized) PCA on a set of quantiles of a variable.
#' Being a distribution a multivalued description, the analysis performs a dimensional reduction and a visualization of distributions. 
#' It is a 1d (one dimension) becuse it is considered just one histogram variable.
#' @param data A MatH object (a matrix of distributionH).
#' @param list.of.vars A list of  integers, the active variables.
#' @param quantiles An integer, it is the number of quantiles used in the analysis.
#' Default=10.
#' @param outl a number between 0 (default)  and 0.5. For each distribution, is the amount of mass removed from the tails 
#' of the distribution. For example, if 0.1, from each distribution is cut away a left tail and a right one each containing 
#' the 0.1 of mass.
#' @return a list with the results of the PCA in the MFA format of package \pkg{FactoMineR} for function MFA
#' @details It is an extension of WH.1d.PCA to the multiple case. 
#' @importFrom FactoMineR MFA
#' @importFrom stats rnorm
#' @export
## MFA PCA for quantiles -------
WH.MultiplePCA=function(data, list.of.vars, quantiles=10, outl=0){
  data=data[,list.of.vars]
  VARS=ncol(data@M)
  INDIV=nrow(data@M)
  MATQ=matrix(0,nrow = INDIV,ncol = VARS*(quantiles+1))
  if ((outl<0)||(outl>0.5)){outl=0}
  if ((outl==0.5)){outl=0.49}
 # browser()
  #create the names of the variables
  #################################################
  COLnames=list()
  if (outl==0){
    p=c(0, 1:quantiles)/quantiles
    for (v in 1:VARS){
      namec=list("Min")
      for (j in 2:(quantiles)){
        namec=c(namec,paste0("Q(",format(p[j], digits=2, nsmall=2),")"))
      }
      namec=c(namec, "Max")
      COLnames=c(COLnames, paste(substr(colnames(data@M)[v],1,4),namec,sep="."))
      for (i in 1:INDIV)
      {
       # tmp=compQ_vect([email protected][i,v][[1]],p)
        for (j in 1:(quantiles+1)){
          MATQ[i,((v-1)*(quantiles+1)+j)]=compQ(data@M[i,v][[1]],p[j])
        }
      }
    }
    
  }
  else{
    
    p=c(0, 1:quantiles)/quantiles*(1-2*outl)+outl 
    namec=list(paste0("Q(",format(p[1], digits=2, nsmall=2),")"))
    for (j in 2:length(p)){
      namec=c(namec,paste0("Q(",format(p[j], digits=2, nsmall=2),")"))
    }
    for (v in 1:VARS){
      namec=list(paste0("Q(",format(p[1], digits=2, nsmall=2),")"))
      for (j in 2:length(p)){
        namec=c(namec,paste0("Q(",format(p[j], digits=2, nsmall=2),")"))
      }
      COLnames=c(COLnames, paste(substr(colnames(data@M)[v],1,4),namec,sep="."))
      for (i in 1:INDIV)
      {
        for (j in 1:(quantiles+1)){
          MATQ[i,((v-1)*(quantiles+1)+j)]=compQ(data@M[i,v][[1]],p[j])
        }
      }
    }
  }
  #################################################
  # COLnames=list();
  # for (v in 1:VARS){
  #   namec=list("Min")
  #   for (j in 2:(quantiles)){
  #     namec=c(namec,paste0("Q(",format(p[j], digits=2, nsmall=2),")"))
  #   }
  #   namec=c(namec, "Max")
  #   COLnames=c(COLnames, paste(substr(colnames([email protected])[v],1,4),namec,sep="."))
  #   for (i in 1:INDIV)
  #   {
  #     for (j in 1:(quantiles+1)){
  #       MATQ[i,((v-1)*(quantiles+1)+j)]=compQ([email protected][i,v][[1]],p[j])
  #     }
  #   }
  # }
  
  rownames(MATQ)=rownames(data@M)
  colnames(MATQ)=COLnames
  
  # do a MFA
  MATF=cbind(MATQ/sqrt((quantiles+1)))
  #check all columns
  for(cc in 1:ncol(MATF)){
    if (sd(MATF[,cc])<1e-100){
      MATF[,cc]=MATF[,cc]+rnorm(nrow(MATF),0,1e-20)
    }
  }
  MFA.res= MFA(MATF,group=c(rep(quantiles+1,length(list.of.vars))),type=c(rep("c",length(list.of.vars))),
               ncp=3,name.group=colnames(data@M),graph=TRUE)
  # do Plots!!!!
  #plot Spanish-fan plot
  #plot individuals
  return(MFA.res)
}

# Plotting Spanish fun plots for Multiple factor analysis of Histogram Variables ----
#' Plotting Spanish fun plots for Multiple factor analysis of Histogram Variables
#' @description The function plots the circle of correlation of the quantiles
#' of the histogrma variables after a Multiple factor analysis.
#' @param res Results from WH.MultiplePCA, or WH.1D.PCA.
#' @param axes A list of  integers, the new factorial axes c(1,2) are the default.
#' @param var A list of integers are the variables to plot.
#' @param LABS Logical, if TRUE graph is labeled, otherwise it does not.
#' @param multi Logical, if TRUE (default) results come from a WH.MultiplePCA,
#'  if FALSE results come from WH.1D.PCA.
#' @param corplot Logical, if TRUE (default) the plot reports correlations, if FALSE
#' the coordinates of quantiles on the factorial plane
#' @return a plot of class ggplot
#' @examples
#' #Do a MultiplePCA on the BLOOD dataset
#' \dontrun{
#' res=WH.MultiplePCA(BLOOD,list.of.vars = c(1:3)) 
#' }
#' #Plot results
#' \dontrun{
#' WH.plot_multiple_Spanish.funs(res,axes=c(1,2),var=c(1:3))
#' }
#' @export
WH.plot_multiple_Spanish.funs=function(res,
                                       axes=c(1,2),var=1,LABS=TRUE,
                                       multi=TRUE,corplot=TRUE){
  #require(ggplot2)
  if (multi){
    labX=paste("Axis ",axes[1]," (", format(res$eig[axes[1],2], 
                                            digits=2, nsmall=2), "%)")
    labY=paste("Axis ",axes[2]," (", format(res$eig[axes[2],2], 
                                            digits=2, nsmall=2), "%)")
  }
  else
  {
    labX=paste("Axis ",axes[1]," (", format(res$PCAout$eig[axes[1],2], 
                                            digits=2, nsmall=2), "%)")
    labY=paste("Axis ",axes[2]," (", format(res$PCAout$eig[axes[2],2], 
                                            digits=2, nsmall=2), "%)")
  }
  v1=numeric()
  v2=numeric()
  tt=character()
  tt2=numeric()
  VA=numeric()
  x1=numeric()
  x2=numeric()
  hj=numeric()
  vj=numeric()
  if (!multi){var=1}
  for (j in var){
    
    #plot Spanish-fan plot
    if (multi){
      els=which(res$summary.quanti$group==j)
    } 
    else{
      els=c(1:nrow(res$PCAout$var$coord))
    }
    tmp_v1=numeric()
    tmp_v2=numeric()
    tmp_tt=character()
    tmp_tt2=numeric()
    tmp_VA=numeric()
    if (multi){
      if (corplot){
        CVAR=res$global.pca$var$cor[els,c(axes[1],axes[2])]
      }
      else{
        CVAR=res$global.pca$var$coord[els,c(axes[1],axes[2])]
      }
    }
    else{
      
      if (corplot){
        CVAR=res$PCAout$var$cor[els,c(axes[1],axes[2])]}
      else{
        CVAR=res$PCAout$var$coord[els,c(axes[1],axes[2])]
      }
    }
    
    for (pp in 1:(nrow(CVAR)-1)){
      tmp_v1=c(tmp_v1,CVAR[pp,1],0,CVAR[pp+1,1],CVAR[pp,1])
      tmp_v2=c(tmp_v2,CVAR[pp,2],0,CVAR[pp+1,2],CVAR[pp,2])
      tmp_tt=c(tmp_tt,paste(as.character(j),rep(as.character(pp),4),sep="_"))
      tmp_tt2=c(tmp_tt2,rep(pp,4))
      tmp_VA=c(tmp_VA,rep(j,4))
    }
    tmp_tt2=abs(tmp_tt2-mean(tmp_tt2))
    tmp_tt2=1-tmp_tt2/max(tmp_tt2)
    v1=c(v1,tmp_v1,0,0,0,0)
    v2=c(v2,tmp_v2,0,0,0,0)
    tt=c(tt,tmp_tt,paste(as.character(j),rep(as.character(pp),4),sep="_"))
    tt2=c(tt2,tmp_tt2,1,1,1,1)
    tmp_VA=c(tmp_VA,rep(j,4))
    VA=c(VA,tmp_VA)
    x1=c(x1,CVAR[,1])
    x2=c(x2,CVAR[,2])
  }
  
  d=data.frame(v1=v1,v2=v2,tt=tt,tt2=tt2,VA=VA)
  #CVAR=res$global.pca$var$cor[,c(axes[1],axes[2])]
  txt_lab=data.frame(v1=x1,v2=x2,hj=(-sign(x1)+1)/2,vj=(-sign(x2)+1)/2)
  ### the circle of correlation
  angle <- seq(-pi, pi, length = 250)
  x_c = sin(angle)
  y_c = cos(angle)
  df <- data.frame(x_c = x_c, y_c = y_c)
  ##
  l1=data.frame(x=c(-1.2,1.2),y=c(0,0))
  l2=data.frame(x=c(0,0),y=c(-1.2,1.2))
  
  
  p <- ggplot(d, aes(x=v1, y=v2)) + 
    geom_polygon(aes(group=tt, fill=as.factor(VA),alpha=tt2),colour='grey30')
  if (LABS==TRUE){
    p=p+geom_text(data=txt_lab,aes(x=v1,y=v2,
                                   label=rownames(txt_lab),
                                   hjust=hj,vjust=vj))
  }
  if (corplot){
    p=p+xlim(-1.2,1.2)+ylim(-1.2,1.2)+
      ggtitle("Correlation plot of variables (Spanish-fan plot)")
  }
  else{
    XL=range(d$v1)
    XL=(XL-mean(XL))*1.2+mean(XL)
    YL=range(d$v2)
    YL=(YL-mean(YL))*1.2+mean(YL)
    p=p+xlim(XL)+ylim(YL)+
      ggtitle("Plot of variables (Spanish-fan plot)")
  }
  
  p=p+theme_bw()+
    xlab(labX)+ylab(labY)+
    theme(legend.position="none")
  if (corplot){
    p=p+coord_fixed(ratio = 1)}
  p=p+theme(plot.title = element_text(hjust = 0.5))
  if (corplot){
    p=p+geom_path( data = df, aes(x=x_c, y=y_c), inherit.aes = F,linetype = 2)
  }
  p=p+geom_vline(xintercept = 0)+geom_hline(yintercept = 0)
  
  
  print(p)    
  
  return(p)
}

# Plot histograms of individuals after a  Multiple factor analysis of Histogram Variables ----
#'  Plot histograms of individuals after a  Multiple factor analysis of Histogram Variables
#' @description (Beta version) The function plots histogram data of the individuals for a particular
#' variable on a factorial palne after a Multiple factor analysis.
#' @param data a MatH object
#' @param res Results from WH.MultiplePCA.
#' @param axes A list of  integers, the new factorial axes c(1,2) are the default.
#' @param indiv A list of objects (rows) of data to plot. Default=0 all the objects of data.
#' @param var An integer indicating an original histogrma variable to plot.
#' @param strx a resizing factor for the domain of histograms (default=0.1 means 
#' that each distribution has a support that is one tenth of the spread of the x axis)
#' @param stry a resizing factor for the density of histograms (default=0.1 means 
#' that each distribution has a density that is one tenth of the spread of the y axis)
#' @param HISTO a logical value. Default=TRUE plots histograms, FALSE plot smooth densities.
#' @param coor (optional) if 0 (Default) takes the coordinates in res, if a a matrix is passed the coordinates are those passed
#' @param stat (optional) if 'mean'(Default) a plot of individuals labeled by the means is produced.
#' Otherwise if 'std', 'skewness' or 'kurtosis', data are labeled with this statistic. 
#' @return a plot of class ggplot
#' @examples
#' #Do a MultiplePCA on the BLOOD dataset
#' \dontrun{
#' #' results=WH.MultiplePCA(BLOOD,list.of.vars = c(1:3)) 
#' #Plot histograms of variable 1 of BLOOD dataset on the first 
#' #factorial plane showing histograms
#' WH.plot_multiple_indivs(BLOOD,results,axes=c(1,2),var=1,strx=0.1,
#'  stry=0.1, HISTO=TRUE)
#' #Plot histograms of variable 1 of BLOOD dataset on the first 
#' #factorial plane showing densities

#' WH.plot_multiple_indivs(BLOOD,results,axes=c(1,2),var=1,strx=0.1,
#'  stry=0.1, HISTO=FALSE)
#'  }

#' @export 
WH.plot_multiple_indivs=function(data,res,axes=c(1,2),indiv=0,var=1, 
                                 strx=0.1, stry=0.1, HISTO=TRUE, coor=0,
                                 stat="mean"){
  #require(ggplot2)
  
  if (indiv[1]==0){
    el_ind=c(1:get.MatH.nrows(data))
  } else{
    el_ind=indiv
  }
  if (length(coor)==1){
    coord=res$ind$coord[el_ind,axes]}else{
      coord=coor
    }
  # browser()
  
  if (HISTO){
    x=numeric()
    y=numeric()
    ID_O=numeric()  
    minx=numeric()
    mea=numeric()
    rmax=-Inf
    for (i in el_ind){
      
      tmp_r=max(data@M[i,var][[1]]@x)-data@M[i,var][[1]]@x[1]+1e-100
      mm=data@M[i,var][[1]]@m
      if (tmp_r>rmax){rmax=tmp_r}
      vv=COMP_POLY_H(data@M[i,var][[1]])
      x=c(x,min(vv$x),vv$x,max(vv$x))
      y=c(y,0,vv$y,0)
      ID_O=c(ID_O,rep(i,length(vv$x)+2))
      mea=c(mea,rep(mm,length(vv$x)+2))
      minx=c(minx,rep(min(vv$x),length(vv$x)+2))
    }
    
  }else
  { 
    # compute densities
    pt=300
    x=numeric()
    y=numeric()
    ID_O=numeric()  
    minx=numeric()
    mea=numeric()
    rmax=-Inf
    
    for (i in el_ind){
      vals=COMP_MQ(data@M[i,var][[1]],Mp=c(0:pt)/pt)
      mm=data@M[i,var][[1]]@m
      tmp_r=max(vals)-min(vals)
      if (tmp_r>rmax){rmax=tmp_r}
      vv=density(vals,n = 128,from=min(vals), to=max(vals))
      x=c(x,min(vv$x),vv$x,max(vv$x))
      y=c(y,0,vv$y,0)
      ID_O=c(ID_O,rep(i,length(vv$x)+2))
      mea=c(mea,rep(mm,length(vv$x)+2))
      minx=c(minx,rep(min(vals),length(vv$x)+2))
    }
    
    ### end of compute density
  }
  ID_o=ID_O
  x_o=x
  y_o=y
  ALL_OBJ=data.frame(x_o=x_o,y_o=y_o,ID_o=ID_o,minx=minx,mea=mea)
  limX=range(ALL_OBJ$x_o)
  
  EXTRY=quantile(ALL_OBJ$y_o,probs = 0.995)
  #bisogna traslare ogni oggetto e scalarlo
  #scaliamo gli oggetti tra 0 e 1
  ALL_OBJ$x=(ALL_OBJ$x_o-minx)/(rmax)
  #tagliamo picchi eccessivi
  ALL_OBJ$y[ALL_OBJ$y_o>EXTRY]=EXTRY
  limY=quantile(ALL_OBJ$y_o,probs = 0.99)
  ALL_OBJ$y=ALL_OBJ$y_o/limY
  #ogni oggetto va traslato e rimpicciolito di un fattore strech x strech y
  dim1MAX=max(coord[,1])
  dim1Min=min(coord[,1])
  rX=(dim1MAX-dim1Min)
  dim2MAX=max(coord[,2])
  dim2Min=min(coord[,2])
  rY=(dim2MAX-dim2Min)
  cc=0
  for (i in el_ind){
    cc=cc+1
    xc=coord[cc,1]
    yc=coord[cc,2]
    tmp_x=ALL_OBJ$x_o[which(ALL_OBJ$ID_o==i)]
    tmp_y=ALL_OBJ$y_o[which(ALL_OBJ$ID_o==i)]
    tmp_x=tmp_x*strx*rX
    tmp_y=tmp_y*stry*rY
    tmp_x=(tmp_x-mean(tmp_x))+xc
    tmp_y=tmp_y+yc
    ALL_OBJ$x_o[which(ALL_OBJ$ID_o==i)]=tmp_x
    ALL_OBJ$y_o[which(ALL_OBJ$ID_o==i)]=tmp_y
  }
  
  
  title=paste("Plot of individuals: distributions for variable",
              colnames(data@M)[var])
  labX=paste("Axis ",axes[1]," (", format(res$eig[axes[1],2], 
                                          digits=2, nsmall=2), "%)")
  labY=paste("Axis ",axes[2]," (", format(res$eig[axes[2],2], 
                                          digits=2, nsmall=2), "%)")
  
  colnames(coord)=c('x','y')
  coord=as.data.frame(coord)
  p=ggplot(data=ALL_OBJ,aes(x=x_o,y=y_o))+
    geom_polygon(aes(group=ID_o, fill=as.factor(ID_o),alpha=0.4),colour='black')+
    geom_point(data=coord,aes(x=x,y=y))+
    geom_text(data=coord,
              aes(x=x,y=y,label=rownames(coord),
                  hjust=0.5, vjust=1))+theme_bw()+
    xlab(labX)+ylab(labY)+
    ggtitle(title)+
    theme(legend.position="none")+
    theme(plot.title = element_text(hjust = 0.5))+
    geom_vline(xintercept = 0)+geom_hline(yintercept = 0)
  ### add a line for mean snd sd
  Ms=get.MatH.stats(data[el_ind,var],stat=stat)$mat
  # SD=get.MatH.stats(data[el_ind,var],stat="std")$mat
  # Skew=get.MatH.stats(data[el_ind,var],stat="std")$mat
  # kurtH()=get.MatH.stats(data[el_ind,var],stat="std")$mat
  ### print a plot with means
  
  coord=cbind(coord,Ms)
  colnames(coord)[[3]]=stat
  
  p2=ggplot(ALL_OBJ,aes(x=x_o,y=y_o))+
    scale_fill_gradient(low = "yellow", high = "red")+
    geom_polygon(aes(group=ID_o, fill=mea,alpha=0.4),colour='black')+
    geom_point(data=coord,aes(x=x,y=y))+theme_bw()+
    geom_text(data=coord,
              aes(x=x,y=y,
                  label=format(mean,digits=1,nsmall=2),
                  hjust=0.5, vjust=1))+
    xlab(labX)+ylab(labY)+
    ggtitle(paste(title,"each label is the ",stat,"." ))+
    theme(legend.position="none")+
    theme(plot.title = element_text(hjust = 0.5))+
    geom_vline(xintercept = 0)+geom_hline(yintercept = 0)
  
  #+ geom_abline(intercept = 0, slope = cor(coord)[1,3]*sd(coord$x),linetype=2)
  # browser()
  print(p)
  print(p2)
  
  return(pp=list(p,p2))
}

COMP_MQ=function(object,Mp){
  res=numeric()
  n=length(Mp)
  
  for (i in 1:n){
    p=Mp[i]
    #Check for errors
    if (p<0 || p>1) stop("p must be a value between 0 and 1")
    
    if (p<=0) {
      q=object@x[1]
    }else{
      if (p>=1) {q=object@x[length(object@x)]}
      else{
        ini=max(object@x[object@p<=p])
        pos1=which.max(object@x[object@p<=p])
        pos2=pos1+1
        fin=object@x[pos2];
        if (ini==fin){
          q=ini
        }
        else{
          q=ini+(object@x[pos2]-object@x[pos1])*(p-object@p[pos1])/(object@p[pos2]-object@p[pos1])
        }
      }
      res=c(res,q)
    }
  }
  return(res)
}

COMP_POLY_H=function(object){
  #browser()
  x=object@x[1]
  y=0
  if (length(object@x)<2){
    x=c(x,x,x)
    y=c(y,1,0)}
  else{
    for (i in 2:length(object@x)){
      tmp_dens=(object@p[i]-object@p[i-1])/(object@x[i]-object@x[i-1]+1e-100)
      x=c(x,object@x[i-1],object@x[i])
      y=c(y,tmp_dens,tmp_dens)
    }
    x=c(x,object@x[length(object@x)])
    y=c(y,0)
  }
  
  return(res=data.frame(x=x,y=y))
}

Try the HistDAWass package in your browser

Any scripts or data that you put into this service are public.

HistDAWass documentation built on Dec. 7, 2017, 5:03 p.m.