R/LexHCca.R

Defines functions LexHCca

Documented in LexHCca

#' @import flexclust
#' @export
LexHCca <- function(x, cluster.CA="docs", nb.clust="click",  min=2, max=NULL, kk=Inf, 
                    consol=FALSE, iter.max=500, graph=TRUE, description=TRUE, proba=0.05, 
                    nb.desc=5, size.desc=80, seed=12345,...)
  
{
  dots <- list(...)
  if(is.null(dots$method)) method.sel <- "ward"
    else method.sel <-  dots$method
    

    object <- x
if (!inherits(object, "LexCA")) stop("object should be LexCA class")
  options(stringsAsFactors = FALSE)
  
  marg.doc<-"before"
 if(description==FALSE)  nb.desc <-0
  
#---- Initial checks -----------------------------------------
  ## Type selection number of clusters
if(is.character(nb.clust))
  nb.clust <- ifelse(nb.clust == "auto", -1,0)
 # if(nb.clust=="click") nb.clust<- 0
 #  if(nb.clust=="auto") nb.clust<- -1
  
  
# if(nb.clust>0) if(kk!=Inf) stop("For k-means nb.clust must be 0 (user selection) or -1 (automatic selection)")
# graph.scale = add top right the barplot
  graph.scale="inertia"

  if ((kk != Inf) & (consol == TRUE)) {
    warning("No consolidation has been done after the hierarchical clustering since kk is different from Inf")				
    consol <- FALSE
  }	
  ### nb.clust can be a number; 0 is click; -1 is automatic
  
  ##
  if (min <2) min<-2
  metric = "euclidean"
  if (cluster.CA=="rows" ) cluster.CA <- "docs"
  if (cluster.CA=="columns" ) cluster.CA <- "words"
 #---- End Initial checks -----------------------------------------


  
#---- F1. auto.cut.tree function -----
  auto.cut.tree <- function(res, min, max, metric, method=method.sel, weight = NULL, cla = NULL, ...)
  {
    set.seed(seed)
    X <- as.data.frame(res$ind$coord)
    do <- dist(X, method = metric)^2			
    eff <- outer(weight, weight, FUN = function(x, y, n) {x * y/n/(x + y)}, n = sum(weight))
    dissi <- do * eff[lower.tri(eff)]
    hc <- flashClust::hclust(dissi, method = method, members = weight)
    inert.gain <- rev(hc$height)
    intra <- rev(cumsum(rev(inert.gain)))
    quot <- intra[min:(max)]/intra[(min-1):(max-1)]
    nb.clust <- which.min(quot) + min-1
    rm(.Random.seed,envir=globalenv()) # Retrieve set.seed
    return(list(res = res, tree = hc, nb.clust = nb.clust,
                within = intra, inert.gain = inert.gain, quot = quot))
  }	  # End Function auto.cut.tree  
  
  

  
#---- F2. print.inertia function ----
  print.inertia <- function(title) {
    if (!nzchar(Sys.getenv("RSTUDIO_USER_IDENTITY"))) 
      dev.new()
    old.mar <- par()$mar
    par(mar = c(0.5, 2, 0.75, 0))
    lay = matrix(ncol = 5, nrow = 5, c(2, 4, 4, 4, 4, 
                                       2, 4, 4, 4, 4, 2, 4, 4, 4, 4, 2, 4, 4, 4, 4,1, 3, 3, 3, 3))
    layout(lay, respect = TRUE)
    barplot(rev(t$tree$height)[1:max(15, max)], col = c(rep("black", 
                                                            t$nb.clust - 1), rep("grey", max(max, 15) - t$nb.clust + 1)), rep(0.1, max(max, 15)), space = 0.9)
    plot(x = 1, xlab = "", ylab = "", main = "", col = "white",axes = FALSE)
    text(1,1,title,cex=2)
    plot(x = 1, xlab = "", ylab = "", main = "", col = "white",axes = FALSE)
    legend("top", "Aggreg. Crit", box.lty = NULL, cex = 1)
  }    # End function print.inertia
  
  


#---- F3. sum of squares ----
  ssquares <- function(coord, weight_,clust_){ # }, weight_, clust_) {
    dfw <- data.frame(coord,"weight_"=weight_, "clust_"=clust_)
        # write.csv2(dfw,file="C:/Xplortext/dfw.csv")
        sum.weights <- sum(dfw$weight_)
        pct.weight <- dfw$weight_/ sum.weights  
        dfclust <- as.data.frame(table(clust_))
        dimen <- ncol(coord)
    mediadim <- apply(dfw[,c(1:dimen)]*pct.weight,2,FUN=sum)
    dfw$sstot_ <- apply(sweep(dfw[,c(1:dimen)],2,mediadim,"-")^2  *pct.weight,1,FUN="sum")
    cl.weight  <- aggregate(pct.weight, by=list(dfw$clust_), FUN=sum)$x
    coord_centers <- aggregate(dfw[,c(1:dimen)]*pct.weight/cl.weight[dfw$clust_],by=list(dfw$clust_), FUN=sum) 
    dfw$ssintra_ <- apply(((dfw[,c(1:dimen)] - coord_centers[clust_, c(2:ncol(coord_centers))])^2),1,sum) *pct.weight
    dfw$ssinter_ <- dfw$sstot_-dfw$ssintra_
    dfw$ssexplain_ <- 100*dfw$ssinter_ / dfw$sstot_ 
    dfclust$weight <- cl.weight*sum.weights 
    dfclust$sstot <- aggregate( dfw$sstot_,by=list(dfw$clust_), FUN=sum)$x
    dfclust$ssintra <- aggregate( dfw$ssintra_,by=list(dfw$clust_), FUN=sum)$x
    dfclust$ssinter <- dfclust$sstot-dfclust$ssintra
    dfclust$ssexplain <- 100*dfclust$ssinter / dfclust$sstot 
    ss.tot <- sum(dfclust$sstot)  
    ss.intra <- sum(dfclust$ssintra)
    ss.inter <- sum(dfclust$ssinter)
    pct.explained <- 100* ss.inter / ss.tot

       if (cluster.CA == "docs") {							
      colnames(dfclust) <- c("Cluster", "Docs", "Occurrences", "ss.tot", "ss.intra", "ss.between", "%Explained")							
    } else {							
      colnames(dfclust) <- c("Cluster", "Words", "Occurrences", "ss.tot", "ss.intra", "ss.between", "%Explained")							
    }	
     dfw <- dfw[,c((ncol(dfw)-5):ncol(dfw))]
     colnames(dfw) <- c("Weight","Cluster", "ss.tot", "ss.intra", "ss.between","%Explained")
    
    res.ss <- list("ss.tot"=ss.tot, "ss.intra"=ss.intra,"ss.inter"= ss.inter, "coord.centers"=coord_centers, "dfw"=dfw,
                   "dfclust"=dfclust)
    return(res.ss)
  }  # End function F3. sum of squares
  
  
  
  #---- F4. Consolidation function ----
  consolidationW <- function(coordinates, cluster_i,  weights =NULL, method="hardcl", iter.max = iter.max, ...) {		
    # X son las coordenadas documents x dims
    # iter.max <- iter.max+1
     # init_centers <- as.matrix(centers[, c(2:ncol(centers))])
    iter <- 0
    cluster_iminus1 <- 1

    while(identical(cluster_i, cluster_iminus1) == F && iter < iter.max){
      iter <- iter + 1			 # update iteration	
      suppressWarnings(
        ck_flex <- flexclust::cclust(coordinates,k=cluster_i, method="hardcl",weights= weights))
       # w_centers <- as.matrix(ck_flex@centers)
      cluster_iminus1 <- cluster_i
      cluster_i <- ck_flex@cluster
    } # End while
    return(ck_flex@cluster)
  } 
  
  #---- F4 End Consolidation function ----
  
  #---- F5 descfreq_New ----
  descfreq_New <- function (donnee, by.quali = NULL, proba = 0.05, row.INIT, col.INIT)
  {
    lab <- colnames(donnee)
    for (i in 1:length(lab)) lab[i] = gsub(" ", ".", lab[i])
    if (!is.null(by.quali)) {
      donnee <- as.data.frame(matrix(unlist(by(donnee, by.quali,apply, 2, sum)), ncol = ncol(donnee), byrow = TRUE))				
      rownames(donnee) <- levels(by.quali)
      marge.li <- aggregate(row.INIT, by=list(by.quali), FUN="sum")
      marge.li <- as.vector(marge.li$x)
    }
    colnames(donnee) = lab
    old.warn = options("warn")
    options(warn = -1)				
    nom = tri = structure(vector(mode = "list", length = nrow(donnee)), names = rownames(donnee))				
    marge.col=col.INIT
    SumTot=sum(marge.li)
    nr <- nrow(donnee)
    nc <- ncol(donnee)
    for (j in 1:nr) {
      aux3 = marge.li[j]/SumTot
      for (k in 1:nc) {
        aux2 = donnee[j, k]/marge.col[k]
        
        if (aux2 > aux3) # Over representation
          aux4 <- phyper(donnee[j, k] - 1, marge.col[k],SumTot - marge.col[k], marge.li[j],lower.tail=FALSE)*2
        else aux4 = phyper(donnee[j, k], marge.col[k], SumTot - marge.col[k], marge.li[j])*2
        
        if (aux4 > 1)
          aux4 <- 2 - aux4
        if (aux4 < proba) {
          aux5 <- (1-2 * as.integer(aux2 > aux3)) * qnorm(aux4/2)
          aux1 = donnee[j, k]/marge.li[j]
          tri[[j]] = rbind(tri[[j]], c(aux1 * 100, sum(marge.col[k])/SumTot *100, donnee[j, k], marge.col[k], aux4, aux5))
          nom[[j]] = rbind(nom[[j]], c(colnames(donnee)[k], colnames(donnee)))
        }				
      }		
    }			
    
    for (j in 1:nrow(donnee)) {
      if (!is.null(tri[[j]])) {
        oo = rev(order(tri[[j]][, 6]))
        tri[[j]] = tri[[j]][oo, ]
        nom[[j]] = nom[[j]][oo, ]
        if (nrow(matrix(tri[[j]], ncol = 6)) > 1)
          rownames(tri[[j]]) = nom[[j]][, 1]
        else {
          tri[[j]] = matrix(tri[[j]], ncol = 6)
          rownames(tri[[j]]) = nom[[j]][1]
        }
        colnames(tri[[j]]) = c("Intern %", "glob %", "Intern freq","Glob freq ", "p.value", "v.test")
      }
    }
    names(tri) <-paste("cluster",1:length(tri),sep="_")
    res <- tri
    options(old.warn)
    class(res) <- c("descfreq", "list ")
    return(res)				
  }	
  #---- End descfreq_New function -------------------------------
  
  
  
  ########################################################################################  
  # Starting analysis ------------------------------------------------------------  
  ######################################################################################## 


   #---- 1. Weighting documents (after/before) and words ----
  doc.w.after <- apply(object$call$X,1,sum)    # Initial weighting for documents after selection
  #nm <- names(doc.w.after)
  # doc.w.before before word's selection
  #doc.w.before <- object$rowINIT[which(rownames(object$rowINIT) %in% nm) ,]
  #names(doc.w.before) <- nm
  word.w <- apply(object$call$X, 2, sum) 
    #if(marg.doc=="before") doc.w <- doc.w.before else doc.w <- doc.w.after # before word's selection


  #---- 2. Selecting words or documents to cluster  ----
  if (cluster.CA == "docs") {
    weight.dw <- doc.w.after # 
    coord.dw <- object$row$coord
  } else {
    weight.dw <- word.w
    coord.dw <- object$col$coord
  }   
 
  
  #---- 3. Building PCA object ----
  # Check if kmeans will be performed
  # Operations with kk
  #  If kk = Inf => hierarchical cluster
  #  If kk is not Inf => k-means cluster. If k-means coordinates * sqrt(eigenvalue)   
  ################# Selecting cluster of rows (documents) or columns (words) 
  cla <- res <- NULL	
  
  
  
  if (kk < nrow(coord.dw)) {  # Kmeans
    res <- as.data.frame(sweep(coord.dw, 2, sqrt(object$eig[1:ncol(coord.dw), 1]), FUN = "*"))
    kk <- min(kk, nrow(unique(coord.dw)))
  } else kk=Inf				


   if (!is.null(res)) {	# It is k-means. res is a dataframe coordinates active documents (or words) x dimensions	
    if (kk < nrow(res)) kk <- min(kk, nrow(unique(res)))	
    if (kk <= nrow(res)) {	
      # Weighted k-means (no with Factoclass) Do it with flexclust   
      set.seed(seed)
      cla <- flexclust::cclust(res,k=kk, method="hardcl",weights= weight.dw)
      cla <- data.frame(cla@cluster) # documents/words win names x 1 column whith number of cluster	
      cla <- cbind(res, weight.dw ,cla[rownames(res),,drop=FALSE])
      cla1 <- cbind(cla[,c(1:(ncol(cla)-2))]*cla[,(ncol(cla)-1)],cla[,c((ncol(cla)-1):ncol(cla))])
      cla.agg <- aggregate(cla1[,c(1:(ncol(cla1)-1))], by=list(cla.cluster=cla$cla.cluster), FUN=sum)	
      cla.centers <-  cla.agg[,c(2:(ncol(cla.agg)-1))] /cla.agg[,ncol(cla.agg)]  
      res <- FactoMineR::PCA(cla.centers, row.w = cla.agg[,ncol(cla.agg)], scale.unit = FALSE, ncp = Inf, graph = FALSE)
      rm(.Random.seed,envir=globalenv()) # Retrieve set.seed
    }          # end      if (kk <= nrow(res)) 
  } # Final of k-means
  

  # If kk >= nrow(word) now a hierarchical cluster is performed, res don't exist
  if(kk==Inf){ ## It is hierarchical cluster, no k-means
    aux <- object$eig			
    res <- FactoMineR::PCA(coord.dw, scale.unit = FALSE, ncp = Inf, graph = FALSE, row.w = weight.dw)	
    res$eig <- aux			
  }  # Now, all res object are PCA objects type;  End Building PCA object



    
  
  
  #---- 4. Checking the minimum and maximum number of clusters ----
  # max min  "ind" is documents if cluster.CA="docs", words if cluster.CA="words"
  nb.ind <- nrow(coord.dw)			## Cases (docs o words) 

  if (is.null(max))    max <- min(10, round(nb.ind/2))			
  max <- min(max, nb.ind  - 1)  
  if(max<2) {
    stop("The maximum number of clusters (",max,") can not be lower than 2")
  }
  if(max < min) {
    warning("The minimum number of clusters (",min,") can not be bigger than the maximum (",max,"). 
              Automatic change: min=max")				
    min<- max
  }   # End max min
  

  

  
  

  
  #---- 5. Building the tree to select number clusters -----   
  # Retrieve the tree with cases if hierarchical or groups if k-means	
  # row.w.init has the weighting from rows from res object of PCA
  # method=ward, metric=euclidean , res$call$row.w.init is absolute frequency after
  
    t <- auto.cut.tree(res, min = min, max = max, metric = metric,method.sel = method.sel,
                     weight = res$call$row.w.init, cla = cla, order = order, ...)	
 
   # tree can be plotted plot(t$tree)
  # We need to select the number of clusters
  # Only is necessary the plot if nb.clust = click (0 value)

  if (graph.scale == "inertia") {
    nb.ind.clI <- nrow(t$res$ind$coord)
    inertia.height <- rep(0, nb.ind.clI - 1)							
    for (i in 1:(nb.ind.clI - 1)) inertia.height[i] <- t$inert.gain[(nb.ind.clI -i)]							
    inertia.height <- sort(inertia.height, decreasing = FALSE)	
    t$tree$height <- inertia.height							
  }			# end of graph.scale =="inertia"
  
  
  
  
  #---- 6. Check number clusters (max,min) before selection ----
  options(warn=0)
  if (nb.clust == 1)  #
  {
    warning("The number of clusters can not be 1. Automatic change: nb.clust=",-1)				
    nb.clust <- -1
  }  
  if(nb.clust>1)
    if(kk<=nb.clust) {
      warning("The number of clusters used in a Kmeans preprocessing before the hierarchical clustering
              is less than the number of clusters nb.clust. Automatic change: nb.clust=",kk-1)				
      nb.clust <- kk-1
    } 
  
  if(nb.clust>max) {
    warning("The number of selected clusters is bigger than the number of maximum clusters.
              Automatic change: max=", min(nb.clust, nrow(res$ind$coord) - 1))				
    max<- min(nb.clust,nrow(res$ind$coord) - 1)
    nb.clust <- max  # New instruction
  }    
  
 
  
  #---- 7. Height for proposed number of clusters, necessary to plot for selection ----
  auto.haut <- ((t$tree$height[length(t$tree$height) - t$nb.clust + 2]) +		
                  (t$tree$height[length(t$tree$height) - t$nb.clust + 1]))/2 
  
  #---- 8. Print Inertia gain graph for selection case ----
  if (nb.clust==0 ) {
    par(mfrow=c(1,1))			
    print.inertia("Hierarchical Clustering")
    print("Click on the graph to cut the tree")	
    flush.console()							
    plot(t$tree, hang = -1, main = "Click to cut the tree", xlab = "", sub = "")
    text(1, 1, "Hierarchical Clustering", "cex" = 2)	
    abline(h = auto.haut, col = "black", lwd = 3)	
    coupe <- locator(n = 1)							
    while (coupe$y < min(t$tree$height)) {							
      cat("No class \n")							
      coupe <- locator(n = 1)							
    }						    # End while	
    y <- coupe$y							
    # y is a value of height, value of xy axis							
    # If it is not manual selections inertia gain graph is not plotted
  } else {
    if (nb.clust < 0) 	
      y <- auto.haut	
    else     y <- (t$tree$height[length(t$tree$height) - nb.clust + 2] + 
                t$tree$height[length(t$tree$height) - nb.clust + 1])/2	
  }

  # y has the point of cut
  # composition of clusters (if kmeans not the cases)
  clust <- cutree(as.hclust(t$tree), h = y)	

  # integer with the number of the cluster
  # The name is the label of the doc/word 
  nb.clust <- max(clust)	
  
  
### REVISAR SI ordColo es necesario después
  # Order of clusters in dendrogram (duplicates are removed)
  ordColo <- unique(clust[t$tree$order])
  
  
  ### REVISAR SI X es necesario después
    X <- as.data.frame(t$res$ind$coord)	   # Coordinates Docs or words x dimensions, If kmeans groups x dimensions
  #---- End Print Inertia gain graph for selection case ----
  
  
  #---- 9. df.coord.cla docs x  (Dim.1;...;Dim k; weight cluster ----
  if(kk==Inf) {  # Jerárquico
    df.coord.cla <- data.frame(X, weight= res$call$row.w.init[rownames(X)], cluster=clust[rownames(X)])
  }
  if(kk!=Inf) {   # Kmedias
    df.coord.cla <-   data.frame(coord.dw, weight= cla[rownames(coord.dw),"weight.dw"], cluster=clust[cla$cla.clust])                               #cluster=cla[rownames(coord.dw),"cluster"])
  }


    
  #---- 10 Print dendrogram - Selection of the number of the clusters has been done, cla ----
  if(graph) if(kk!=Inf) {
    warning("Warning: Dendrogram is not plotted for kmeans method")
   # graph <- FALSE
  }
  if(graph) if(consol) {
    warning("Warning: Dendrogram is not plotted for consolidation method")
  #  graph <- FALSE
  }

  if(graph & kk==Inf) {
    par(mfrow=c(1,1))
    print.inertia("Hierarchical Clustering")
    plot(t$tree, hang = -1, main = "", xlab = "", sub = "")
    text(1, 1, "Hierarchical Clustering", "cex" = 2)
    rect <- rect.hclust(t$tree, h = y, border = ordColo)
    clust <- NULL
    for (j in 1:nb.clust) clust <- c(clust, rep(j, length(rect[[j]])))
    clust <- as.factor(clust)
    belong <- cbind.data.frame(t$tree$order, clust)
    belong <- belong[do.call("order", belong), ]
    clust <- as.factor(belong$clust)
    names(clust) <- rownames(X)
    if (nzchar(Sys.getenv("RSTUDIO_USER_IDENTITY")))				
      layout(matrix(nrow = 1, ncol = 1, 1), respect = TRUE)
  }
  
  
    
    
  #---- 11. Consolidation if hierarchical ----			
 if(consol==TRUE){
   clust.afterconsol <- consolidationW(df.coord.cla[,c(1:(ncol(df.coord.cla)-2))], cluster_i=df.coord.cla[,"cluster"],
                                               weights=df.coord.cla[,"weight"] , iter.max = iter.max)
   df.after <- as.data.frame( clust.afterconsol ) 
   dftemp<- data.frame(df.coord.cla[,c(1:(ncol(coord.dw)))],  weight=df.coord.cla[,"weight"], cluster=df.coord.cla[,"cluster"])
   dftemp$cluster <- df.after[rownames(dftemp),"clust.afterconsol"]
   df.coord.cla <- dftemp
}

    
  #---- 12. Computing centers. Building RDO object  ----			
  RDO <-   ssquares(df.coord.cla[,c(1:(ncol(coord.dw)))],  df.coord.cla[,"weight"], df.coord.cla[,"cluster"]) 
  
  RDO <- list("data.clust"=data.frame(df.coord.cla[,c(1:(ncol(coord.dw)))], "clust"=df.coord.cla[,"cluster"]),   
              "centers"=RDO$coord.centers, "clust.count"=RDO$dfclust, "clust.content"=RDO$dfw,
              "global"= list("ss.tot"=RDO$ss.tot, "ss.intra"=RDO$ss.intra,"ss.inter"= RDO$ss.inter,			
                             "%Explained"= 100*RDO$ss.inter/RDO$ss.tot), cluster.CA=cluster.CA)
  RDO$CA <- object
  class(RDO) <- "LexHCca"
  # End 12. Computing centers. Building RDO object 
  

  
  
  
  
  

  #---- 14. Weighting documents (after/before) and words ----
  #  doc.w.after <- apply(object$call$X,1,sum)    # Initial weighting for documents after selection
  nm <- names(doc.w.after)
  # doc.w.before before word's selection
  doc.w.before <- object$rowINIT[which(rownames(object$rowINIT) %in% nm) ,]
  names(doc.w.before) <- nm
  # word.w <- apply(object$call$X, 2, sum) 
  if(marg.doc=="before") doc.w <- doc.w.before else doc.w <- doc.w.after # before word's selection
  # num.docs <- nrow(object$call$X)
  # num.words <- ncol(object$call$X)
  # Final 14 weighting docs
 
#  doc.w.after <- apply(object$call$X,1,sum)    # Initial weighting for documents after selection
  
  
  
  
  # tabInd coordinates + cluster if active docs/words
  if (cluster.CA == "docs"){ 
    descr.data <- object$call$X   # words in columns
    descr.cl <- data.frame("cluster_"=df.coord.cla[rownames(descr.data),"cluster"])  # drop=FALSE
    rownames(descr.cl) <- rownames(descr.data)
    tabInd <- cbind.data.frame(object$row$coord, clust= as.factor(df.coord.cla[rownames(descr.data),"cluster"]))
  }else{ 
    descr.data <- t(object$call$X)
    descr.cl <- data.frame("cluster_"=df.coord.cla[rownames(descr.data),"cluster"] ) # drop=FALSE
    rownames(descr.cl) <- rownames(descr.data)
    tabInd <- cbind.data.frame(object$col$coord, clust=as.factor(df.coord.cla[rownames(descr.data),"cluster"]))
  }

  
  
  
  
  
  
  desc.ind <- descaxes <- descword <- descwordsup <- descdoc <- descquali <- descquanti <- NULL
  #---- 15. descword, Cluster description by words and descdoc ----
  if(description) { 
    
    if(cluster.CA=="docs") {
      descr.w.rows <- doc.w[rownames(descr.data)]
      descr.w.cols <- word.w[colnames(descr.data)]
      # Cluster Description by active words
      descword <- descfreq_New(descr.data, descr.cl$cluster_,prob=proba, row.INIT= descr.w.rows, col.INIT=descr.w.cols) 
    } else { 
      descr.w.rows <- word.w[rownames(descr.data)]
     # descr.w.cols <- doc.w[colnames(descr.data)]
      # Cluster Description by active documents # Using doc.w.after
      descdoc <- descfreq_New(descr.data, descr.cl$cluster_ ,prob=proba, row.INIT= descr.w.rows, col.INIT=doc.w.after) 
    } 
    #----  End descword, Cluster description by words and descdoc ----    


    
    
    #---- Document description by qualitative supplementary variables ----
    ### Por defecto es before...
    if(!is.null(object$quali.sup$coord)) {
      if(cluster.CA=="docs") {
    #    descr.cl <- data.frame("cluster_"=df.coord.cla[rownames(descr.data),"cluster"])  # drop=FALSE
        a1 <- object$call$Xtot[rownames(descr.data), rownames(object$quali.sup$eta2), drop=FALSE]
        a2 <- cbind(a1, "clust_"=as.factor(descr.cl[rownames(a1),"cluster_"]))
         descquali <- FactoMineR::catdes(a2, num.var=ncol(a2), proba = proba, row.w=doc.w[rownames(a2)])
         names(descquali$category) <-paste("cluster",1:length(descquali$category),sep="_")
      }}
    #---- End Document description by qualitative supplementary variables ----    
   
    #---- Document description by quantitative supplementary variables ----
    if(!is.null(object$quanti.sup$coord)) {
    
      if(cluster.CA=="docs") {
        a1 <- object$call$Xtot[rownames(descr.data), rownames(object$quanti.sup$coord), drop=FALSE]
        a2<- cbind(a1, "clust_"=as.factor(descr.cl[rownames(a1),"cluster_"]))
        descquanti <- FactoMineR::catdes(a2, num.var=ncol(a2), proba = proba, row.w=doc.w[rownames(a2)])
        if(!is.null(descquanti$quanti)>0)  
          names(descquanti$quanti) <-paste("cluster",1:length(descquanti$quanti),sep="_")
      }}
    #---- End Document description by quantitative supplementary variables ----
    
    
    #---- Description of axes ----
    if(cluster.CA=="docs") {
      descaxes <- data.frame(cbind(object$row$coord,clust_=as.factor(descr.cl[rownames(object$row$coord),"cluster_"])))
      descaxes$clust_ <- as.factor(descaxes$clust_)
      descaxes <- FactoMineR::catdes(descaxes, num.var=ncol(descaxes), proba = proba, row.w=doc.w[rownames(descaxes)])  
    } else { # columnas
      descaxes <- data.frame(cbind(object$col$coord,clust_=as.factor(clust[rownames(object$col$coord)])))
      descaxes$clust_ <- as.factor(descaxes$clust_)
      descaxes <- FactoMineR::catdes(descaxes, num.var=ncol(descaxes), proba = proba, row.w=word.w[rownames(descaxes)]) 
    }
    
 
  
    if(nb.desc>0){
      # Calculo de los centros
      # list.centers <- by(tabInd[, -ncol(tabInd), drop = FALSE], tabInd[, ncol(tabInd)], colMeans)
      # centers <- matrix(unlist(list.centers), ncol = ncol(tabInd) - 1, byrow = TRUE)
      #---- Function select --------------------------------
      select <- function(Y, default.size, method, coord.centers) {
        clust <- as.numeric(Y[1, ncol(Y)])
        Y <- Y[, -ncol(Y), drop = FALSE]
        colnames(coord.centers) <- colnames(Y)
        Z <- rbind(Y, coord.centers)

        if (nrow(Y) == 1) {  #No elements
          distance <- data.frame(0, row.names = "")
          colnames(distance) <- rownames(Z[1, ])
        }
        else {
          distance <- as.matrix(dist(Z, method = method))
          distance <- distance[(nrow(Y) + 1):nrow(distance), 
                               -((nrow(Y) + 1):ncol(distance))]
          distance <- sort(distance[clust, ], decreasing = FALSE)
        }
        if (length(distance) > default.size) 
          distance <- distance[1:default.size]
        else distance <- distance
      }

            #---- Function distinctivness  ---------------------
      distinctivness <- function(Y, default.size, method, coord.centers) {
        clust <- as.numeric(Y[1, ncol(Y)])
        Y <- Y[, -ncol(Y), drop = FALSE]	
        colnames(coord.centers) <- colnames(Y)
        Z <- rbind(Y, coord.centers)
        if (nrow(Y) == 1) {
          distance <- as.matrix(dist(Z, method = method))
          ind.car <- vector(length = 1, mode = "numeric")
          ind.car <- min(distance[-c(1, (clust + 1)), 1])
          names(ind.car) <- rownames(Z[1, ])		
        }
        else {
          distance <- as.matrix(dist(Z, method = method))
          distance <- distance[(nrow(Y) + 1):nrow(distance),-((nrow(Y) + 1):ncol(distance))]
          if (nrow(distance) == 2)
            center.min <- distance[-clust, ]
          else center.min <- apply(distance[-clust, ], 2, min)
          ind.car <- sort(center.min, decreasing = TRUE)		
        }	
        if (length(ind.car) > default.size)
          ind.car = ind.car[1:default.size]
        else ind.car = ind.car
      } # Final distinctivness
      #---- End Function distinctivness  ---------------------
      
      m_centers <- data.matrix(RDO$centers)[,-1]
      Cluster <- tabInd[,ncol(tabInd), drop=FALSE]
      para <- by(tabInd, Cluster, simplify = FALSE, select, 
                 default.size = nb.desc, method = metric, coord.centers = m_centers)
      dist <- by(tabInd, Cluster, simplify = FALSE, distinctivness, 
                 default.size = nb.desc, method = metric, coord.centers = m_centers)
      desc.ind <- list(para = para, dist = dist)

    } # End nb.par
  }  
  #---- Final Descriptions ----
  

  newcall <- list(t = t, min = min, max = max, X = tabInd[,-ncol(tabInd)], vec = FALSE, call = match.call(), 
                  cluster.CA=cluster.CA)  
  
  df.tmp <- cbind(descr.data,descr.cl)
  colnames(df.tmp)[ncol(df.tmp)] <- "Clust_"

  colnames(RDO$centers)[1] <- "Clust_" 
  newres <- list(data.clust = df.tmp, centers =as.data.frame(RDO$centers), clust.count = RDO$clust.count, 
                 clust.content = RDO$clust.content, ss=RDO$global, call = newcall)
  

  if (!is.null(descaxes)) {
    class(descaxes) <- NULL
    newres$description$desc.axes <- descaxes
  }
  if (cluster.CA == "docs") {
    if (!is.null(descword)) 
      newres$description$desc.cluster.doc <- list(words = descword, 
                                                  wordsup = descwordsup, qualisup = descquali, 
                                                  quantisup = descquanti)
  }
  else {
    if (!is.null(descdoc)) 
      newres$description$desc.cluster.word <- list(docs = descdoc)
  }
  
  

  
  
  if (!is.null(object$info)) {
    if (nb.desc > 0) {
      if (is.null(desc.ind$para)) 
        stop("Please, use description=TRUE \n  to obtain a description of the clusters by the characteristic documents ")
     

      if(!is.null(object$var.agg)) vbaggr <- TRUE else vbaggr <- FALSE
      
      if(vbaggr==FALSE) {
      var.text <- object$info$var.text[[1]]
      str.base <- object$info$base[[1]]
      str.envir <- object$info$menvir[[1]]
      base <- get(str.base, envir = str.envir)
      corpus <- base[, var.text[1]]
      if (length(var.text) > 1) {
        for (i in 2:length(var.text)) {
          corpus <- paste(corpus, base[, var.text[i]], sep = ".")
        }
      }
      }
      if(vbaggr==FALSE) {
      corpus <- data.frame(corpus, stringsAsFactors = FALSE)
      rownames(corpus) <- rownames(base)
      }
      
     
      lispara <- vector(mode = "list")
   
      for (iclus in 1:nb.clust) {
        doctot <- length(desc.ind$para[[iclus]])
        ntdoc <- min(nb.desc, doctot)
        lispara[[iclus]] <- data.frame()
        for (i in 1:ntdoc) {
          lispara[[iclus]][i, 1] <- names(desc.ind$para[[iclus]][i])
          lispara[[iclus]][i, 2] <- desc.ind$para[[iclus]][i]

          if (cluster.CA == "docs") { if(vbaggr==FALSE)
            lispara[[iclus]][i, 3] <- strtrim(corpus[lispara[[iclus]][i,1], 1], size.desc)
            else lispara[[iclus]][i, 3] <- ""
          }
        }

        if (cluster.CA == "docs") {
          colnames(lispara[[iclus]]) <- c("DOCUMENT", "CRITERION", "TEXT") 
        }
        else colnames(lispara[[iclus]]) <- c("WORD", "CRITERION")
      }
      names(lispara) <- paste("cluster", 1:nb.clust, sep = "_")
      if (cluster.CA == "docs") 
        newres$description$desc.cluster.doc$para <- lispara
      else newres$description$desc.cluster.word$para <- lispara
      lisdist <- vector(mode = "list")
      for (iclus in 1:nb.clust) {
        doctot <- length(desc.ind$dist[[iclus]])
        ntdoc <- min(nb.desc, doctot)
        lisdist[[iclus]] <- data.frame()
        for (i in 1:ntdoc) {
          lisdist[[iclus]][i, 1] <- names(desc.ind$dist[[iclus]][i])
          lisdist[[iclus]][i, 2] <- desc.ind$dist[[iclus]][i]
          if (cluster.CA == "docs") 
          { if(vbaggr==FALSE)
            lisdist[[iclus]][i, 3] <- strtrim(corpus[lisdist[[iclus]][i, 1], 1], size.desc)
          else lisdist[[iclus]][i, 3] <-""
        }}
        if (cluster.CA == "docs") 
          colnames(lisdist[[iclus]]) <- c("DOCUMENT","CRITERION", "TEXT")
        else colnames(lisdist[[iclus]]) <- c("WORD", "CRITERION")
      }
      names(lisdist) <- paste("cluster", 1:nb.clust, sep = "_")
      if (cluster.CA == "docs") 
        newres$description$desc.cluster.doc$dist <- lisdist
      else newres$description$desc.cluster.word$dist <- lisdist
    }
  }
  newres$call$CA <- object
  newres$coord.clust <- cbind(newres$call$X, "Clust_"=newres$data.clust[rownames(newres$call$X),"Clust_"])
  
  class(newres) <- c("LexHCca")
  #---- 13. Plot Factorial Plane with centers ----
  if(graph){
    p<-plot.LexHCca(newres, type="map", draw=c("points","centers"))
    plot(p)
  }
  # 13. Plot Factorial Plane with centers
  
return(newres)

}

Try the Xplortext package in your browser

Any scripts or data that you put into this service are public.

Xplortext documentation built on Nov. 10, 2023, 1:06 a.m.