R/zoo_hc.R

Defines functions zoo_hc

Documented in zoo_hc

#' Hierarchical clustering of the NISP data
#' @name zoo_hc
#' @description Hierarchical clustering of the NISP data by periods
#'
#' @param df a dataframe
#' @param num_column the column name of assemblage numbers
#' @param site_column the column name of assemblage sites
#' @param period_column the column name of assemblage periods 
#' @param percBOTA_column the column name of assemblage percents of BOTA
#' @param percOC_column the column name of assemblage percents of OC
#' @param percSUDO_column the column name of assemblage percents of SUDO
#' @param typSite_column the column name of assemblage site types
#' @param hc.meth the used HC method
#' @param hdend the maximum height of the dendrogram
#' @param labels_cex the size of the labels (site numbers)
#' @param leaves_cex the size of the symbols (site symbols)
#' @param axis_text the size of the axis labels
#' @param per.label.ang the angle of the period label
#' @param per.label.sz the size of the period label
#' @param lorder_period a vector with the ordered periods
#' @param typsit_symb a dataframe with the symbology
#' @param export.plot if TRUE, save the plot
#' @param hc.name the name of the output plot if saved
#' @param plot.width,plot.height,plot.dpi the dimensions and resolution of the plot if saved
#' @param verbose if TRUE (default): verbose
#' 
#' @return a gglot with the different HC by period
#'
#' @examples
#' 
#' df <- zoo_read()
#' lorder_period <- zoo_order_period(df)
#' typsit_symb <- zoo_legends(worksheets = c("sites_types"))
#' zoo_hc(df = df, lorder_period = lorder_period, typsit_symb = typsit_symb,
#'        dir.out = "C:/Rprojects/zoowork/www/",
#'        plot.width = 6, plot.height = 10,
#'        per.label.sz = 2)
#' 
#' @export
zoo_hc <- function(df = NA,
                   num_column = "num",
                   site_column = "site",
                   period_column = "period",
                   percBOTA_column = "percBOTA",
                   percOC_column = "percOC",
                   percSUDO_column = "percSUDO",
                   typSite_column = "type.site",
                   hc.meth = "complete",
                   hdend = 130,
                   labels_cex = .3,
                   leaves_cex = 1.2,
                   axis_text = 4,
                   per.label.ang = 270,
                   per.label.sz = 3,
                   lorder_period = NA,
                   typsit_symb = NA,
                   export.plot = T,
                   dirOut = paste0(system.file(package = "zoowork"), "/results/"),
                   hc.name = "hc.png",
                   plot.width = 19.5,
                   plot.height = 22.2,
                   plot.dpi = 300,
                   verbose = TRUE){
  ldendro <- list() # to store the trees
  lmaxheight <- c() # to store the max height of the tree 
  ldendro_tsit <- list()
  lheights_tsit <- c()
  # the following part is ~ the same as zoo_ca()
  # - - - - - - - - - - - - - - -
  perCA_tsit <- data.frame(perCA1 = 0,
                           perCA2 = 0,
                           per = "xx")
  ca_all_tsite <- data.frame(num = 'xx',
                             site = 'xx',
                             type.site = 'xx',
                             CA1 = 0,
                             CA2 = 0,
                             percBOTA = 0,
                             percSUDO = 0,
                             percOC = 0,
                             per = 'xx',
                             shape = 0,
                             color = 'xx')
  for (per in lorder_period){
    # per <- "MIA2"
    if(verbose){print(per)}
    df_per <- df[df[ , period_column] %in% per,] # sélectionne sur périodes
    df_per <- df_per[complete.cases(df_per), ]
    row.names(df_per) <- df_per[ , num_column]
    df_lda.per <- df_per[ , c(percBOTA_column, percSUDO_column, percOC_column, typSite_column)]
    if(nrow(df_per) > 3){
      if(verbose){print("  - run HC")}
      xdat <- df_lda.per[ , -which(names(df_lda.per) %in% c(typSite_column))]
      ca <- FactoMineR::CA(xdat,graph = FALSE)            # AFC
      inertCA1 <- round(as.numeric(ca$eig[, 2][1]), 1)
      inertCA2 <- round(as.numeric(ca$eig[, 2][2]), 1)
      # show %
      perCA_tsit <- rbind(perCA_tsit, data.frame(perCA1 = inertCA1,
                                                 perCA2 = inertCA2,
                                                 per = per))
      coords_ind_ca <- as.data.frame(ca$row$coord)
      coords_var_ca <- as.data.frame(ca$col$coord)
      coords_ca <- rbind(coords_ind_ca,coords_var_ca)
      colnames(coords_ca)[1] <- 'CA1'
      colnames(coords_ca)[2] <- 'CA2'
      dataset.p <- merge(df_lda.per, coords_ca, by = "row.names", all.y = T)
      dataset.ps <- merge(dataset.p, typsit_symb, by.x = typSite_column, by.y = "tsite", all.x = T)
      dataset.ps$per <- per
      dataset.ps$color <- as.character(dataset.ps$color)
      # - - - - - - - - - - - - - - -
      dataset.ps$shape <- as.factor(dataset.ps$shape)
      names(dataset.ps)[names(dataset.ps) == 'Row.names'] <- num_column
      df_per_site <- df_per[ , c(site_column, num_column)]
      ff <- merge(dataset.ps, df_per_site, by = num_column, all.x = T)
      # ff <- ff[ , colnames(ca_all_tsite)]
      matches <- colnames(ca_all_tsite) # reorder
      ff <- ff[ ,match(matches, colnames(ff))]
      ca_all_tsite <- rbind(ca_all_tsite,ff)
      dd <- na.omit(ff) # rm var
      rownames(dd) <- dd[ , num_column]
      dd[ , num_column] <- NULL
      xxdat <- dd[ , c(percBOTA_column, percSUDO_column, percOC_column)]
      symb_dd <- merge (xxdat, ff, by.x = "row.names", by.y = num_column)
      colnames(symb_dd)[which(names(symb_dd) == paste0(percBOTA_column, ".x"))] <- percBOTA_column
      colnames(symb_dd)[which(names(symb_dd) == paste0(percOC_column, ".x"))] <- percOC_column
      colnames(symb_dd)[which(names(symb_dd) == paste0(percSUDO_column, ".x"))] <- percSUDO_column
      rownames(symb_dd) <- symb_dd$Row.names
      #colnames(symb_dd)[which(names(symb_dd)=="Row.names")] <- 'num'
      symb_dd[ , paste0(percBOTA_column, ".y")] <- NULL
      symb_dd[ , paste0(percOC_column, ".y")] <- NULL
      symb_dd[ , paste0(percSUDO_column, ".y")] <- NULL
      # symb_dd$Row.names <- symb_dd$percBOTA.y <- symb_dd$percSUDO.y <- symb_dd$percOC.y <- NULL
      dend1 <- symb_dd[ , c(percBOTA_column, percSUDO_column, percOC_column)] %>%
        dist %>% hclust(method = hc.meth) 
      ldendro[[length(ldendro) + 1]] <- dend1 # store
      lmaxheight <- c(lmaxheight, max(dend1$height))
      dend1 <-  dend1 %>% as.dendrogram  # cr?? un 'dendrogram'
      # r?ordonne le tab de donn?es apr?s le clustering
      symb_dd$ord <- seq(1, nrow(symb_dd)) # ord initial
      symb_dd <- symb_dd[match(order.dendrogram(dend1), symb_dd$ord), ] # reord
      dend1 <- dend1 %>%
        dendextend::set("branches_lwd", .2) %>%
        dendextend::set("labels_cex", labels_cex) %>% #.3
        dendextend::set("leaves_pch", as.numeric(as.character(symb_dd$shape))) %>%  # node point type
        dendextend::set("leaves_cex", leaves_cex) %>%  # 1.2
        dendextend::set("leaves_col", symb_dd$color)
      ggd1 <- dendextend::as.ggdend(dend1) # tranform en 'ggdend'
      ggdend <- ggplot2::ggplot(ggd1,
                                horiz = TRUE, 
                                theme = ggplot2::theme_minimal(),
                                offset_labels = -8) +
        # ggplot2::ylim(hdend, 0) + # fixe l'emprise pour comparer plusieurs dendro
        ggplot2::theme(panel.grid.major.y = ggplot2::element_blank(),
                       panel.grid.minor.y = ggplot2::element_blank(),
                       axis.text.y = ggplot2::element_blank(),
                       axis.title.y = ggplot2::element_blank(),
                       axis.title.x = ggplot2::element_blank(),
                       axis.text.x = ggplot2::element_text(size = axis_text)) + 
        ggplot2::annotate(geom = "text",  # write the period
                          x = as.integer(nrow(symb_dd)/2),  # au milieu du jeu de donn?es
                          y = hdend,
                          vjust = 0,
                          hjust = 0,
                          angle = per.label.ang,
                          label = unique(symb_dd$per),
                          size = per.label.sz) +
        ggplot2::theme(plot.margin = ggplot2::unit(c(0, 0, 0, 0), "pt"))+
        ggplot2::scale_y_continuous(breaks = seq(hdend, 10, by = -20))
      ldendro_tsit[[length(ldendro_tsit) + 1]] <- ggdend # add dendro
      lheights_tsit <- c(lheights_tsit, nrow(xxdat)) # dendro size
    } else {
      if(verbose){
        print(paste0("There's only ", nrow(df_per),
                     " individual in the dataframe, no HC can be computed"))
      }
    }
  }
  if(export.plot){
    dir.create(dirOut, showWarnings = FALSE)
    gout <- paste0(dirOut, hc.name)
    ggplot2::ggsave(file = gout, 
                    gridExtra::arrangeGrob(grobs = ldendro_tsit, ncol = 1, heights = lheights_tsit),
                    width = plot.width, height = plot.height, units = "cm",
                    dpi = plot.dpi)  ## save plot
    if(verbose){print(paste0("The plot '", hc.name,"' has been saved in '", dirOut,"'"))}
  } else {
    return(ldendro_tsit)
  }
}  
zoometh/zoowork documentation built on Aug. 21, 2022, 5:11 a.m.