R/variable_clustering.R

# Functions wrapping the variable clustering used in ClustOfVar

#' Explore the clustering structure in a set of variables
#'
#' This function wraps the clustering functionality from ClustOfVar into a more
#' user-friendly format. Use this function to perform the exploratory
#' heirarchical clustering, with the aim of deciding the number of final clusters
#' to use
#'
#' @param dat data.frame/tbl containing the variables to cluster. Be sure to only
#'        include input variables; no internal filtering is performed
#' @return an object of class \code{mtClust} containing elements \code{hclust},
#'         the object showing the heirarchical clustering, and \code{stab},
#'         showing the graph used to select the number of clusters. You can call
#'         \code{plot} (which in turn calls \code{plot.hclustvar} or
#'         \code{plot.clustab}) on either output to see the plots, which is useful
#'         for deciding how many clusters to keep
#' @details
#' The function internally tries to avoid copying of the input data as much as
#' possible, however the implementation of \code{ClustOfVar} makes an extraordinary
#' number of copies by default. This method may be prohibitively slow/memory
#' intensive on large datasets. The output of this function is designed to be used
#' as input to \code{variable_cluster}, which actually maps variables to clusters
#' @family clustering
#' @seealso \code{\link{variable_cluster}}
#' @export
#' @import magrittr
#' @examples
#' exploratory_cluster(iris)
#'

exploratory_cluster <- function(dat) {
  ### Split dataset into categorical + numeric variables
  is_num <- sapply(dat,is.numeric) # Which columns are numeric?

  ds <- list(
    quanti = data.frame(dat[,is_num]),
    quali = data.frame(dat[,!is_num])
  )

  colnames(ds$quanti) <- colnames(dat)[is_num]
  colnames(ds$quali) <- colnames(dat)[!is_num]

  ### Apply hclustvar
  # Note: by default this returns the original dataframe; avoid this behaviour
  if (ncol(ds$quali) == 0) {
    hclust_init <- ClustOfVar::hclustvar(X.quanti = ds$quanti)[c("call",
                                                               "rec",
                                                               "merge",
                                                               "height",
                                                               "order",
                                                               "labels",
                                                               "clusmat")]
  } else if (ncol(ds$quanti) == 0) {
    hclust_init <- ClustOfVar::hclustvar(X.quali = ds$quali)[c("call",
                                                               "rec",
                                                               "merge",
                                                               "height",
                                                               "order",
                                                               "labels",
                                                               "clusmat")]
  } else {
    hclust_init <- ClustOfVar::hclustvar(X.quanti = ds$quanti,
                                        X.quali = ds$quali)[c("call",
                                                              "rec",
                                                              "merge",
                                                              "height",
                                                              "order",
                                                              "labels",
                                                              "clusmat")]
  }

  # Reinstate the class of the original hclust object
  # This will facilitate plotting
  class(hclust_init) <- c("hclustvar","hclust")

  # However, this will not allow for calculating stability
  # Function to re-form the object with the correct class:
  reform_hclust <- function(obj) {
    obj[["X.quanti"]] <- obj$rec$X.quanti
    obj[["X.quali"]] <- obj$rec$X.quali
    class(obj) <- c("hclustvar","hclust")
    return(obj)
  }

  ### Cluster stability
  hclust_stability <- ClustOfVar::stability(reform_hclust(hclust_init),
                                            B = 25)


  ### Return a list with the hclust and stability objects
  clust_out <- list(
    hclust = hclust_init,
    stab = hclust_stability
  )

  class(clust_out) <- "mtClust"
  return(clust_out)

}

#' Map variables to clusters determined using heirarchical clustering
#'
#' This function takes the output of \code{exploratory_clusters} and maps
#' the variables in the dataset to a user-specified number of clusters.
#' @param clust An object of class "mtClust" obtained as output from the
#'        \code{exploratory_cluster} function
#' @param G Number of clusters, from \code{1} to \code{p} where \code{p} is
#'        the number of variables in the dataset
#' @return a tbl with two columns: \code{variable}, listing the variable,
#'         names, and \code{cluster}, giving a numeric indication of the
#'         clusters. Variables with the same value of \code{cluster}
#'         belong to the same cluster
#' @details As with other functions in the \code{modellingTools} package,
#'          this function mainly serves to wrap the functionality of the
#'          variable clustering provided by \code{ClustOfVars}, providing
#'          a consistent input/output interface. This function must be
#'          combined with the \code{exploratory_cluster} function to work;
#'          selecting the number of clusters is a subjective process that
#'          should not be automated
#' @seealso \code{\link{exploratory_cluster}}
#' @export
#' @import magrittr
#' @examples
#' clust <- exploratory_cluster(iris)
#' variable_cluster(clust,2)

variable_cluster <- function(clust,G) UseMethod("varclust")

varclust.mtClust <- function(clust,G) {
  clusters <- ClustOfVar::cutreevar(clust$hclust,
                                    k = G)$cluster
  return(dplyr::data_frame(
    "variable" = names(clusters),
    "cluster" = clusters
  ))
}
awstringer/modellingTools documentation built on May 11, 2019, 4:11 p.m.