R/hclust.R

Defines functions genPredictHclust predict.hclust exportHClustTab showModelHClustExists displayHClustStats plotDendrogram plotDendrogram2 centers.hclust executeClusterHClust on_hclust_discriminant_plot_button_clicked on_hclust_data_plot_button_clicked on_hclust_stats_button_clicked on_hclust_dendrogram_button_clicked on_hclust_radiobutton_toggled

Documented in centers.hclust on_hclust_data_plot_button_clicked on_hclust_dendrogram_button_clicked on_hclust_discriminant_plot_button_clicked on_hclust_radiobutton_toggled on_hclust_stats_button_clicked predict.hclust

# Gnome R Data Science: GNOME interface to R for Data Science
#
# Time-stamp: <Sunday 2026-02-08 14:46:29 +1100 Graham Williams>
#
# Implement hclust functionality.
#
# Copyright (c) 2009-2018 Togaware Pty Ltd
#
# This files is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.

########################################################################
# CALLBACKS

# When a radio button is selected, display the appropriate tab page.

on_hclust_radiobutton_toggled <- function(button)
{
  if (button$getActive())
    crv$CLUSTER$setCurrentPage(crv$CLUSTER.HCLUST.TAB)
  setStatusBar()
}

on_hclust_dendrogram_button_clicked <- function(button)
{
  if (theWidget("use_ggplot2")$getActive())
    plotDendrogram2()
  else
    plotDendrogram()
}

on_hclust_stats_button_clicked <- function(button)
{
  set.cursor("watch", Rtxt("Determining the cluster statistics...."))
  try(displayHClustStats())
  set.cursor("left-ptr", Rtxt("Cluster statistics displayed. Scroll to see all."))
}

on_hclust_data_plot_button_clicked <- function(button)
{

  # Make sure there is a cluster first.

  if (is.null(crs$hclust))
  {
    errorDialog(Rtxt("No cluster to plot.",
                     "The button should not have been sensitive."),
                crv$support.msg)
    return()
  }

  # Some background information.  Assume we have already built the
  # cluster, and so we don't need to check so many conditions.

  sampling  <- not.null(crs$train)
  num.clusters <- theWidget("hclust_clusters_spinbutton")$getValue()
  nums <- seq(1,ncol(crs$dataset))[as.logical(sapply(crs$dataset, is.numeric))]
  if (length(nums) > 0)
  {
    indicies <- getVariableIndicies(crs$input)
    include <- simplifyNumberList(intersect(nums, indicies))
  }

  if (length(nums) == 0 || length(indicies) == 0)
  {
    errorDialog(Rtxt("Clusters are currently calculated only for numeric data.",
                     "No numeric variables were found in the dataset",
                     "from amongst those having an input/target/risk role."))
    return()
  }

  # We can only plot if there is more than a single variable.

  if (length(intersect(nums, indicies)) == 1)
  {
    infoDialog(Rtxt("A data plot can not be constructed",
                    "because there is only one numeric variable available",
                    "in the data."))
    return()
  }

  # PLOT: Log the R command and execute.

  set.cursor("watch", Rtxt("Determining the cluster statistics...."))
  plot.cmd <- paste(sprintf(paste("plot(crs$dataset[%s, %s], ",
                                  "col=cutree(crs$hclust, %d))\n",
                                  sep=""),
                            ifelse (sampling, "crs$train", ""), include,
                            num.clusters),
                    genPlotTitleCmd(""))
  appendLog(Rtxt("Generate a data plot."), plot.cmd)
  newPlot()
  eval(parse(text=plot.cmd))

  set.cursor("left-ptr", Rtxt("Data plot has been generated."))
}

on_hclust_discriminant_plot_button_clicked <- function(button)
{

  # Make sure there is a cluster first.

  if (is.null(crs$hclust))
  {
    errorDialog(Rtxt("No cluster to plot.",
                     "The button should not have been sensitive."),
                crv$support.msg)
    return()
  }

  # Some background information.  Assume we have already built the
  # cluster, and so we don't need to check so many conditions.

  sampling  <- not.null(crs$train)
  num.clusters <- theWidget("hclust_clusters_spinbutton")$getValue()
  nums <- seq(1,ncol(crs$dataset))[as.logical(sapply(crs$dataset, is.numeric))]
  if (length(nums) > 0)
  {
    indicies <- getVariableIndicies(crs$input)
    include <- simplifyNumberList(intersect(nums, indicies))
  }

  if (length(nums) == 0 || length(indicies) == 0)
  {
    errorDialog(Rtxt("Clusters are currently calculated only for numeric data.",
                     "No numeric variables were found in the dataset",
                     "from amongst those having an input/target/risk role."))
    return()
  }

  # We can only plot if there is more than a single variable.

  if (length(intersect(nums, indicies)) == 1)
  {
    infoDialog(Rtxt("A discriminant coordinates plot can not be constructed",
                    "because there is only one numeric variable available",
                    "in the data."))
    return()
  }

  # PLOT: Log the R command and execute.

 plot.cmd <- paste(sprintf(paste("cluster::clusplot(na.omit(crs$dataset[%s, %s]), ",
                                 "cutree(crs$hclust, %d), ",
                                 "color=TRUE, shade=TRUE, ",
                                 "main='Discriminant Coordinates ",
                                 crs$dataname, "')\n", sep=""),
                           ifelse(sampling, "crs$train", ""), include, num.clusters))
  appendLog(Rtxt("Generate a discriminant coordinates plot."), plot.cmd)
  newPlot()
  eval(parse(text=plot.cmd))

  setStatusBar(Rtxt("Discriminant coordinates plot has been generated."))
}

########################################################################
# EXECUTION

executeClusterHClust <- function(include)
{
}

centers.hclust <- function(x, object, nclust=10, use.median=FALSE)
{
  # TODO 20170725 Does this actually work? Clearly needs work.

  if (!inherits(object, "hclust")) { stop(Rtxt("Not a legitimate hclust object")) }

  if (! "matrix" %in% class(x)) { x <- as.matrix(x) }

  if (use.median)
  {
    centres <- round(tapply(x, list(rep(cutree(object, nclust), ncol(x)), col(x)), median))
  }
  else
  {
    centres <- tapply(x, list(rep(cutree(object, nclust), ncol(x)), col(x)), mean)
  }
  dimnames(centres) <- list(NULL, dimnames(x)[[2]])

  return(centres)
}

plotDendrogram2 <- function()
{

  # Make sure there is a hclust object first.

  if (is.null(crs$hclust))
  {
    errorDialog(Rtxt("No Hierarchical Cluster to plot."), crv$support.msg)
    return()
  }

  startLog(Rtxt("Dendrogram Plot"))

  # Load the required package into the library.

  lib.cmd <- "library(ggplot2, quietly=TRUE)"
  if (! packageIsAvailable("ggplot2", Rtxt("plot a dendrogram"))) return(FALSE)
  appendLog(packageProvides("ggplot2", "ggplot"), lib.cmd)
  eval(parse(text=lib.cmd))

  lib.cmd <- "library(ggdendro, quietly=TRUE)"
  if (! packageIsAvailable("ggdendro", Rtxt("plot a dendrogram"))) return(FALSE)
  appendLog(packageProvides("ggdendro", "dendro_data"), lib.cmd)
  eval(parse(text=lib.cmd))

  # Show a busy cursor whilst drawing the plot.

  set.cursor("watch", Rtxt("Rendering the hierarchical cluster dendrogram...."))
  on.exit(set.cursor("left-ptr", ""))

  ttl <- genPlotTitleCmd(Rtxt("Cluster Dendrogram"), crs$dataname, vector=TRUE)
  plot.cmd <- paste('ddata <- dendro_data(crs$hclust, type="rectangle")',
                    'g <- ggplot(segment(ddata))',
                    'g <- g + geom_segment(aes(x = y, y = x, xend = yend, yend = xend))',
                    'g <- g + scale_y_discrete(labels = ddata$label$label)',
                    'g <- g + labs(x="Height", y="Observation")',
                    paste("g <- g +",
                          sprintf('ggtitle(expression(atop("%s", atop(italic("%s")))))',
                                  ttl[1], ttl[2])),
                    "print(g)",
                    sep="\n")

  # Log the R command and execute.

  appendLog(Rtxt("Generate the dendrogram plot."), plot.cmd)
  newPlot()
  eval(parse(text=plot.cmd))

  # TODO 130311 How to identify the clusters in the plot, if
  # specified.

  ## nclust <- theWidget("hclust_clusters_spinbutton")$getValue()
  ## if (nclust > 1 && nclust <= length(crs$hclust$height))
  ## {
  ##   rect.cmd <- sprintf("rect.hclust(crs$hclust, k=%d)", nclust)
  ##   appendLog(Rtxt("Add in rectangles to show the clusters."), rect.cmd)
  ##   eval(parse(text=rect.cmd))
  ## }
}

plotDendrogram <- function()
{

  # Make sure there is a hclust object first.

  if (is.null(crs$hclust))
  {
    errorDialog(Rtxt("There is no Hierarchical Cluster yet we are",
                     "trying to plot it."), crv$support.msg)
    return()
  }

  # Load the required package into the library.  The library, cba,
  # should already be loaded. But check anyhow.

  lib.cmd <- "library(cba, quietly=TRUE)"
  if (! packageIsAvailable("cba", Rtxt("plot a dendrogram"))) return(FALSE)
  appendLog(packageProvides("cba", "plot"), lib.cmd)
  eval(parse(text=lib.cmd))

  # Show a busy cursor whilst drawing the plot.

  set.cursor("watch", Rtxt("Rendering the hierarchical cluster dendrogram...."))
  on.exit(set.cursor("left-ptr", ""))

  # Generate the plot command to not print the xaxis labels if there
  # are too many observations.

  if (length(crs$hclust$order) > 100)
    limit <- ", labels=FALSE, hang=0"
  else
    limit <- ""
  plot.cmd <- paste(sprintf('plot(crs$hclust, main="", sub="", xlab=""%s)\n',
                            limit),
                    genPlotTitleCmd(Rtxt("Cluster Dendrogram"), crs$dataname),
                    sep="")

  # Log the R command and execute.

  appendLog(Rtxt("Generate a dendrogram plot."), plot.cmd)
  newPlot()
  eval(parse(text=plot.cmd))

  # Identify the clusters in the plot, if specified.

  nclust <- theWidget("hclust_clusters_spinbutton")$getValue()
  if (nclust > 1 && nclust <= length(crs$hclust$height))
  {
    rect.cmd <- sprintf("rect.hclust(crs$hclust, k=%d)", nclust)
    appendLog(Rtxt("Add in rectangles to show the clusters."), rect.cmd)
    eval(parse(text=rect.cmd))
  }
}

displayHClustStats <- function()
{
  # Initial setup.

  TV <- "hclust_textview"

  # Make sure there is a cluster first.

  if (is.null(crs$hclust))
  {
    errorDialog(Rtxt("No cluster to plot.",
                     "The button should not have been sensitive."),
                crv$support.msg)
    return()
  }

  # The fpc package is available for cluster.stats().

  if (!packageIsAvailable("fpc", Rtxt("calculate cluster statistics"))) return()
  lib.cmd <- "library(fpc, quietly=TRUE)"
  appendLog(packageProvides("fpc", "cluster.stats"), lib.cmd)
  eval(parse(text=lib.cmd))

  # 20090323 Don't reset the textview since we want to retain the
  # build information.

  # 090323 REMOVE resetTextview(TV)

  # Some background information.  Assume we have already built the
  # cluster, and so we don't need to check so many conditions.

  nclust <- theWidget("hclust_clusters_spinbutton")$getValue()
  sampling  <- not.null(crs$train)
#  nums <- seq(1,ncol(crs$dataset))[as.logical(sapply(crs$dataset, is.numeric))]
#  if (length(nums) > 0)
#  {
#    indicies <- getVariableIndicies(crs$input)
#    include <- simplifyNumberList(intersect(nums, indicies))
#  }
#
#  if (length(nums) == 0 || length(indicies) == 0)
#  {
#    errorDialog("Clusters are currently calculated only for numeric data.",
#                "No numeric variables were found in the dataset",
#                "from amongst those having an input/target/risk role.")
#    return()
#  }

  include <- "crs$numeric" # 20110102 getNumericVariables()

  # Cluster centers. 20180522 Remove the na.omit - it is crashing -
  # reported by Tony Nolan.

  #centers.cmd <- sprintf("centers.hclust(na.omit(crs$dataset[%s, %s]), crs$hclust, %d)",
  centers.cmd <- sprintf("centers.hclust(crs$dataset[%s, %s], crs$hclust, %d)",
                       ifelse(sampling, "crs$train", ""), include, nclust)
  appendLog(Rtxt("List the suggested cluster centers for each cluster"), centers.cmd)
  appendTextview(TV, Rtxt("Cluster means:"), "\n\n",
                 collectOutput(centers.cmd, use.print=TRUE))

  # STATS: Log the R command and execute. 20170430 add
  # silhouette=FALSE since it is failing today with the error:
  #
  # Error in silhouette.default(clustering, dmatrix = dmat) :
  # object 'sildist' not found

  # 20180522 Remove the na.omit - it is crashing - reported by Tony
  # Nolan.

  #stats.cmd <- sprintf(paste("cluster.stats(dist(na.omit(crs$dataset[%s, %s])),",
  stats.cmd <- sprintf(paste("cluster.stats(dist(crs$dataset[%s, %s]),",
                             "cutree(crs$hclust, %d), silhouette=FALSE)\n"),
                       ifelse(sampling, "crs$train", ""), include,
                       nclust)
  appendLog(Rtxt("Generate cluster statistics using the fpc package."), stats.cmd)
  appendTextview(TV, Rtxt("General cluster statistics:"), "\n\n",
                 collectOutput(stats.cmd, use.print=TRUE))

  setStatusBar(Rtxt("HClust cluster statistics have been generated."))
}

## THIS IS NOT EVEN RELATED TO hclust!!!! USES PAM

## on_hclust_seriation_button_clicked <- function(button)
## {

##   ## Make sure there is a hclust object first.

##   if (is.null(crs$hclust))
##   {
##     errorDialog("SHOULD NOT BE HERE.", crv$support.msg)
##     return()
##   }

##   ## The library, cba, should already be loaded. But check anyhow. I
##   ## think this is required for the seriation. Need to check.

##   lib.cmd <- "library(cba, quietly=TRUE)"
##   if (! packageIsAvailable("cba", "generate a seriation plot")) return()
##   appendLog(packageProvides("cba", "Seriation"), lib.cmd)
##   eval(parse(text=lib.cmd))

##   ## Some background information.

##   sampling  <- not.null(crs$train)
##   nums <- seq(1,ncol(crs$dataset))[as.logical(sapply(crs$dataset, is.numeric))]
##   if (length(nums) > 0)
##   {
##     indicies <- getVariableIndicies(crs$input)
##     include <- simplifyNumberList(intersect(nums, indicies))
##   }

##   plot.cmd <- paste("d <- dist(as.matrix(crs$dataset",
##                     sprintf("[%s, %s]",
##                             ifelse(sampling, "crs$train", ""),
##                             include),
##                     "))\n",
##                     "l <- pam(d, 10, cluster.only = TRUE)\n",
##                     "res <- cluproxplot(d, l, method = ",
##                     'c("Optimal", "Optimal"), plot = FALSE)\n',
##                     'plot(res, plotOptions = list(main = "PAM + ',
##                     'Seriation (Optimal Leaf ordering)", ',
##                     'col = terrain.colors(64)))', sep="")

##   appendLog("Generate a seriation plot.", plot.cmd)
##   newPlot()
##   eval(parse(text=plot.cmd))

##   setStatusBar("Seriation plot completed.")
## }

# 100424 Support resetting of widgets on loading a project.

showModelHClustExists <- function(state=!is.null(crs$hclust))
{
  # If a model exists then make available the appropriate widgets.

  theWidget("hclust_dendrogram_button")$setSensitive(TRUE)
  theWidget("hclust_clusters_label")$setSensitive(TRUE)
  theWidget("hclust_clusters_spinbutton")$setSensitive(TRUE)
  theWidget("hclust_stats_button")$setSensitive(TRUE)
  theWidget("hclust_data_plot_button")$setSensitive(TRUE)
  theWidget("hclust_discriminant_plot_button")$setSensitive(TRUE)

}

########################################################################
# EXPORT

exportHClustTab <- function()
{
  # Make sure we have a model first!

  if (noModelAvailable(crs$hclust, crv$HCLUST)) return(FALSE)

  # Get some required information

  sampling  <- not.null(crs$train)
  nclust <- theWidget("hclust_clusters_spinbutton")$getValue()
  include <- "crs$numeric" # 20110102 getNumericVariables()

  startLog(paste(Rtxt("Export"), commonName(crv$HCLUST)))

  save.name <- getExportSaveName(crv$HCLUST)
  if (is.null(save.name)) return(FALSE)
  ext <- tolower(get.extension(save.name))

  # Construct the command to produce PMML.

  pmml.cmd <- sprintf(paste("pmml(crs$hclust, centers=centers.hclust(",
                            "na.omit(crs$dataset[%s, %s]), crs$hclust, %d)%s)",
                            sep=""),
                      ifelse(sampling, "crs$train", ""), include, nclust,
                      ifelse(length(crs$transforms) > 0,
                             ", transforms=crs$transforms", ""))

  # We can't pass "\" in a filename to the parse command in
  # MS/Windows so we have to run the save/write command separately,
  # i.e., not inside the string that is being parsed.

  if (ext == "xml")
  {
    appendLog(Rtxt("Export hierarchical cluster as PMML."),
              sprintf('saveXML(%s, "%s")', pmml.cmd, save.name))
    XML::saveXML(eval(parse(text=pmml.cmd)), save.name)
  }
  else if (ext == "c")
  {
    if (isWindows()) save.name <- tolower(save.name)

    model.name <- sub("\\.c", "", basename(save.name))

    export.cmd <- generateExportPMMLtoC(model.name, save.name, "hclust_textview")

    appendLog(sprintf(Rtxt("Export %s as a C routine."), commonName(crv$HCLUST)),
              sprintf('pmml.cmd <- "%s"\n\n', pmml.cmd),
              export.cmd)

    eval(parse(text=export.cmd))

  }

  setStatusBar(sprintf(Rtxt("The model has been exported to '%s'."), save.name))

}

########################################################################
# SCORE

predict.hclust <- function(object, nclust=10, newdata, x, ...)
{
  # TODO 20170725 Need to actually get this function working!!! This
  # must take a newdata (the new dataset to be scored) and x as the
  # dataset the model was built on. Don't default x to newdata as it
  # is then misleading that x is required.
  #
  # 20170429 Shouldn't this use cutree? As Hamed Mamani noted the
  # resulting distribution of observations across the clusters is
  # different between what is reported by Stats button in the Cluster
  # tab and the Score file produced from the Evaluate
  # tab. Inconsistency is not good so use cutree() by default and if
  # the two datasets supplied then use predict.kmeans()
  #
  # 20090126 Initial work on a predict.hclust function, to allow using
  # a hclust model to allocate new DATA to pre-existing clusters that
  # are built from another dataset X. This uses the common model
  # interface function, predict. This makes it easy to use the Rattle
  # modelling code for hclust. We obtain the cluster centres to
  # identify clusters and add that as the hclust object's centers
  # attribute and then calculates distance and identify cluster
  # numbers using predict.kmeans(). This is only an
  # approximation. Gets pretty close for ward link and euclidean
  # distance.

  # Add centers to the model object and then use the kmeans version of
  # predict to calculate the nearest mean for each center/object.

  object$centers <- centers.hclust(x, object, nclust=nclust, use.median=FALSE)

  rownames(object$centers) <- seq_len(nclust)
  pr <- predict.kmeans(object, newdata) %>% as.integer

  return(pr)
}

genPredictHclust <- function(dataset)
{
  # 081227 Generate a command to obtain the prediction results when
  # applying the model to new data.

  nclust <- theWidget("hclust_clusters_spinbutton")$getValue()
  sampling  <- not.null(crs$train)
  include <- "crs$numeric" # 20110102 getNumericVariables()

  # 20170429 Use cutree version (now the default) of predict.hclust()
  # by default so that the resulting distribution of cluster sizes is
  # the same as that reported in the Textview of the Cluster tab when
  # Stats button is invoked.

  # 20170725 HACK FOR NOW!!! The dataset should be numeric only.

  dataset %<>% sub('input', 'numeric', .)

  pr.cmd <- sprintf("crs$pr <- predict(crs$hclust, nclust=%s, newdata=%s, x=%s)",
                    nclust,
                    dataset,
                    sub("validate", "train", dataset)) # 20170725 THIS IS HACK FOR NOW!

  return(pr.cmd)
}

genResponseHclust <- function(dataset)
{
  # 081227 Generate a command to obtain the response when applying the
  # model to new data.

  return(genPredictHclust(dataset))
}

genProbabilityHclust <- function(dataset)
{
  # 081227 Generate a command to obtain the probability when applying
  # the model to new data. There is probably a prblem with simply
  # using the cluster label as the output, since it won't be a
  # probability or even look like it. Let's do it for now though -
  # should be okay.

  return(genPredictHclust(dataset))
}

Try the rattle package in your browser

Any scripts or data that you put into this service are public.

rattle documentation built on Feb. 9, 2026, 1:07 a.m.