Nothing
# Gnome R Data Science: GNOME interface to R for Data Science
#
# Time-stamp: <Sunday 2026-02-08 14:46:29 +1100 Graham Williams>
#
# Implement hclust functionality.
#
# Copyright (c) 2009-2018 Togaware Pty Ltd
#
# This files is part of Rattle.
#
# Rattle is free software: you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# Rattle is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Rattle. If not, see <https://www.gnu.org/licenses/>.
########################################################################
# CALLBACKS
# When a radio button is selected, display the appropriate tab page.
on_hclust_radiobutton_toggled <- function(button)
{
if (button$getActive())
crv$CLUSTER$setCurrentPage(crv$CLUSTER.HCLUST.TAB)
setStatusBar()
}
on_hclust_dendrogram_button_clicked <- function(button)
{
if (theWidget("use_ggplot2")$getActive())
plotDendrogram2()
else
plotDendrogram()
}
on_hclust_stats_button_clicked <- function(button)
{
set.cursor("watch", Rtxt("Determining the cluster statistics...."))
try(displayHClustStats())
set.cursor("left-ptr", Rtxt("Cluster statistics displayed. Scroll to see all."))
}
on_hclust_data_plot_button_clicked <- function(button)
{
# Make sure there is a cluster first.
if (is.null(crs$hclust))
{
errorDialog(Rtxt("No cluster to plot.",
"The button should not have been sensitive."),
crv$support.msg)
return()
}
# Some background information. Assume we have already built the
# cluster, and so we don't need to check so many conditions.
sampling <- not.null(crs$train)
num.clusters <- theWidget("hclust_clusters_spinbutton")$getValue()
nums <- seq(1,ncol(crs$dataset))[as.logical(sapply(crs$dataset, is.numeric))]
if (length(nums) > 0)
{
indicies <- getVariableIndicies(crs$input)
include <- simplifyNumberList(intersect(nums, indicies))
}
if (length(nums) == 0 || length(indicies) == 0)
{
errorDialog(Rtxt("Clusters are currently calculated only for numeric data.",
"No numeric variables were found in the dataset",
"from amongst those having an input/target/risk role."))
return()
}
# We can only plot if there is more than a single variable.
if (length(intersect(nums, indicies)) == 1)
{
infoDialog(Rtxt("A data plot can not be constructed",
"because there is only one numeric variable available",
"in the data."))
return()
}
# PLOT: Log the R command and execute.
set.cursor("watch", Rtxt("Determining the cluster statistics...."))
plot.cmd <- paste(sprintf(paste("plot(crs$dataset[%s, %s], ",
"col=cutree(crs$hclust, %d))\n",
sep=""),
ifelse (sampling, "crs$train", ""), include,
num.clusters),
genPlotTitleCmd(""))
appendLog(Rtxt("Generate a data plot."), plot.cmd)
newPlot()
eval(parse(text=plot.cmd))
set.cursor("left-ptr", Rtxt("Data plot has been generated."))
}
on_hclust_discriminant_plot_button_clicked <- function(button)
{
# Make sure there is a cluster first.
if (is.null(crs$hclust))
{
errorDialog(Rtxt("No cluster to plot.",
"The button should not have been sensitive."),
crv$support.msg)
return()
}
# Some background information. Assume we have already built the
# cluster, and so we don't need to check so many conditions.
sampling <- not.null(crs$train)
num.clusters <- theWidget("hclust_clusters_spinbutton")$getValue()
nums <- seq(1,ncol(crs$dataset))[as.logical(sapply(crs$dataset, is.numeric))]
if (length(nums) > 0)
{
indicies <- getVariableIndicies(crs$input)
include <- simplifyNumberList(intersect(nums, indicies))
}
if (length(nums) == 0 || length(indicies) == 0)
{
errorDialog(Rtxt("Clusters are currently calculated only for numeric data.",
"No numeric variables were found in the dataset",
"from amongst those having an input/target/risk role."))
return()
}
# We can only plot if there is more than a single variable.
if (length(intersect(nums, indicies)) == 1)
{
infoDialog(Rtxt("A discriminant coordinates plot can not be constructed",
"because there is only one numeric variable available",
"in the data."))
return()
}
# PLOT: Log the R command and execute.
plot.cmd <- paste(sprintf(paste("cluster::clusplot(na.omit(crs$dataset[%s, %s]), ",
"cutree(crs$hclust, %d), ",
"color=TRUE, shade=TRUE, ",
"main='Discriminant Coordinates ",
crs$dataname, "')\n", sep=""),
ifelse(sampling, "crs$train", ""), include, num.clusters))
appendLog(Rtxt("Generate a discriminant coordinates plot."), plot.cmd)
newPlot()
eval(parse(text=plot.cmd))
setStatusBar(Rtxt("Discriminant coordinates plot has been generated."))
}
########################################################################
# EXECUTION
executeClusterHClust <- function(include)
{
}
centers.hclust <- function(x, object, nclust=10, use.median=FALSE)
{
# TODO 20170725 Does this actually work? Clearly needs work.
if (!inherits(object, "hclust")) { stop(Rtxt("Not a legitimate hclust object")) }
if (! "matrix" %in% class(x)) { x <- as.matrix(x) }
if (use.median)
{
centres <- round(tapply(x, list(rep(cutree(object, nclust), ncol(x)), col(x)), median))
}
else
{
centres <- tapply(x, list(rep(cutree(object, nclust), ncol(x)), col(x)), mean)
}
dimnames(centres) <- list(NULL, dimnames(x)[[2]])
return(centres)
}
plotDendrogram2 <- function()
{
# Make sure there is a hclust object first.
if (is.null(crs$hclust))
{
errorDialog(Rtxt("No Hierarchical Cluster to plot."), crv$support.msg)
return()
}
startLog(Rtxt("Dendrogram Plot"))
# Load the required package into the library.
lib.cmd <- "library(ggplot2, quietly=TRUE)"
if (! packageIsAvailable("ggplot2", Rtxt("plot a dendrogram"))) return(FALSE)
appendLog(packageProvides("ggplot2", "ggplot"), lib.cmd)
eval(parse(text=lib.cmd))
lib.cmd <- "library(ggdendro, quietly=TRUE)"
if (! packageIsAvailable("ggdendro", Rtxt("plot a dendrogram"))) return(FALSE)
appendLog(packageProvides("ggdendro", "dendro_data"), lib.cmd)
eval(parse(text=lib.cmd))
# Show a busy cursor whilst drawing the plot.
set.cursor("watch", Rtxt("Rendering the hierarchical cluster dendrogram...."))
on.exit(set.cursor("left-ptr", ""))
ttl <- genPlotTitleCmd(Rtxt("Cluster Dendrogram"), crs$dataname, vector=TRUE)
plot.cmd <- paste('ddata <- dendro_data(crs$hclust, type="rectangle")',
'g <- ggplot(segment(ddata))',
'g <- g + geom_segment(aes(x = y, y = x, xend = yend, yend = xend))',
'g <- g + scale_y_discrete(labels = ddata$label$label)',
'g <- g + labs(x="Height", y="Observation")',
paste("g <- g +",
sprintf('ggtitle(expression(atop("%s", atop(italic("%s")))))',
ttl[1], ttl[2])),
"print(g)",
sep="\n")
# Log the R command and execute.
appendLog(Rtxt("Generate the dendrogram plot."), plot.cmd)
newPlot()
eval(parse(text=plot.cmd))
# TODO 130311 How to identify the clusters in the plot, if
# specified.
## nclust <- theWidget("hclust_clusters_spinbutton")$getValue()
## if (nclust > 1 && nclust <= length(crs$hclust$height))
## {
## rect.cmd <- sprintf("rect.hclust(crs$hclust, k=%d)", nclust)
## appendLog(Rtxt("Add in rectangles to show the clusters."), rect.cmd)
## eval(parse(text=rect.cmd))
## }
}
plotDendrogram <- function()
{
# Make sure there is a hclust object first.
if (is.null(crs$hclust))
{
errorDialog(Rtxt("There is no Hierarchical Cluster yet we are",
"trying to plot it."), crv$support.msg)
return()
}
# Load the required package into the library. The library, cba,
# should already be loaded. But check anyhow.
lib.cmd <- "library(cba, quietly=TRUE)"
if (! packageIsAvailable("cba", Rtxt("plot a dendrogram"))) return(FALSE)
appendLog(packageProvides("cba", "plot"), lib.cmd)
eval(parse(text=lib.cmd))
# Show a busy cursor whilst drawing the plot.
set.cursor("watch", Rtxt("Rendering the hierarchical cluster dendrogram...."))
on.exit(set.cursor("left-ptr", ""))
# Generate the plot command to not print the xaxis labels if there
# are too many observations.
if (length(crs$hclust$order) > 100)
limit <- ", labels=FALSE, hang=0"
else
limit <- ""
plot.cmd <- paste(sprintf('plot(crs$hclust, main="", sub="", xlab=""%s)\n',
limit),
genPlotTitleCmd(Rtxt("Cluster Dendrogram"), crs$dataname),
sep="")
# Log the R command and execute.
appendLog(Rtxt("Generate a dendrogram plot."), plot.cmd)
newPlot()
eval(parse(text=plot.cmd))
# Identify the clusters in the plot, if specified.
nclust <- theWidget("hclust_clusters_spinbutton")$getValue()
if (nclust > 1 && nclust <= length(crs$hclust$height))
{
rect.cmd <- sprintf("rect.hclust(crs$hclust, k=%d)", nclust)
appendLog(Rtxt("Add in rectangles to show the clusters."), rect.cmd)
eval(parse(text=rect.cmd))
}
}
displayHClustStats <- function()
{
# Initial setup.
TV <- "hclust_textview"
# Make sure there is a cluster first.
if (is.null(crs$hclust))
{
errorDialog(Rtxt("No cluster to plot.",
"The button should not have been sensitive."),
crv$support.msg)
return()
}
# The fpc package is available for cluster.stats().
if (!packageIsAvailable("fpc", Rtxt("calculate cluster statistics"))) return()
lib.cmd <- "library(fpc, quietly=TRUE)"
appendLog(packageProvides("fpc", "cluster.stats"), lib.cmd)
eval(parse(text=lib.cmd))
# 20090323 Don't reset the textview since we want to retain the
# build information.
# 090323 REMOVE resetTextview(TV)
# Some background information. Assume we have already built the
# cluster, and so we don't need to check so many conditions.
nclust <- theWidget("hclust_clusters_spinbutton")$getValue()
sampling <- not.null(crs$train)
# nums <- seq(1,ncol(crs$dataset))[as.logical(sapply(crs$dataset, is.numeric))]
# if (length(nums) > 0)
# {
# indicies <- getVariableIndicies(crs$input)
# include <- simplifyNumberList(intersect(nums, indicies))
# }
#
# if (length(nums) == 0 || length(indicies) == 0)
# {
# errorDialog("Clusters are currently calculated only for numeric data.",
# "No numeric variables were found in the dataset",
# "from amongst those having an input/target/risk role.")
# return()
# }
include <- "crs$numeric" # 20110102 getNumericVariables()
# Cluster centers. 20180522 Remove the na.omit - it is crashing -
# reported by Tony Nolan.
#centers.cmd <- sprintf("centers.hclust(na.omit(crs$dataset[%s, %s]), crs$hclust, %d)",
centers.cmd <- sprintf("centers.hclust(crs$dataset[%s, %s], crs$hclust, %d)",
ifelse(sampling, "crs$train", ""), include, nclust)
appendLog(Rtxt("List the suggested cluster centers for each cluster"), centers.cmd)
appendTextview(TV, Rtxt("Cluster means:"), "\n\n",
collectOutput(centers.cmd, use.print=TRUE))
# STATS: Log the R command and execute. 20170430 add
# silhouette=FALSE since it is failing today with the error:
#
# Error in silhouette.default(clustering, dmatrix = dmat) :
# object 'sildist' not found
# 20180522 Remove the na.omit - it is crashing - reported by Tony
# Nolan.
#stats.cmd <- sprintf(paste("cluster.stats(dist(na.omit(crs$dataset[%s, %s])),",
stats.cmd <- sprintf(paste("cluster.stats(dist(crs$dataset[%s, %s]),",
"cutree(crs$hclust, %d), silhouette=FALSE)\n"),
ifelse(sampling, "crs$train", ""), include,
nclust)
appendLog(Rtxt("Generate cluster statistics using the fpc package."), stats.cmd)
appendTextview(TV, Rtxt("General cluster statistics:"), "\n\n",
collectOutput(stats.cmd, use.print=TRUE))
setStatusBar(Rtxt("HClust cluster statistics have been generated."))
}
## THIS IS NOT EVEN RELATED TO hclust!!!! USES PAM
## on_hclust_seriation_button_clicked <- function(button)
## {
## ## Make sure there is a hclust object first.
## if (is.null(crs$hclust))
## {
## errorDialog("SHOULD NOT BE HERE.", crv$support.msg)
## return()
## }
## ## The library, cba, should already be loaded. But check anyhow. I
## ## think this is required for the seriation. Need to check.
## lib.cmd <- "library(cba, quietly=TRUE)"
## if (! packageIsAvailable("cba", "generate a seriation plot")) return()
## appendLog(packageProvides("cba", "Seriation"), lib.cmd)
## eval(parse(text=lib.cmd))
## ## Some background information.
## sampling <- not.null(crs$train)
## nums <- seq(1,ncol(crs$dataset))[as.logical(sapply(crs$dataset, is.numeric))]
## if (length(nums) > 0)
## {
## indicies <- getVariableIndicies(crs$input)
## include <- simplifyNumberList(intersect(nums, indicies))
## }
## plot.cmd <- paste("d <- dist(as.matrix(crs$dataset",
## sprintf("[%s, %s]",
## ifelse(sampling, "crs$train", ""),
## include),
## "))\n",
## "l <- pam(d, 10, cluster.only = TRUE)\n",
## "res <- cluproxplot(d, l, method = ",
## 'c("Optimal", "Optimal"), plot = FALSE)\n',
## 'plot(res, plotOptions = list(main = "PAM + ',
## 'Seriation (Optimal Leaf ordering)", ',
## 'col = terrain.colors(64)))', sep="")
## appendLog("Generate a seriation plot.", plot.cmd)
## newPlot()
## eval(parse(text=plot.cmd))
## setStatusBar("Seriation plot completed.")
## }
# 100424 Support resetting of widgets on loading a project.
showModelHClustExists <- function(state=!is.null(crs$hclust))
{
# If a model exists then make available the appropriate widgets.
theWidget("hclust_dendrogram_button")$setSensitive(TRUE)
theWidget("hclust_clusters_label")$setSensitive(TRUE)
theWidget("hclust_clusters_spinbutton")$setSensitive(TRUE)
theWidget("hclust_stats_button")$setSensitive(TRUE)
theWidget("hclust_data_plot_button")$setSensitive(TRUE)
theWidget("hclust_discriminant_plot_button")$setSensitive(TRUE)
}
########################################################################
# EXPORT
exportHClustTab <- function()
{
# Make sure we have a model first!
if (noModelAvailable(crs$hclust, crv$HCLUST)) return(FALSE)
# Get some required information
sampling <- not.null(crs$train)
nclust <- theWidget("hclust_clusters_spinbutton")$getValue()
include <- "crs$numeric" # 20110102 getNumericVariables()
startLog(paste(Rtxt("Export"), commonName(crv$HCLUST)))
save.name <- getExportSaveName(crv$HCLUST)
if (is.null(save.name)) return(FALSE)
ext <- tolower(get.extension(save.name))
# Construct the command to produce PMML.
pmml.cmd <- sprintf(paste("pmml(crs$hclust, centers=centers.hclust(",
"na.omit(crs$dataset[%s, %s]), crs$hclust, %d)%s)",
sep=""),
ifelse(sampling, "crs$train", ""), include, nclust,
ifelse(length(crs$transforms) > 0,
", transforms=crs$transforms", ""))
# We can't pass "\" in a filename to the parse command in
# MS/Windows so we have to run the save/write command separately,
# i.e., not inside the string that is being parsed.
if (ext == "xml")
{
appendLog(Rtxt("Export hierarchical cluster as PMML."),
sprintf('saveXML(%s, "%s")', pmml.cmd, save.name))
XML::saveXML(eval(parse(text=pmml.cmd)), save.name)
}
else if (ext == "c")
{
if (isWindows()) save.name <- tolower(save.name)
model.name <- sub("\\.c", "", basename(save.name))
export.cmd <- generateExportPMMLtoC(model.name, save.name, "hclust_textview")
appendLog(sprintf(Rtxt("Export %s as a C routine."), commonName(crv$HCLUST)),
sprintf('pmml.cmd <- "%s"\n\n', pmml.cmd),
export.cmd)
eval(parse(text=export.cmd))
}
setStatusBar(sprintf(Rtxt("The model has been exported to '%s'."), save.name))
}
########################################################################
# SCORE
predict.hclust <- function(object, nclust=10, newdata, x, ...)
{
# TODO 20170725 Need to actually get this function working!!! This
# must take a newdata (the new dataset to be scored) and x as the
# dataset the model was built on. Don't default x to newdata as it
# is then misleading that x is required.
#
# 20170429 Shouldn't this use cutree? As Hamed Mamani noted the
# resulting distribution of observations across the clusters is
# different between what is reported by Stats button in the Cluster
# tab and the Score file produced from the Evaluate
# tab. Inconsistency is not good so use cutree() by default and if
# the two datasets supplied then use predict.kmeans()
#
# 20090126 Initial work on a predict.hclust function, to allow using
# a hclust model to allocate new DATA to pre-existing clusters that
# are built from another dataset X. This uses the common model
# interface function, predict. This makes it easy to use the Rattle
# modelling code for hclust. We obtain the cluster centres to
# identify clusters and add that as the hclust object's centers
# attribute and then calculates distance and identify cluster
# numbers using predict.kmeans(). This is only an
# approximation. Gets pretty close for ward link and euclidean
# distance.
# Add centers to the model object and then use the kmeans version of
# predict to calculate the nearest mean for each center/object.
object$centers <- centers.hclust(x, object, nclust=nclust, use.median=FALSE)
rownames(object$centers) <- seq_len(nclust)
pr <- predict.kmeans(object, newdata) %>% as.integer
return(pr)
}
genPredictHclust <- function(dataset)
{
# 081227 Generate a command to obtain the prediction results when
# applying the model to new data.
nclust <- theWidget("hclust_clusters_spinbutton")$getValue()
sampling <- not.null(crs$train)
include <- "crs$numeric" # 20110102 getNumericVariables()
# 20170429 Use cutree version (now the default) of predict.hclust()
# by default so that the resulting distribution of cluster sizes is
# the same as that reported in the Textview of the Cluster tab when
# Stats button is invoked.
# 20170725 HACK FOR NOW!!! The dataset should be numeric only.
dataset %<>% sub('input', 'numeric', .)
pr.cmd <- sprintf("crs$pr <- predict(crs$hclust, nclust=%s, newdata=%s, x=%s)",
nclust,
dataset,
sub("validate", "train", dataset)) # 20170725 THIS IS HACK FOR NOW!
return(pr.cmd)
}
genResponseHclust <- function(dataset)
{
# 081227 Generate a command to obtain the response when applying the
# model to new data.
return(genPredictHclust(dataset))
}
genProbabilityHclust <- function(dataset)
{
# 081227 Generate a command to obtain the probability when applying
# the model to new data. There is probably a prblem with simply
# using the cluster label as the output, since it won't be a
# probability or even look like it. Let's do it for now though -
# should be okay.
return(genPredictHclust(dataset))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.