## **************************************************************************
##
## (c) 2018-2022 Guillaume Guénard
## Department de sciences biologiques,
## Université de Montréal
## Montreal, QC, Canada
##
## **constr.lshclust function**
##
## This file is part of constr.hclust
##
## constr.hclust is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## constr.hclust is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with constr.hclust. If not, see <https://www.gnu.org/licenses/>.
##
## /*+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\
## | |
## | CONSTRAINED HIERARCHICAL CLUSTERING |
## | using (user-specified) criterion |
## | |
## | C implementation of the Lance and Williams (1967) method |
## | for hierarchical clustering with or without the spatial |
## | contiguity constraint. |
## | |
## \-----------------------------------------------------------*/
##
## R source code file
##
## **************************************************************************
##
#' Space- And Time-Constrained Least Squares Clustering (Experimental)
#'
#' Function \code{constr.lshclust} carries out space-constrained or
#' time-constrained agglomerative clustering from a data matrix.
#'
#' @param x A data matrix
#' @param links A list of edges (or links) connecting the points. May be omitted
#' in some cases; see details and examples
#' @param coords Coordinates of the observations (data rows) in matrix \code{d}.
#' The coordinates are used for plotting maps of the clustering results; may be
#' omitted. A matrix or data frame with two columns, following the convention of
#' the Cartesian plane: first column for abscissa, second column for ordinates.
#' See examples
#' @param chron Logical (TRUE or FALSE) indicating whether a chronological (i.e.
#' time-constrained) clustering should be calculated (default:
#' \code{chron = FALSE})
#' @param output The type of edge lengths to return: square root sums of squares
#' (\code{"RSS"}, the default), sums of squares (\code{"SS"}), Euclidean
#' distance between centroids (\code{"D"}), square Euclidean distance between
#' centroids (\code{"D2"})
#'
#' @return A \code{\link{constr.hclust-class}} object.
#'
#' @details Agglomerative clustering can be carried out with a constraint of
#' spatial or temporal contiguity. This means that only the objects that are
#' linked in \code{links} are considered to be candidates for clustering: the
#' next pair of objects to cluster will be the pair that has the lowest
#' dissimilarity value among the pairs that are linked.
#'
#' The same rule applies during the subsequent clustering steps, which involve
#' groups of objects: the list of links is updated after each agglomeration
#' step. All objects that are neighbours of one of the components that have
#' fused are now neighbours of the newly formed cluster.
#'
#' The edges (links) are specified using argument \code{links}, which can be an
#' object of class \code{nb} (see, e.g., \code{\link[spdep]{tri2nb}}), an object
#' of class \code{listw} (see, e.g., \code{\link[spdep]{nb2listw}}), a
#' two-element \code{list} or an object coercible as a such (e.g., a two-column
#' \code{dataframe}), or a two-column matrix with each row representing an edge
#' and the columns representing the two ends of the edges. For lists with more
#' than two elements, as well as dataframes or matrices with more than
#' two-columns only the first two elements or columns are used for the analysis.
#' The edges are interpreted as being non directional; there is no need to
#' specify an edge going from point a to point b and one going from point b to
#' point a. While doing so is generally inconsequential for the analysis, it
#' carries some penalty in terms of computation time. It is a good practice to
#' place the nodes in increasing order of numbers from the top to the bottom and
#' from the left to the right of the list but this is not mandatory. A word of
#' caution: in cases where clusters with identical minimum distances occur, the
#' order of the edges in the list may have an influence on the result.
#' Alternative results would be statistically equivalent.
#'
#' When argument \code{link} is omitted, regular (unconstrained) clustering is
#' performed and a \code{\link{hclust}-class} object is returned unless
#' argument \code{chron = TRUE}. When argument \code{chron = TRUE},
#' chronological clustering is performed, taking the order of observations as
#' their positions in the sequence. Argument \code{links} is not used when
#' \code{chron = TRUE}. Argument \code{chron} allows one to perform a
#' chronological clustering in the case where observations are ordered
#' chronologically. Here, the term "chronologically" should not be taken
#' restrictively: the method remains applicable to other sequential data sets
#' such as spatial series made of observations along a transect.
#'
#' When the graph described by \code{link} is not entirely connected, the
#' resulting disjoint clusters (or singletons) are merged in the order of their
#' indices (ie. the two clusters with smallest indices are merged until all of
#' them have been merged), the dissimilarity at which these clusters are
#' merged (\code{$height}) is assumed to be a missing value (\code{NA}), and a
#' warning message is issued to warn the user about the presence and number of
#' disjoint clusters.
#'
#' Memory storage and time to compute constrained clustering for N objects. ---
#' The least squares clustering procedure generally uses less computer memory as
#' does the Lance and Williams algorithm implemented by function
#' \code{\link{constr.hclust}} because it does not use dissimilarity matrices.
#' Internally, the function makes to copies of data matrix \code{x}, one that
#' is used to accumulate the values as the clusters are being formed and one
#' that is squared and used to accumulate the squared values during the
#' clustering process. The amount of memory needed to store accumulation arrays
#' for N observations described by M variables as 64-bit double precision
#' floating point variables (IEEE 754) is the 8*N*M*2 bytes. It scales linearly
#' with increasing sample size. By contrast, the dissimilarity matrix used by
#' the Lance and Williams algorithm needs 8*N*(N-1)/2 bytes of storage; as much
#' storage as when M = (N-1)/4. Since M is much smaller than (N-1)/4 for many
#' practical cases, especially those when N is very large (e.g., many hundred
#' thousands or millions), performing least squares hierarchical clustering will
#' be faster (See the Benchmarking example below), require less storage, and be
#' applicable to cases with larger N than the distance-based Lance and Williams
#' algorithm, but at the price of a single classificatory criterion (i.e.,
#' within-cluster least squares).
#'
#' With large data sets, a manageable output describing the classification of
#' the sites is obtained with function \code{\link{cutree}}(x, k) where k is the
#' number of groups.
#'
#' @author Pierre Legendre \email{pierre.legendre@@umontreal.ca}
#' and Guillaume Guénard \email{guillaume.guenard@@umontreal.ca}
#'
#' @seealso \code{\link{plot.constr.hclust}}, \code{\link{hclust}}, and
#' \code{\link{cutree}}
#'
#' @references
#' Guénard, G. and P. Legendre. 2022. Hierarchical clustering with contiguity
#' constraint in {R}. Journal of Statistical Software 103(7): 1-26
#' \doi{10.18637/jss.v103.i07}
#'
#' Legendre, P. and L. Legendre. 2012. Numerical ecology, 3rd English edition.
#' Elsevier Science BV, Amsterdam. \doi{10.1016/S0304-3800(00)00291-X}
#'
#' @importFrom spdep listw2mat listw2sn nb2listw tri2nb
#'
#' @examples
#'
#' ## First example: Artificial map data from Legendre & Legendre
#' ## (2012, Fig. 13.26): n = 16
#'
#' dat <- c(41,42,25,38,50,30,41,43,43,41,30,50,38,25,42,41)
#' coord.dat <- matrix(c(1,3,5,7,2,4,6,8,1,3,5,7,2,4,6,8,
#' 4.4,4.4,4.4,4.4,3.3,3.3,3.3,3.3,
#' 2.2,2.2,2.2,2.2,1.1,1.1,1.1,1.1),16,2)
#'
#'
#' ## Obtaining a list of neighbours:
#' library(spdep)
#' listW <- nb2listw(tri2nb(coord.dat), style="B")
#' links.mat.dat <- listw2mat(listW)
#' neighbors <- listw2sn(listW)[,1:2]
#'
#' ## Display the points:
#' plot(coord.dat, type='n',asp=1)
#' title("Delaunay triangulation")
#' text(coord.dat, labels=as.character(as.matrix(dat)), pos=3)
#' for(i in 1:nrow(neighbors))
#' lines(rbind(coord.dat[neighbors[i,1],],
#' coord.dat[neighbors[i,2],]))
#'
#' ## Clustering without a contiguity constraint;
#' ## the result is represented as a dendrogram:
#' grpWD2_constr_lshclust <- constr.lshclust(x=dat, output="RSS")
#' plot(grpWD2_constr_lshclust, hang=-1)
#'
#' ## Clustering with a contiguity constraint described by a list of
#' ## links:
#' grpWD2cst_constr_lshclust <-
#' constr.lshclust(
#' dat, neighbors,
#' coord.dat, output="RSS")
#'
#' ## Plot the results on a map with k=3 clusters:
#' plot(grpWD2cst_constr_lshclust, k=3, links=TRUE, las=1, xlab="Eastings",
#' ylab="Northings", cex=3, lwd=3)
#'
#' ## Generic functions from hclust can be used, for instance to obtain
#' ## a list of members of each cluster:
#' cutree(grpWD2cst_constr_lshclust, k=3)
#'
#' ## Now with k=5 clusters:
#' plot(grpWD2cst_constr_lshclust, k=5, links=TRUE, las=1, xlab="Eastings",
#' ylab="Northings", cex=3, lwd=3)
#' cutree(grpWD2cst_constr_lshclust, k=5)
#'
#' ## End of the artificial map example
#'
#' ## Third example: Scotch Whiskey distilleries clustered using four tasting
#' ## scores (nose, body, palate, and finish) constrained with respect to the
#' ## distillery locations in Scotland.
#'
#' ## Documentation file about the Scotch Whiskey data: ?ScotchWhiskey
#'
#' data(ScotchWhiskey)
#'
#' ## Cluster analyses for the colour, nose, body, palate, and finish using
#' ## least squares on the basis of the Mahalanobis metric.
#'
#' ## Combining the data matrices:
#' cols <-
#' contr.treatment(
#' n = nlevels(ScotchWhiskey$colour)
#' )[ScotchWhiskey$colour,]
#' dimnames(cols) <- list(
#' rownames(ScotchWhiskey$geo[,"Distillery"]),
#' levels(ScotchWhiskey$colour)[-1L]
#' )
#' WhiskeyDat <- cbind(
#' cols,
#' ScotchWhiskey$body,
#' ScotchWhiskey$palate,
#' ScotchWhiskey$finish
#' )
#' rm(cols)
#'
#' ## Transforming WhiskeyDat into an orthonormal matrix using the Cholesky
#' ## factorization: the least squares will relate to the Mahalanobis metric.
#'
#' WhiskeyTr <- WhiskeyDat %*% solve(chol(cov(WhiskeyDat)))
#' grpWD2cst_ScotchWhiskey <-
#' constr.lshclust(
#' x=WhiskeyTr,
#' links=ScotchWhiskey$neighbors@data,
#' coords=ScotchWhiskey$geo@coords/1000
#' )
#'
#' ## A fonction to plot the Whiskey clustering results:
#'
#' plotWhiskey <- function(k) {
#' par(fig=c(0,1,0,1))
#' plot(grpWD2cst_ScotchWhiskey, k=k, links=TRUE, las=1,
#' xlab="Eastings (km)", ylab="Northings (km)", cex=0.1, lwd=3)
#' text(ScotchWhiskey$geo@coords/1000,labels=1:length(ScotchWhiskey$geo))
#' legend(x=375, y=700, lty=1L, lwd=3, col=rainbow(1.2*k)[1L:k],
#' legend=sprintf("Group %d",1:k), cex=1.25)
#' SpeyZoom <- list(xlim=c(314.7,342.2), ylim=c(834.3,860.0))
#' rect(xleft=SpeyZoom$xlim[1L], ybottom=SpeyZoom$ylim[1L], col="#E6E6E680",
#' xright=SpeyZoom$xlim[2L], ytop=SpeyZoom$ylim[2L], lwd=2, lty=1L)
#' par(fig=c(0.01,0.50,0.46,0.99), new=TRUE)
#' plot(grpWD2cst_ScotchWhiskey, xlim=SpeyZoom$xlim,
#' ylim=SpeyZoom$ylim, k=k, links=TRUE, las=1, xlab="", ylab="",
#' cex=0.1, lwd=3, axes=FALSE)
#' text(ScotchWhiskey$geo@coords/1000,labels=1:length(ScotchWhiskey$geo))
#' rect(xleft=SpeyZoom$xlim[1L], ybottom=SpeyZoom$ylim[1L],
#' xright=SpeyZoom$xlim[2L], ytop=SpeyZoom$ylim[2L], lwd=2, lty=1L)
#' }
#'
#' ## Plot the clustering results on the map of Scotland for 5 groups.
#' ## The inset map shows the Speyside distilleries in detail:
#'
#' plotWhiskey(k=5L)
#'
#' ## End of the Scotch Whiskey tasting data example
#'
#' \dontrun{
#'
#' ## Third example: Fish community composition along the Doubs River,
#' ## France. The sequence is analyzed as a case of chronological
#' ## clustering, substituting space for time.
#'
#' library(ade4)
#' data(doubs, package="ade4")
#'
#' ## Using the Hellinger metric on the species abundances:
#' Doubs.hel <- sqrt(doubs$fish / rowSums(doubs$fish))
#' Doubs.hel[rowSums(doubs$fish)==0,] <- 0
#' grpWD2cst_fish <- constr.lshclust(x=Doubs.hel, chron=TRUE,
#' coords=as.matrix(doubs$xy))
#'
#' plot(grpWD2cst_fish, k=5, las=1, xlab="Eastings (km)",
#' ylab="Northings (km)", cex=3, lwd=3)
#'
#' ## Repeat the plot with other values of k (number of groups)
#'
#' ## End of the Doubs River fish assemblages example
#'
#'
#' ## Benchmarking example
#' ## Benchmarking can be used to estimate computation time for different
#' ## values of N (number of sites) and M (number of variables).
#'
#' require(magrittr)
#' require(pryr)
#'
#' benchmark <- function(N, M) {
#' res <- matrix(NA,length(N)*length(M),4L) %>% as.data.frame
#' colnames(res) <- c("N(obs)","M(var)","Storage (MiB)","Time (sec)")
#' res[[1L]] <- rep(N,length(M))
#' res[[2L]] <- rep(M,each=length(N))
#'
#' for(i in 1:nrow(res)) {
#' ## i=1L
#' cat("N:",res[i,1L]," M:",res[i,2L],"\n")
#' coords.mem <- cbind(x=runif(res[i,1L],-1,1),y=runif(res[i,1L],-1,1))
#' if(i>1L) rm(dat.mem) ; gc()
#' dat.mem <- try(matrix(runif(res[i,1L]*res[i,2L],0,1),
#' res[i,1L],res[i,2L]))
#' if(any(class(dat.mem)=="try-error"))
#' break
#' neighbors.mem <-
#' (coords.mem %>%
#' tri2nb %>%
#' nb2listw(style="B") %>%
#' listw2sn)[,1:2]
#' {start.time = Sys.time()
#' res.mem <- try(constr.lshclust(dat.mem, neighbors.mem))
#' end.time = Sys.time()}
#' if(any(class(res.mem)=="try-error"))
#' break
#' res[i,3L] <- (3*object_size(dat.mem) + object_size(neighbors.mem) +
#' object_size(res.mem))/1048576 # n. bytes per MiB
#' res[i,4L] <- as.numeric(end.time) - as.numeric(start.time)
#' }
#' res[["N(obs)"]] <- as.integer(res[["N(obs)"]])
#' res[["M(var)"]] <- as.integer(res[["M(var)"]])
#' res
#' }
#'
#' N <- c(1000,2000,5000,10000,20000,50000)
#' M <- c(1,2,5,10,20,50)
#' res <- benchmark(N, M)
#'
#' ## Plotting the results:
#' par(mar=c(3,6,2,2),mfrow=c(2L,1L))
#' barplot(height = matrix(res[,"Time (sec)"],length(N),length(M)),
#' names.arg = N, ylab = "Time (seconds)\n", xlab = "",
#' las = 1L, log = "y", beside=TRUE)
#' par(mar=c(5,6,0,2))
#' barplot(height = matrix(res[,"Storage (MiB)"],length(N),length(M)),
#' names.arg = N, ylab = "Total storage (MB)\n",
#' xlab = "Number of observations", las = 1L, log = "y", beside=TRUE)
#'
#' ## Examine the output file
#' res
#'
#' ## Analyze how computing time and storage scales up with increasing number
#' ## of observations and variables.
#' lm(log(`Time (sec)`)~log(`N(obs)`)+log(`M(var)`), data=res)
#' lm(log(`Storage (MiB)`)~log(`N(obs)`)+log(`M(var)`), data=res)
#' }
#'
#' ## End of the benchmarking example
#'
#' ## End of examples
#'
#' @useDynLib constr.hclust, .registration = TRUE
#' @importFrom graphics segments points
#' @importFrom grDevices dev.hold dev.flush rainbow
#' @importFrom spdep nb2listw tri2nb listw2mat listw2sn
#' @importFrom stats cutree
#'
#' @export constr.lshclust
constr.lshclust <- function(x, links, coords, chron = FALSE, output = "RSS") {
OUTPUTS <- c("RSS", "SS", "D", "D2")
i.out <- pmatch(output, OUTPUTS)
n <- NROW(x)
if (is.null(n))
stop("invalid data")
if (n < 2)
stop("must have n >= 2 objects to cluster")
m <- NCOL(x)
if(!is.matrix(x))
x <- as.matrix(x)
storage.mode(x) <- "double"
if (missing(links)) {
nl <- 0L
links <- integer(0L)
type <- if (chron) 2L else 0L
} else {
if(class(links)[1L]=="nb")
links <- nb2listw(links, style="B")
if(class(links)[1L]=="listW")
links <- listw2sn(links)[1L:2L]
if(is.list(links)||is.data.frame(links)) {
nl <- length(links[[1L]])
links <- unlist(links)
storage.mode(links) <- "integer"
} else if(is.matrix(links)) {
if(ncol(links) < 2L)
stop("When 'links' is a matrix, it must have 2 or more columns")
nl <- nrow(links)
links <- as.integer(links)
} else
stop("'links' must be a list, dataframe, or matrix")
type <- 1L
}
hcl <- .C("cclustLS",
as.integer(n), ## n
merge = integer(2L*(n-1L)),
height = double(n-1L),
order = integer(n),
m,
x, ## diss0
as.integer(nl), ## nl
links, ## linkl
as.integer(type), ## type
i.out ## out
)[2L:4L]
dim(hcl$merge) <- c((n-1L),2L)
if(missing(coords)) {
if(chron) {
coords <- cbind(x=0:(n-1),y=0)
} else coords <- NULL
} else {
if(NCOL(coords) < 2L) {
coords <- cbind(x=coords,y=0)
} else coords <- as.matrix(coords)
}
if(any(nna <- is.na(hcl$height)))
warning("Impossible to cluster all the data using the links provided. ",
"To identify the ",sum(nna) + 1L," disjoint clusters found, ",
"use function cutree with argument k = ",sum(nna) + 1L,
" on the present function's output.")
return(
structure(
c(hcl,
list(labels = rownames(x),
method = "least squares", call = match.call(),
dist.method = "none",
links = if(length(links)) matrix(links,nl,2L) else NULL,
coords = coords)),
class = c("constr.hclust","hclust")))
}
##
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.