#' @title Cluster validity indices
#' @name .cluster_validity
#' @keywords internal
#' @noRd
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#'
#' @description Compute different cluster validity indices. This function needs
#' as input a sits tibble with `cluster` column.
#' It is a front-end to `dtwclust::cvi` function.
#' That function computes five indices:
#' 1) adjusted Rand index; 2) Rand index; 3) Jaccard index;
#' 4) Fowlkes-Mallows; and 5) Variation of Information index
#' Please refer to the documentation in that package for more details.
#'
#' @references "dtwclust" package (https://CRAN.R-project.org/package=dtwclust)
#'
#' @param samples A tibble with `cluster` column.
#'
#' @return A vector with four validity indices.
#'
.cluster_validity <- function(samples) {
# set caller to show in errors
.check_set_caller(".cluster_validity")
# verifies if dtwclust package is installed
.check_require_packages("dtwclust")
# is the input data the result of a cluster function?
.check_samples_cluster(samples)
# compute CVIs and return
result <- dtwclust::cvi(
a = factor(samples[["cluster"]]),
b = factor(samples[["label"]]),
type = "external",
log.base = 10
)
return(result)
}
#' @title Compute a dendrogram using hierarchical clustering
#' @name .cluster_dendrogram
#' @keywords internal
#' @noRd
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#' @author Gilberto Camara, \email{gilberto.camara@@inpe.br}
#'
#' @description Cluster time series in hierarchical mode.
#' Hierarchical clustering, as its name suggests,
#' is an algorithm that tries to create a hierarchy of groups in which,
#' as the level in the hierarchy increases, clusters are created by merging
#' the clusters from the next lower level, producing
#' an ordered sequence of groupings. The similarity measure used to
#' group time series in a cluster is the dtw metric.
#' The procedure is deterministic, so it will always give the same
#' result for a chosen set of similarity measures
#' (see \code{\link[dtwclust]{tsclust}}).
#'
#' @references `dtwclust` package (https://CRAN.R-project.org/package=dtwclust)
#'
#' @param samples Time series data and metadata
#' to be used to generate the dendrogram.
#' @param bands Vector of bands to be clustered.
#' @param dist_method One of the supported distances (single char vector)
#' "dtw": DTW with a Sakoe-Chiba constraint.
#' "dtw2": DTW with L2 norm and Sakoe-Chiba constraint.
#' "dtw_basic": A faster DTW with less functionality.
#' "lbk": Keogh's lower bound for DTW.
#' "lbi": Lemire's lower bound for DTW.
#' @param linkage Agglomeration method to be used (single char vector)
#' One of "ward.D", "ward.D2", "single", "complete",
#' "average", "mcquitty", "median" or "centroid".
#' @param ... Any additional parameters to be passed
#' to dtwclust::tsclust() function.
#' @return Full dendrogram tree for data analysis
#' (class "dendrogram")
#'
.cluster_dendrogram <- function(samples,
bands,
dist_method = "dtw_basic",
linkage = "ward.D2", ...) {
# verifies if dtwclust package is installed
.check_require_packages("dtwclust")
# get the values of the time series
values <- .values_ts(samples, bands, format = "cases_dates_bands")
# call dtwclust and get the resulting dendrogram
dendro <- dtwclust::tsclust(
values,
type = "hierarchical",
k = max(nrow(samples) - 1, 2),
distance = dist_method,
control = dtwclust::hierarchical_control(method = linkage),
...
)
# return the dendrogram
return(dendro)
}
#' @title Compute validity indexes to a range of cut height
#' @name .cluster_dendro_bestcut
#' @keywords internal
#' @noRd
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#'
#' @description Reads a dendrogram object and its corresponding sits tibble and
#' computes the best number of clusters that maximizes the adjusted Rand index.
#'
#' @references
#' Lawrence Hubert and Phipps Arabie. Comparing partitions.
#' Journal of Classification, 2, p.193--218, 1985.
#'
#'
#' @param samples Input set of time series.
#' @param dendro Dendrogram object returned from
#' \code{\link[sits]{.cluster_dendrogram}}.
#' @return Vector with best number of clusters (k)
#' and its respective height.
#'
.cluster_dendro_bestcut <- function(samples, dendro) {
# compute range
k_range <- seq(2, max(length(dendro[["height"]]) - 1, 2))
# compute ARI for each k (vector)
ari <-
k_range |>
purrr::map(function(k) {
x <- stats::cutree(dendro, k = k)
y <- factor(samples[["label"]])
.cluster_rand_index(table(x, y))
}) |>
unlist()
# get the best ARI result
k_result <- k_range[which.max(ari)]
# compute each height corresponding to `k_result`
h_index <- length(dendro[["height"]]) - k_result + 2
h_result <- c(0, dendro[["height"]])[h_index]
# create a named vector and return
best_cut <- structure(c(k_result, h_result), .Names = c("k", "height"))
return(best_cut)
}
#' @title Compute Rand index for cluster table
#' @name .cluster_rand_index
#' @noRd
#' @param x a cluster produced by dtwclust::tsclust
#' @param correct use best calculation
#' @return Rand index for cluster
.cluster_rand_index <- function(x) {
.check_set_caller(".cluster_rand_index")
.check_that(length(dim(x)) == 2)
n <- sum(x)
ni <- rowSums(x)
nj <- colSums(x)
n2 <- choose(n, 2)
nis2 <- sum(choose(ni[ni > 1], 2))
njs2 <- sum(choose(nj[nj > 1], 2))
factor_1 <- (nis2 * njs2) / n2
factor_2 <- (nis2 + njs2) / 2
rand <- (sum(choose(x[x > 1], 2)) - factor_1) / (factor_2 - factor_1)
return(rand)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.