R/sits_cluster.R

Defines functions sits_cluster_clean sits_cluster_frequency sits_cluster_dendro.default sits_cluster_dendro.tbl_df sits_cluster_dendro.sits sits_cluster_dendro

Documented in sits_cluster_clean sits_cluster_dendro sits_cluster_dendro.default sits_cluster_dendro.sits sits_cluster_dendro.tbl_df sits_cluster_frequency

#' @title Find clusters in time series samples
#' @name sits_cluster_dendro
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#'
#' @description These functions support hierarchical agglomerative clustering in
#' sits. They provide support from creating a dendrogram and using it for
#' cleaning samples.
#'
#' \code{link[sits]{sits_cluster_dendro()}} takes a tibble with time series and
#' produces a sits tibble with an added "cluster" column. The function first
#' calculates a dendrogram and obtains a validity index for best clustering
#' using the adjusted Rand Index. After cutting the dendrogram using the chosen
#' validity index, it assigns a cluster to each sample.
#'
#' \code{link[sits]{sits_cluster_frequency()}} computes the contingency
#' table between labels
#' and clusters and produces a matrix.
#' Its input is a tibble produced by \code{link[sits]{sits_cluster_dendro()}}.
#'
#' \code{link[sits]{sits_cluster_clean()}} takes a tibble with time series
#' that has an additional `cluster` produced by
#' \code{link[sits]{sits_cluster_dendro()}}
#' and removes labels that are minority in each cluster.
#'
#' @references "dtwclust" package (https://CRAN.R-project.org/package=dtwclust)
#'
#' @param samples         Tibble with input set of time series (class "sits").
#' @param bands           Bands to be used in the clustering
#'                        (character vector)
#' @param dist_method     One of the supported distances (single char vector)
#'                        "dtw": DTW with a Sakoe-Chiba constraint.
#'                        "dtw2": DTW with L2 norm and Sakoe-Chiba constraint.
#'                        "dtw_basic": A faster DTW with less functionality.
#'                        "lbk": Keogh's lower bound for DTW.
#'                        "lbi": Lemire's lower bound for DTW.
#' @param linkage         Agglomeration method to be used (single char vector)
#'                        One of "ward.D", "ward.D2", "single", "complete",
#'                        "average", "mcquitty", "median" or "centroid".
#' @param k               Desired number of clusters (overrides default value)
#' @param palette         Color palette as per `grDevices::hcl.pals()` function.
#' @param  ...            Additional parameters to be passed
#'                        to dtwclust::tsclust() function.
#' @return                Tibble with "cluster" column (class "sits_cluster").
#'
#' @note
#' Please refer to the sits documentation available in
#' <https://e-sensing.github.io/sitsbook/> for detailed examples.
#' @examples
#' if (sits_run_examples()) {
#'     # default
#'     clusters <- sits_cluster_dendro(cerrado_2classes)
#'     # with parameters
#'     clusters <- sits_cluster_dendro(cerrado_2classes,
#'                 bands = "NDVI", k = 5)
#' }
#'
#' @export
sits_cluster_dendro <- function(samples,
                                bands = NULL,
                                dist_method = "dtw_basic",
                                linkage = "ward.D2",
                                k = NULL,
                                palette = "RdYlGn") {
    # needs package dtwclust
    .check_require_packages("dtwclust")
    # verify if data is OK
    .check_samples_train(samples)
    # bands in sits are uppercase
    bands <- .default(bands, sits_bands(samples))
    bands <- .tibble_bands_check(samples, bands)
    # check k (number of clusters)
    if (!purrr::is_null(k)) {
        .check_num_parameter(k, min = 2,  max = 200)
    }
    # check distance method
    .check_that(
        dist_method %in% .conf("dendro_dist_method"),
        msg = "Invalid distance method for dendrogram calculation"
    )
    # check linkage
    .check_that(
        linkage %in% .conf("dendro_linkage"),
        msg = "Invalid linkage method for dendrogram calculation"
    )
    # check palette
    .check_palette(palette)
     UseMethod("sits_cluster_dendro", samples)
}
#' @rdname sits_cluster_dendro
#' @export
sits_cluster_dendro.sits <- function(samples,
                                     bands = NULL,
                                     dist_method = "dtw_basic",
                                     linkage = "ward.D2",
                                     k = NULL,
                                     palette = "RdYlGn",
                                     ...) {
    # calculate the dendrogram object
    cluster <- .cluster_dendrogram(
        samples = samples,
        bands = bands,
        dist_method = dist_method,
        linkage = linkage, ...
    )

    # find the best cut for the dendrogram
    best_cut <- .cluster_dendro_bestcut(samples, cluster)
    message(paste0("best number of clusters = ", best_cut["k"]))
    message(paste0(
        "best height for cutting the dendrogram = ",
        best_cut["height"]
    ))
    # cut the tree (user-defined value overrides default)
    k <- .default(k, best_cut["k"])
    if (k != best_cut["k"]) {
        message(paste0("Caveat: desired number of clusters (", k, ")
                            overrides best value"))
        best_cut["k"] <- k
        best_cut["height"] <-
            c(0, cluster$height)[length(cluster$height) - k + 2]
    }
    samples$cluster <- stats::cutree(
        cluster,
        best_cut["k"],
        best_cut["height"]
    )
    # change the class
    class(samples) <- c("sits_cluster", class(samples))
    # plot the dendrogram
    plot(
        x = samples,
        cluster = cluster,
        cutree_height = best_cut["height"],
        palette = palette
    )
    return(samples)
}
#' @rdname sits_cluster_dendro
#' @export
sits_cluster_dendro.tbl_df <- function(samples, ...) {
    samples <- tibble::as_tibble(samples)
    if (all(.conf("sits_tibble_cols") %in% colnames(samples))) {
        class(samples) <- c("sits", class(samples))
    } else
        stop("Input should be a sits tibble")
    samples <- sits_cluster_dendro(samples, ...)
    return(samples)
}
#' @rdname sits_cluster_dendro
#' @export
sits_cluster_dendro.default <- function(samples, ...) {
    stop("Input samples should be of class sits")
}
#'
#' @title Show label frequency in each cluster produced by dendrogram analysis
#' @name sits_cluster_frequency
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#' @param samples         Tibble with input set of time series with additional
#'                        cluster information produced
#'                        by \code{link[sits]{sits_cluster_dendro}}.
#' @return                A matrix containing frequencies
#'                        of labels in clusters.
#' @examples
#' if (sits_run_examples()) {
#'     clusters <- sits_cluster_dendro(cerrado_2classes)
#'     freq <- sits_cluster_frequency(clusters)
#'     freq
#' }
#' @export
sits_cluster_frequency <- function(samples) {
    # set caller to show in errors
    .check_set_caller("sits_cluster_frequency")

    # is the input data the result of a cluster function?
    .check_samples_cluster(samples)
    # compute frequency table (matrix)
    result <- table(samples$label, samples$cluster)
    # compute total row and col
    result <- stats::addmargins(result,
        FUN = list(Total = sum),
        quiet = TRUE
    )
    return(result)
}

#' @title Removes labels that are minority in each cluster.
#' @name sits_cluster_clean
#' @author Rolf Simoes, \email{rolf.simoes@@inpe.br}
#' @description Takes a tibble with time series
#' that has an additional `cluster` produced by
#' \code{link[sits]{sits_cluster_dendro()}}
#' and removes labels that are minority in each cluster.
#'
#' @param samples         Tibble with set of time series with additional
#'                        cluster information produced
#'                        by \code{link[sits]{sits_cluster_dendro()}}
#' @return                Tibble with time series (class "sits")
#' @examples
#' if (sits_run_examples()) {
#'     clusters <- sits_cluster_dendro(cerrado_2classes)
#'     freq1 <- sits_cluster_frequency(clusters)
#'     freq1
#'     clean_clusters <- sits_cluster_clean(clusters)
#'     freq2 <- sits_cluster_frequency(clean_clusters)
#'     freq2
#' }
#' @export
sits_cluster_clean <- function(samples) {
    # set caller to show in errors
    .check_set_caller("sits_cluster_clean")
    # is the input data the result of a cluster function?
    .check_samples_cluster(samples)
    # compute frequency table (matrix)
    result <- table(samples$label, samples$cluster)
    # list of number of clusters
    num_cls <- unique(samples$cluster)
    # get the labels of the data
    lbs <- unique(samples$label)
    # for each cluster, get the label with the maximum number of samples
    lbs_max <- lbs[as.vector(apply(result, 2, which.max))]
    # compute the resulting table
    clean_clusters <- purrr::map2_dfr(
        lbs_max, num_cls,
        function(lb, cl) {
            partial <- dplyr::filter(
                samples,
                .data[["label"]] == lb &
                    .data[["cluster"]] == cl
            )
            return(partial)
        }
    )
    return(clean_clusters)
}
e-sensing/sits documentation built on Jan. 28, 2024, 6:05 a.m.