# semcloud: Post-processing of token-level clouds.
# Copyright (C) 2021 Mariana Montes
#
# See full notice in README.md file.
#' Compute distances per cluster
#'
#' @param clustering Named vector with token IDs as names and (HDBSCAN) clusters
#' as values. We assume that each cluster has at least 8 items.
#' @param dists Long format table with one row per pair of tokens (that are not
#' the same), the distance between them, the cluster that the first token belongs
#' to and whether they both belong to the same cluster.
#' @param k Number of nearest neighbors to get maximum distance from
#'
#' @return A tibble with one row per cluster and various distance-derived values:
#' \describe{
#' \item{min_, mean_ and max_identicals}{Minimum, mean and maximum number
#' of identical tokens per token in the cluster.}
#' \item{min_, mean_ and max_k8}{Minimum, mean and maximum distance from
#' each token in the cluster and its 8th nearest neighbour.}
#' \item{min_, mean_, max_ and sd_inner_dist}{Minimum, mean, and maximum
#' distance, as well as their standard deviation, between each token of the
#' cluster and all other tokens in the same cluster.}
#' \item{min_, mean_, max_ and sd_outer_dist}{Minimum, mean, and maximum
#' distance, as well as their standard deviation, between each token of the
#' cluster and all other tokens in other clusters.}
#' }
#' @export
#' @importFrom rlang .data
clusterDistance <- function(clustering, dists, k=8){
identicals <- tibble::enframe(clustering, 'token_A', 'cluster_A') %>%
dplyr::mutate(
identicals = purrr::map_dbl(
.data$token_A,
~nrow(dplyr::filter(dists, .data$token_A == .x, .data$same_cluster, .data$distance < 0.000001))),
max_k = purrr::map_dbl(
.data$token_A,
~sort(dplyr::filter(dists, .data$token_A == .x, .data$same_cluster)$distance)[k]
)
) %>%
dplyr::group_by(.data$cluster_A) %>%
dplyr::summarize(
'min_identicals' = min(.data$identicals),
'mean_identicals' = mean(.data$identicals),
'max_identicals' = max(.data$identicals),
'min_k' = min(.data$max_k),
'mean_k' = mean(.data$max_k),
'max_k' = max(.data$max_k)
)
cluster_distances <- dists %>%
dplyr::group_by(.data$cluster_A, .data$same_cluster) %>%
dplyr::summarize(
'min' = min(.data$distance), 'mean' = mean(.data$distance), 'max' = max(.data$distance), 'sd' = stats::sd(.data$distance),
.groups = 'keep') %>%
dplyr::mutate(same_cluster = dplyr::if_else(.data$same_cluster, 'inner_dist', 'outer_dist')) %>%
tidyr::pivot_wider(names_from = 'same_cluster', values_from = c('min', 'mean', 'max', 'sd'))
dplyr::full_join(identicals, cluster_distances, by = 'cluster_A') %>%
dplyr::rename('cluster' = .data$cluster_A)
}
#' Summarize HDBSCAN data per cluster
#'
#' @param m Tibble with one token per row and HDBSCAN information. The \code{coords}
#' element of a model resulting from \code{\link{summarizeHDBSCAN}}.
#'
#' @return Tibble with one row per cluster and various HDBSCAN-derived values:
#' \describe{
#' \item{min_, mean_, max_ and sd_cws}{Minimum, mean and maximum, as well
#' as standard deviation, of the number of first-order context words per
#' token in that cluster.}
#' \item{min_, mean_, max_ and sd_eps}{Minimum, mean and maximum, as well
#' as standard deviation, of the \eqn{\epsilon} value of the tokens in
#' that cluster.}
#' \item{size, rel_size}{Absolute number of tokens in the cluster and proportion
#' of \emph{modelled} tokens covered by the cluster.}
#' \item{deeper_than_noise}{Proportion of tokens in that cluster with an
#' \eqn{\epsilon} value lower than the minimum \eqn{\epsilon} of noise
#' tokens in that model.}
#' \item{cw_tokens, _types, _ttratio}{Union of first-order context words
#' of tokens in that cluster: number of types and of tokens and type-token
#' ratio.}
#' }
#'
#' @export
#'
#' @importFrom rlang .data
clusterHDBSCAN <- function(m) {
min_eps <- m %>%
dplyr::filter(!is.na(.data$eps)) %>%
dplyr::group_by(.data$cluster) %>%
dplyr::summarize(eps = min(.data$eps)) %>%
tibble::deframe()
m %>%
dplyr::mutate(deeper_than_noise = if ('0' %in% names(min_eps)) .data$eps < min_eps[['0']] else TRUE) %>%
dplyr::group_by(.data$cluster) %>%
dplyr::mutate(
cws_n = purrr::map_dbl(.data$cws, length),
cws = purrr::map_chr(.data$cws, paste, collapse = ';')
) %>%
dplyr::summarize(
min_cws = min(.data$cws_n), mean_cws = mean(.data$cws_n),
max_cws = max(.data$cws_n), sd_cws = stats::sd(.data$cws_n),
min_eps = min(.data$eps), max_eps = max(.data$eps),
mean_eps = mean(.data$eps), sd_eps = stats::sd(.data$eps),
size = dplyr::n(), deeper_than_noise = sum(.data$deeper_than_noise)/.data$size,
cwlist = paste(.data$cws, collapse = ';')
) %>%
dplyr::mutate(
cwlist = purrr::map(.data$cwlist, stringr::str_split, ';') %>% purrr::map(1),
cw_tokens = purrr::map_dbl(.data$cwlist, length),
cwlist = purrr::map(.data$cwlist, unique),
cw_types = purrr::map_dbl(.data$cwlist, length),
cw_ttratio = .data$cw_types/.data$cw_tokens,
rel_size = .data$size/sum(.data$size)
) %>%
dplyr::select(-'cwlist')
}
#' Compute Semvar values on clusters
#'
#' @inheritParams clusterHDBSCAN
#' @param k Number of tokens to compute \code{\link{separationkNN}}
#'
#' @return A tibble with one row per cluster and output from
#' \code{\link{separationkNN}} and \code{\link[cluster]{silhouette}}
#' for each class, based on the \emph{coordinates} in the input and both including
#' and excluding noise tokens.
#'
#' @export
#'
#' @importFrom rlang .data
clusterSeparation <- function(m, k = 8) {
if (!requireNamespace('cluster', quietly = TRUE)) {
stop("Package `cluster` or `semvar` needed.")
}
sil_func <- function(dists, classes){
sils <- summary(cluster::silhouette(as.numeric(classes), dists))$clus.avg.widths
clus_names <- names(sils)
dim(sils) <- NULL
names(sils) <- clus_names
sils
}
knn_func <- function(dists, classes){
separationkNN(dists, classes, k = k)$classqual
}
classes <- as.character(m$cluster)
if (length(unique(classes)) == 1) return(tibble::tibble(cluster = unique(classes)))
if (min(table(classes)) == 1) { # cover for the occasional single noise point :)
removed <- names(table(classes)[table(classes) == 1])
m <- m %>% dplyr::filter(.data$cluster != removed)
classes <- m$cluster
}
full_dists <- stats::dist(m[c("model.x", "model.y")], diag = TRUE, upper = TRUE)
no_noise <- stats::dist(
dplyr::filter(m, .data$cluster != '0')[c("model.x", "model.y")],
diag = TRUE, upper = TRUE
)
tibble::tibble(
cluster = unique(classes),
kNN_full = knn_func(full_dists, classes)[.data$cluster],
SIL_full = sil_func(full_dists, classes)[.data$cluster],
kNN_no_noise = knn_func(no_noise, classes[classes != '0'])[.data$cluster],
SIL_no_noise = sil_func(no_noise, classes[classes != '0'])[.data$cluster]
)
}
#' Classify clouds in a model
#'
#' @param mdata Data of a model, as given by \code{\link{summarizeHDBSCAN}}.
#' @param mname Name of a model; the name of \code{mdata}.
#' @param ttmx_dir Directory where the token-level distance matrices are stored.
#' @param suffix Suffix of the file names for the token-level distance matrices;
#' the function assumes that the name of the file is the name of the medoid
#' plus the suffix.
#' @param k Number of nearest neighbors for \code{\link{clusterDistance}}
#' and \code{\link{clusterSeparation}}.
#'
#' @return A table with one row per cluster in the model, the columns created by
#' \code{\link{clusterSeparation}}, \code{\link{clusterHDBSCAN}} and
#' \code{\link{clusterDistance}} and the classification of each cluster based
#' on the Nephological Shapes from \insertCite{montes_2021;textual}{semcloud}
#' (see \href{https://cloudspotting.marianamontes.me/shapes.html}{Chapter 5}
#' for a full description and examples).
#'
#' @references
#' \insertAllCited{}
#'
#' @export
#'
#' @examples
#' \dontrun{
#' purrr::imap_dfr(models$medoidCoords, classifyModel, ttmx_dir = 'path/to/dir')
#' }
#' @importFrom Rdpack reprompt
classifyModel <- function(mdata, mname, ttmx_dir, suffix = '.ttmx.dist.pac',
k = 8){
m <- mdata$coords
clustering <- tibble::deframe(dplyr::select(m, '_id', 'cluster'))
dists <- tokensFromPac(file.path(ttmx_dir, paste0(mname, suffix))) %>%
tibble::as_tibble(rownames = 'token_A') %>%
tidyr::pivot_longer(-'token_A', names_to = 'token_B', values_to = 'distance') %>%
dplyr::filter(.data$token_A != .data$token_B) %>%
dplyr::mutate(cluster_A = clustering[.data$token_A],
same_cluster = clustering[.data$token_B] == .data$cluster_A)
model_data <- dplyr::full_join(
clusterSeparation(m, k), clusterHDBSCAN(m),
by = 'cluster'
) %>%
dplyr::full_join(clusterDistance(clustering, dists, k = k), by = 'cluster')
if (nrow(model_data) == 1) {
model_class <- dplyr::mutate(
model_data,
cloud_type = "Cirrostratus",
Hail = .data$max_identicals >= k
)
clouds <- 'Cirrostratus'
} else {
model_class <- dplyr::mutate(
model_data,
cloud_type = dplyr::case_when(
.data$cluster == '0' ~ "Cirrostratus",
.data$rel_size >= 0.5 ~ "Cumulonimbus",
.data$kNN_full >= 0.75 & .data$SIL_full >= 0.5 &
.data$mean_inner_dist <= 0.5 &
(.data$deeper_than_noise >= 0.9 | sum(.data$rel_size) >= 0.9) ~ "Cumulus",
(sum(.data$rel_size) <= 0.25 | .data$SIL_full >= 0.5 | .data$mean_inner_dist <= 0.2) &
(.data$deeper_than_noise > 0.5 | sum(.data$rel_size) > 0.9) ~ "Stratocumulus",
TRUE ~ "Cirrus"
),
Hail = .data$max_identicals >= k
)
clouds <- setdiff(sort(unique(model_class$cloud_type)), 'Cirrostratus')
}
if (sum(model_class$Hail) > 1) clouds <- c(clouds, 'Hail')
clouds <- paste(clouds, collapse = '-')
model_class %>%
dplyr::mutate(model = mname, maincat = clouds) %>%
dplyr::select('model', 'maincat', 'cluster', 'cloud_type', 'Hail', dplyr::everything()) %>%
dplyr::arrange(.data$cluster)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.