#' Estimate optimal number of clusters.
#'
#' Estimates the optimal number of clusters (\code{k}) using various methods.
#'
#' \code{mt_cluster_k} estimates the number of clusters (\code{k}) using four
#' commonly used k-selection methods (specified via \code{compute}): cluster
#' stability (\code{stability}), the gap statistic (\code{gap}), the jump
#' statistic (\code{jump}), and the slope statistic (\code{slope}).
#'
#' Cluster stability methods select \code{k} as the number of clusters for which
#' the assignment of objects to clusters is most stable across bootstrap
#' samples. This function implements the model-based and model-free methods
#' described by Haslbeck & Wulff (2020). See references.
#'
#' The remaining three methods select \code{k} as the value that optimizes the
#' gap statistic (Tibshirani, Walther, & Hastie, 2001), the jump statistic
#' (Sugar & James, 2013), and the slope statistic (Fujita, Takahashi, &
#' Patriota, 2014), respectively.
#'
#' For clustering trajectories, it is often useful that the endpoints of all
#' trajectories share the same direction, e.g., that all trajectories end in the
#' top-left corner of the coordinate system (\link{mt_remap_symmetric} or
#' \link{mt_align} can be used to achieve this). Furthermore, it is recommended
#' to use length normalized trajectories (see \link{mt_length_normalize}; Wulff
#' et al., 2019).
#'
#' @inheritParams mt_cluster
#' @param kseq a numeric vector specifying set of candidates for k. Defaults to
#' 2:15, implying that all values of k within that range are compared using
#' the metrics specified in \code{compute}.
#' @param compute character vector specifying the to be computed measures. Can
#' be any subset of \code{c("stability","gap","jump","slope")}.
#' @param method character string specifying the type of clustering procedure
#' for the stability-based method. Either \code{hclust} or \code{kmeans}.
#' @param n_bootstrap an integer specifying the number of bootstrap comparisons
#' used by \code{stability}. See \link[cstab]{cStability}.
#' @param model_based boolean specifying whether the model-based or the
#' model-free should be used by \code{stability}, when method is
#' \code{kmeans}. See \link[cstab]{cStability} and Haslbeck & Wulff (2020).
#' @param n_gap integer specifying the number of simulated datasets used by
#' \code{gap}. See Tibshirani et al. (2001).
#'
#' @return A list containing two lists that store the results of the different
#' methods. \code{kopt} contains the estimated \code{k} for each of the
#' methods specified in \code{compute}. \code{paths} contains the values for
#' each \code{k} in \code{kseq} as computed by each of the methods specified
#' in \code{compute}. The values in \code{kopt} are optima for each of the
#' vectors in \code{paths}.
#'
#' @seealso \link{mt_distmat} for more information about how the distance matrix
#' is computed when the hclust method is used.
#'
#' \link{mt_cluster} for performing trajectory clustering with a specified
#' number of clusters.
#'
#' @examples
#'
#' \dontrun{
#' # Length normalize trajectories
#' KH2017 <- mt_length_normalize(KH2017)
#'
#' # Find k
#' results <- mt_cluster_k(KH2017, use="ln_trajectories")
#'
#' # Retrieve results
#' results$kopt
#' results$paths
#' }
#'
#' @author
#' Dirk U. Wulff
#'
#' Jonas M. B. Haslbeck
#'
#' @references Haslbeck, J. M. B., & Wulff, D. U. (2020). Estimating the Number
#' of Clusters via a Corrected Clustering Instability. \emph{Computational
#' Statistics, 35}, 1879–1894.
#'
#' Wulff, D. U., Haslbeck, J. M. B., Kieslich, P. J., Henninger, F., &
#' Schulte-Mecklenbeck, M. (2019). Mouse-tracking: Detecting types in movement
#' trajectories. In M. Schulte-Mecklenbeck, A. Kühberger, & J. G. Johnson
#' (Eds.), \emph{A Handbook of Process Tracing Methods} (pp. 131-145). New
#' York, NY: Routledge.
#'
#' Tibshirani, R., Walther, G., & Hastie, T. (2001). Estimating the number of
#' clusters in a data set via the gap statistic. \emph{Journal of the Royal
#' Statistical Society: Series B (Statistical Methodology), 63}(2), 411-423.
#'
#' Sugar, C. A., & James, G. M. (2013). Finding the number of clusters in a
#' dataset. \emph{Journal of the American Statistical Association, 98}(463),
#' 750-763.
#'
#' Fujita, A., Takahashi, D. Y., & Patriota, A. G. (2014). A non-parametric
#' method to estimate the number of clusters. \emph{Computational Statistics &
#' Data Analysis, 73}, 27-39.
#'
#' @export
mt_cluster_k <- function(
data,
use = 'ln_trajectories',
dimensions = c('xpos','ypos'),
# k-selection type
kseq = 2:15, # considered k-sequence
compute = c('stability','gap','jump','slope'),
method = 'hclust', # or 'kmeans'
# distance arguments
weights = rep(1,length(dimensions)),
pointwise = TRUE, #
minkowski_p = 2,
# stability arguments
hclust_method = 'ward.D', # hclust_method
kmeans_nstart = 10, # number of reinitializations for k-means algorithm
n_bootstrap = 10, # bootstrap samples
model_based = FALSE, # stab_predict
# arguments distance-based k-selection methods
n_gap = 10, # simulated datasets for Gap Statistic
na_rm = FALSE,
verbose=FALSE
){
# Extract data
trajectories <- extract_data(data,use)
# Tests
if (method=="hclust") {
method <- "hierarchical"
} else if (method != "kmeans") {
stop('Method must either be "hclust" or "kmeans"')
}
if(model_based == TRUE & method == 'hierarchical'){
stop('Model-based instability methods are only available for k-means clustering.')
}
# prepare trajectories
trajectories = prepare_trajectories(trajectories = trajectories,
dimensions = dimensions,
weights = weights,
na_rm = na_rm)
# Transform data structure for clustering input
rearranged_trajectories = cbind(trajectories[,,dimensions[1]],trajectories[,,dimensions[2]])
# Define containers
kopt = list()
paths = list()
# Stability-based k-selection methods
if('stability' %in% compute) {
if(verbose == TRUE) message('Calculating stability-based k-selection methods.')
cStab_obj <- cstab::cStability(data = rearranged_trajectories,
kseq = kseq,
nB = n_bootstrap,
norm = TRUE,
predict = model_based,
method = method,
linkage = hclust_method,
kmIter = kmeans_nstart,
pbar = FALSE)
kopt[['stab_kopt']] <- unlist(cStab_obj$k_instab_norm)
paths[['stab_seq']] <- cStab_obj$instab_path_norm
}
# ---- Distance-based k-selection methods
if(any(compute %in% c('gap','jump','slope'))) {
if(verbose == TRUE) message('Calculating distance-based k-selection methods.')
cDist_obj <- cstab::cDistance(data = rearranged_trajectories,
kseq = kseq,
method = method,
linkage = hclust_method,
kmIter = kmeans_nstart,
gapIter = n_gap)
if('gap' %in% compute){
kopt[['gap_kopt']] <- unlist(cDist_obj$k_Gap)
paths[['gap_seq']] <- cDist_obj$Gaps
}
if('jump' %in% compute){
kopt[['jump_kopt']] <- unlist(cDist_obj$k_Jump)
paths[['jump_seq']] <- cDist_obj$Jumps
}
if('slope' %in% compute){
kopt[['slope_kopt']] <- unlist(cDist_obj$k_Slope)
paths[['slope_seq']] <- cDist_obj$Slopes
}
}
#output
output = list(kopt=kopt,paths=paths)
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.