Nothing
#' Permutation Test on Monothetic Tree
#'
#' Testing the significance of each monothetic clustering split by permutation
#' methods. The "simple-withhold" method (`"sw"`) shuffles the observations
#' between two groups without the splitting variable. The other two methods
#' shuffle the values in the splitting variable to create a new data set, then
#' it either splits again on that variable ("resplit-limit", `"rl"`) or use all
#' variables as the splitting candidates ("resplit-nolimit", `"rn"`).
#'
#' @param object The `MonoClust` object as the result of the clustering.
#' @param data The data set which is being clustered.
#' @param auto.pick Whether the algorithm stops when p-value becomes larger than
#' `sig.val` or keeps testing and let the researcher pick the final splitting
#' tree. Default value is `FALSE`.
#' @param sig.val Significance value to decide when to stop splitting. This
#' option is ignored if `auto.pick = FALSE`, and is 0.05 by default when
#' `auto.pick = TRUE`.
#' @param method Can be chosen between `sw` (simple-withhold, default), `rl`
#' (resplit-limit), or `rn` (resplit-nolimit). See Details.
#' @param rep Number of permutations required to calculate test statistic.
#' @param stat Statistic to use. Choosing between `"f"` (Calinski-Harabasz's
#' pseudo-F (Calinski and Harabasz, 1974)) or `"aw"` (Average silhoutte width
#' by Rousseeuw (1987)).
#' @param bon.adj Whether to adjust for multiple testing problem using
#' Bonferroni correction.
#' @param ncores Number of CPU cores on the current host. When set to NULL,
#' all available cores are used.
#'
#' @return The same `MonoClust` object with an extra column (p-value), as well
#' as the `numofclusters` object if `auto.pick = TRUE`.
#'
#' @details
#' ## Permutation Methods
#' ### Simple-Withhold: Shuffle the observations between two proposed clusters
#' The `stat` calculated from the shuffles create the reference distribution
#' to find the p-value. Because the splitting variable that was chosen is
#' already the best in terms of reduction of inertia, that variable is withheld
#' from the distance matrix used in the permutation test.
#'
#' ### Resplit-Limit: Shuffle splitting variable, split again on that variable
#' This method shuffles the values of the splitting variables while keeping
#' other variables fixed to create a new data set, then the chosen `stat` is
#' calculated for each rep to compare with the observed `stat`.
#'
#' ### Resplit-Nolimit: Shuffle splitting variable, split on all variables
#' Similar to Method 2 but all variables are splitting candidates.
#'
#' ## Bonferroni Correction
#' A hypothesis test occurred lower in the monothetic clustering tree could have
#' its p-value corrected for multiple tests happened before it in order to reach
#' that node. The formula is
#' \deqn{adj.p = unadj.p \times depth,}
#' with \eqn{depth} is 1 at the root node.
#'
#' @note This function uses [foreach::foreach()] to facilitate parallel
#' processing. It distributes reps to processes.
#'
#' @references
#' Calinski, T. and Harabasz, J (1974). "A dendrite method for cluster
#' analysis". en. In: *Communications in Statistics* 3.1, pp. 1-27.
#' \doi{10.1080/03610927408827101}.
#'
#' Rousseeuw, P. J. (1987). "Silhouettes: A graphical aid to the interpretation
#' and validation of cluster analysis". In: *Journal of Computational and
#' Applied Mathematics* 20, pp. 53-65. ISSN: 03770427.
#' \doi{10.1016/0377-0427(87)90125-7}.
#'
#' @export
#'
#' @examples
#' library(cluster)
#' data(ruspini)
#' \donttest{
#' ruspini6sol <- MonoClust(ruspini, nclusters = 6)
#' ruspini6.p_value <- perm.test(ruspini6sol, data = ruspini, method = "sw",
#' rep = 1000)
#' ruspini6.p_value
#' }
perm.test <- function(object, data, auto.pick = FALSE, sig.val = 0.05,
method = c("sw", "rl", "rn"),
rep = 1000L,
stat = c("f", "aw"),
bon.adj = TRUE,
ncores = 1L) {
if (!is_MonoClust(object))
stop("Not a legitimate \"MonoClust\" object.")
method <- match.arg(method)
stat <- match.arg(stat)
if (!is.null(ncores)){
if (!is.numeric(ncores)) {
stop("\"ncores\" should be either NULL or a positive integer.")
}
if (ncores < 1) {
stop("\"ncores\" should be > 1.")
}
}
if (is.null(ncores))
ncores <- parallel::detectCores() - 1
frame <- object$frame
# It would be better to create a jump table for reference of tree walking
jump_table <- tibble::add_column(make_jump_table(frame),
split.order = frame$split.order)
jump_table$p_value <- NA
jump_table$members <- NA
jump_table$members[1] <- paste(seq_len(nrow(data)), collapse = ",")
# Now tracing the tree to find the cluster Trace along the split order,
# quit when reaching the end or if auto.pick, stop when p value > sig.val
last_split <- max(jump_table$split.order[!is.na(jump_table$split.order)]) + 1
for (i in 1:max(jump_table$split.order[!is.na(jump_table$split.order)])) {
current <- which(jump_table$split.order == i)
# Node
split_var <- jump_table$var[current]
split_value <- jump_table$cut[current]
members <- as.numeric(strsplit(jump_table$members[current], ",")[[1]])
members_l <- members[which(data[members, split_var] < split_value)]
jump_table$members[jump_table$left[current]] <-
paste(members_l, collapse = ",")
members_r <- setdiff(members, members_l)
jump_table$members[jump_table$right[current]] <-
paste(members_r, collapse = ",")
p_value_unadj <- test_split(members_l, members_r, method, data, split_var,
rep, stat, ncores)
# If Bonferroni correction is applied
p_value <- ifelse(bon.adj, p_value_unadj * i, p_value_unadj)
jump_table$p_value[current] <- ifelse(p_value > 1, 1,
ceiling(p_value * rep) / rep)
# If auto.pick is applied
if (auto.pick) {
if (p_value > sig.val) {
last_split <- i
break
} else if (i ==
max(jump_table$split.order[!is.na(jump_table$split.order)])) {
last_split <- i + 1
}
}
}
frame$p.value <- jump_table$p_value
object$frame <- frame
object$numofclusters <- last_split
return(object)
}
#' Hypothesis Test at Split
#'
#' @param members_l,members_r Vector of the index of observations that
#' are members of the left child node and the right child node, respectively.
#' @param split_var Splitting variable at current split.
#' @inheritParams perm.test
#'
#' @return p-value of the test
#' @keywords internal
test_split <- function(members_l, members_r, method, data, split_var, rep,
stat, ncores) {
# Membership is consecutive because the distance matrix will be moved around
# with members_l and members_r put next to each other.
fmem2 <- c(rep(1, length(members_l)), rep(2, length(members_r)))
if (ncores > 1) {
# Initiate processes
cl <- parallel::makeCluster(ncores)
doParallel::registerDoParallel(cl)
`%dodo%` <- foreach::`%dopar%`
} else {
`%dodo%` <- foreach::`%do%`
}
if (method == "sw") {
# Simple-withhold: shuffling the membership, not used splitting variable
data_reduced <- data[colnames(data) != split_var]
} else {
data_reduced <- data
}
# A distance matrix with observations left and right put in order
current_data <- data_reduced[c(members_l, members_r), , drop = FALSE]
# Find the observed statistic
distmat_twogroup <- as.matrix(cluster::daisy(current_data))
stat_obs_l <- cluster_stats(distmat_twogroup, fmem2)
# Create permuted matrix
perm <- permute::shuffleSet(nrow(current_data),
control = permute::how(nperm = rep))
# The number of permutations may be smaller than rep
permutations <- nrow(perm)
# Shuffling
stat_rep_l <-
if (method == "sw") {
foreach::foreach(k = 1:permutations,
.inorder = FALSE) %dodo% {
distmat_rep <- distmat_twogroup[perm[k, ], perm[k, ]]
return(cluster_stats(distmat_rep, fmem2))
}
} else if (method == "rl") {
foreach::foreach(k = 1:permutations,
.inorder = FALSE,
.packages = c("monoClust")) %dodo% {
current_data[, split_var] <- current_data[perm[k, ],
split_var]
# Resplit-limit: limit the splitting variables
cluster_rep <- MonoClust(toclust = current_data,
nclusters = 2,
variables = split_var)
distmat_rep <- cluster_rep$Dist
fmem2_rep <- cluster_rep$Membership
return(cluster_stats(distmat_rep, fmem2_rep))
}
} else if (method == "rn") {
foreach::foreach(k = 1:permutations,
.inorder = FALSE,
.packages = c("monoClust")) %dodo% {
current_data[, split_var] <- current_data[perm[k, ],
split_var]
# Resplit-nolimit: splitting all variables
cluster_rep <- MonoClust(toclust = current_data,
nclusters = 2)
distmat_rep <- cluster_rep$Dist
fmem2_rep <- cluster_rep$Membership
return(cluster_stats(distmat_rep, fmem2_rep))
}
}
if (ncores > 1) {
# Stop processes
parallel::stopCluster(cl)
}
stat_rep_tbl <- dplyr::bind_rows(stat_rep_l)
if (stat == "F") {
stat_obs <- stat_obs_l$f_stat
stat_rep <- stat_rep_tbl$f_stat
} else {
stat_obs <- stat_obs_l$asw
stat_rep <- stat_rep_tbl$asw
}
p_value <- (sum(stat_rep >= stat_obs) + 1 -
sqrt(.Machine$double.eps)) / (permutations + 1)
return(p_value)
}
#' Cluster Statistics Calculation
#'
#' Calinski-Harabasz's pseudo-F (Calinski and Harabasz, 1974) and Average
#' silhoutte width (Rousseeuw, 1987) calculation.
#'
#' @param d Distance object (as generated by [dist()]) or a distance matrix
#' between cases.
#' @param clustering Integer vector of length of the number of cases, which
#' indicates a clustering. The clusters have to be numbered from 1 to the
#' number of clusters.
#'
#' @return
#' \describe{
#' \item{f_stat}{Calinski-Harabasz's pseudo-F.}
#' \item{asw}{Average silhouette width.}
#' }
#'
#' @references
#' * Caliński, T. and Harabasz, J (1974). "A dendrite method for cluster
#' analysis". en. In: *Communications in Statistics* 3.1, pp. 1–27.
#' \doi{10.1080/03610927408827101}.
#' * Rousseeuw, P. J. (1987). "Silhouettes: A graphical aid to the
#' interpretation and validation of cluster analysis". In: *Journal of
#' Computational and Applied Mathematics* 20, pp. 53–65. ISSN: 03770427.
#' \doi{10.1016/0377-0427(87)90125-7}.
#'
#' @seealso [cluster::silhouette()]
#'
#' @keywords internal
cluster_stats <- function(d, clustering) {
d <- stats::as.dist(d)
nclust <- max(clustering)
n <- length(clustering)
# Total cluster sum of squares
total_ss <- sum(d^2) / n
# Within cluster sum of squares
dmat <- as.matrix(d)
within_ss <- 0
for (i in 1:nclust) {
d_i <- stats::as.dist(dmat[clustering == i, clustering == i])
if (i <= nclust) {
within_ss <- within_ss + sum(d_i^2) / sum(clustering == i)
}
}
# Between cluster sum of squares
between_ss <- total_ss - within_ss
f_stat <- between_ss * (n - nclust) / (within_ss * (nclust - 1))
# Average silhouette width
sii <- cluster::silhouette(clustering, dmatrix = dmat)
sc <- summary(sii)
asw <- sc$avg.width
return(list(f_stat = f_stat, asw = asw))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.