Nothing
#' Selection of number of clusters via distance-based measures
#'
#' Selection of number of clusters via \emph{gap statistic}, \emph{jump statistic}, and
#' \emph{slope statistic}
#'
#' @param data a n x p data matrix of type numeric.
#' @param kseq a vector with considered numbers clusters k > 1
#' @param method character string indicating the clustering algorithm. 'kmeans' for the
#' k-means algorithm, 'hierarchical' for hierarchical clustering.
#' @param linkage character specifying the linkage criterion, in case
#' \code{type='hierarchical'}. The available options are "single", "complete",
#' "average", "mcquitty", "ward.D", "ward.D2", "centroid" or "median". See
#' \link[stats]{hclust}.
#' @param kmIter integer specifying the the number of restarts of the k-means algorithm
#' in order to avoid local minima.
#' @param gapIter integer specifying the number of simulated datasets to compute the
#' \emph{gap statistic} (see Tibshirani et al., 2001).
#'
#' @return a list with the optimal numbers of cluster determined by the \emph{gap statistic}
#' (Tibshirani et al., 2001), the \emph{jump Statistic} (Sugar & James, 2011) and the
#' \emph{slope statistic} (Fujita et al., 2014). Along the function returns the \emph{gap},
#' \emph{jump} and \code{slope} for each k in \code{kseq}.
#'
#' @references
#' Tibshirani, R., Walther, G., & Hastie, T. (2001). Estimating the number of clusters in a
#' data set via the gap statistic. \emph{Journal of the Royal Statistical Society: Series B
#' (Statistical Methodology), 63}(2), 411-423.
#'
#' Sugar, C. A., & James, G. M. (2011). Finding the number of clusters in a dataset. \emph{Journal
#' of the American Statistical Association, 98}(463), 750-763,
#'
#' Fujita, A., Takahashi, D. Y., & Patriota, A. G. (2014). A non-parametric method to estimate
#' the number of clusters. \emph{Computational Statistics & Data Analysis, 73}, 27-39.
#'
#' @author
#' Dirk U. Wulff <dirk.wulff@gmail.com>
#' Jonas M. B. Haslbeck <jonas.haslbeck@gmail.com>
#'
#' @examples
#' \dontrun{
#' # Generate Data from Gaussian Mixture
#' s <- .1
#' n <- 50
#' data <- rbind(cbind(rnorm(n, 0, s), rnorm(n, 0, s)),
#' cbind(rnorm(n, 1, s), rnorm(n, 1, s)),
#' cbind(rnorm(n, 0, s), rnorm(n, 1, s)),
#' cbind(rnorm(n, 1, s), rnorm(n, 0, s)))
#' plot(data)
#'
#' # Selection of Number of Clusters using Distance-based Measures
#' cDistance(data, kseq=2:10)
#' }
#'
#' @export
cDistance <- function(data, # n x p data matrix
kseq, #sequence of ks to be checked
method = 'kmeans', # or: hiearchical
linkage = 'complete',
kmIter = 10, # restarts of k means algorithm
gapIter = 10) # number of simulated datasets in gap statistic
{
# ----- INPUT TESTS
# On Data
if(sum(is.na(data))>0) stop('No missing values permitted!')
# On k-sequence
if(1 %in% kseq) stop('Please select a k sequence starting with 2: {2,3,...K}!')
# On type
if(method %in% c('kmeans', 'hierarchical'))
# ----- HELPERS
n = nrow(data)
dims = ncol(data)
if(!1 %in% kseq) kseq = c(1,kseq)
# ----- EVALUATE REAL DATA
WCD <- Sil <- MSE <- numeric()
for(k in kseq) {
obj = getMeasures(data, k, method=method, linkage=linkage, kmIter = kmIter, measures = c('wcd','sil','mse'))
WCD[k] = obj$WCD
Sil[k] = obj$Sil
MSE[k] = obj$MSE
}
# ----- EVALUATE SYNTHETIC DATA (Gap-statistic)
WCD_runs = matrix(NA,nrow=gapIter, ncol=length(kseq))
for(i in 1:gapIter) {
data_syn = UniformData(data)
WCDs = numeric()
for(j in 1:length(kseq)) {
k = kseq[j]
obj = getMeasures(data_syn, k, method=method, linkage=linkage, kmIter = kmIter, measures = c('wcd'))
WCDs[j] = obj$WCD
}
WCD_runs[i,] = WCDs
}
WCD_syn = colMeans(WCD_runs)
# ----- COMPUTE MEASURES
# Gap Statistic
WCD_dat_log = log(WCD)
WCD_syn_log = log(WCD_syn)
WCD_dat_log = WCD_dat_log - WCD_dat_log[1]
WCD_syn_log = WCD_syn_log - WCD_syn_log[1]
gap = WCD_syn_log - WCD_dat_log
kopt_gap = kseq[gap == max(gap)]
# Slope Statistic
p = 1
slope = -(Sil[-1] - Sil[-length(Sil)]) * Sil[-1]^p
kopt_slope = kseq[slope == max(slope)]
## Jump Statistic
MSE_tr = MSE^(- dims/2)
jump = (MSE_tr - c(0, MSE_tr[-length(MSE_tr)])) #[-1]
kopt_jump = kseq[jump == max(jump)]
outlist <- list('k_Gap'=kopt_gap,
'k_Slope'=kopt_slope,
'k_Jump'=kopt_jump,
'WCD'=WCD,
'WCD_syn'=WCD_syn,
'Gaps'= gap,
'Silhouettes'=Sil,
'Slopes'=slope,
'Jumps'=jump)
return(outlist)
} # EoF
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.