#' Estimating Number of Cluster with Gabriel Cross-validation (unadjusted)
#'
#' This function determines the number of clusters using Gabriel
#' cross-validation without adjusting for correlation between dimensions.
#'
#' @param input.data - data matrix
#' @param kclust.min - minimum number of clusters
#' @param kclust.max - maximum number of clusters
#' @param nfold.r - number of folds row-wise
#' @param nfold.c - number of folds column-wise
#' @param alg - clustering algorithm used k-means or spectral, with k-means as
#' the default
#' @return Returns a list of the final estimation of the number of clusters that
#' minimizes the cross-validation error and the cluster assignments for
#' each observation.
#'
#' @import fields dplyr
#'
#' @export
ClusterCVMain <- function(input.data, kclust.min, kclust.max, nfold.r, nfold.c, alg="k.means"){
# Random determine the column and row folds
split.col <- split(1:ncol(input.data), dplyr::ntile(stats::runif(ncol(input.data)),nfold.c))
split.data <- split(input.data, dplyr::ntile(stats::runif(nrow(input.data)),nfold.r))
# Create a sequence of cluster sizes to test on
clusts <- seq(kclust.min, kclust.max)
# Enumerate all combinations of row and column folds
folds.all <- expand.grid(1:nfold.r, 1:nfold.c)
# Initialize vector of CV errors per cluster
err.clust <- rep(NA, length(clusts))
# For each cluster
for (i in clusts){
# Initialize vector of CV errors per fold
err.fold <- rep(NA, nrow(folds.all))
# For each fold
for (k in 1:nrow(folds.all)){
# Split the data into training and test sets
vals <- 1:nfold.r
vals <- vals[vals!=folds.all[k,1]]
test <- dplyr::bind_rows(split.data[folds.all[k,1]])
train <- dplyr::bind_rows(split.data[vals])
col.ind <- unlist(split.col[folds.all[k,2]])
# Applying the clustering
train$clust <- cluster_alg(alg=alg, train_sub=train[,col.ind], i)
# Update the cluster means for each of the X, Y columns
groups.vals <- stats::aggregate(. ~ clust, data = train, FUN = mean)
groups.means.x <- groups.vals[,-c(1, col.ind+1)]
groups.means.y <- groups.vals[,col.ind+1]
# Assign each of the test observations to a cluster
# Based on the mean X values
test$assigns <- apply(fields::rdist(test[,-col.ind], groups.means.x), 1, which.min)
# Update the CV error for the fold
if (i > 1){
groups.means.y <- as.data.frame(groups.means.y)
groups.means.y$assigns <- 1:i
tmp <- merge(test, groups.means.y, by="assigns")
tmp <- tmp[,c(col.ind+1, (ncol(test)+1):ncol(tmp))]
err.fold[k] <- mean(sqrt(rowSums((tmp[,1:(ncol(tmp)/2)]-tmp[,(ncol(tmp)/2+1):ncol(tmp)])^2)))
} else{
groups.means.y <- as.data.frame(matrix(unlist(groups.means.y),
nrow=1,byrow=T))
groups.means.y$assigns <- 1:i
tmp <- merge(test, groups.means.y, by="assigns")
tmp <- tmp[,c(col.ind+1, (ncol(test)+1):ncol(tmp))]
err.fold[k] <- mean(sqrt(rowSums((tmp[,1:(ncol(tmp)/2)]-tmp[,(ncol(tmp)/2+1):ncol(tmp)])^2)))
}
}
err.clust[i] <- mean(err.fold)
}
kstar <- clusts[which.min(err.clust)]
clust.f <- cluster_alg(alg=alg, train_sub=input.data, kstar)
# Return list of kstar and cluster assignments
return(list(kstar=kstar, clust.assign=clust.f))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.