## ------------------------------------------------------------------------------
## The core clustering algorithm
## ------------------------------------------------------------------------------
#' Core clustering.
#'
#' @param data The data. Either a matrix or a data frame, depending on the requirements of the clustering function.
#' @param clustering_func A clustering function to use, which given data returns cluster indices.
#' @param sampling_func A sampling function to use. Default is NULL. Only used when method is \code{distribution}.
#' @param method The method to use. String. Either \code{bootstrap} to
#' use bootstrapping or \code{distribution} to calculate the exact
#' co-occurrence probabilities.
#' @param alpha The confidenence level. Default 0.1
#' @param n_iter The number of iterations to use. The default is 1000.
#' @param weak_cluster The idnex to use for weak points. Default is -1.
#' @param using_trim Boolean. Does the clustering method use trimming?
#' @param seed Random seed. Default is NULL to not set the random seed.
#'
#' @return A list structure:
#' \describe{
#' \item{data}{The data}
#' \item{alpha}{The confidence level}
#' \item{iterations}{The number of iterations}
#' \item{cluster}{The initial clustering}
#' \item{cluster_core}{The core clustering}
#' \item{mat}{The co-occurrence probability matrix}
#' \item{amat}{The adjacency matrix}
#' \item{using_trim}{Whether or not the clustering method is using trimming}
#' \item{weak_cluster}{The index used to describe the weak clusters}
#' }
#'
#' @export
#' @useDynLib corecluster
#' @importFrom Rcpp sourceCpp
core_clustering <- function(data,
clustering_func = NULL,
sampling_func = NULL,
method = "bootstrap",
alpha = 0.1,
n_iter = 1000,
weak_cluster = -1,
using_trim = FALSE,
seed = NULL) {
## Set the random seed
if (! is.null(seed))
set.seed(seed)
## Check which method is to be used
if (! method %in% c("bootstrap", "distribution"))
stop("Unknown method. Cannot continue")
## Check if clustering function is given
if (is.null(clustering_func))
stop("No clustering function provided. Cannot continue")
## Get initial clustering
clusters_initial <- clustering_func(data)
## Check which method that is to be used
if (method == "bootstrap") {
pmat <- matrix(1 / nrow(data), nrow = nrow(data), ncol = nrow(data))
pmat <- co_occurrence_prob_all(data = data, cluster_function = clustering_func, iter = n_iter)
}
if (method == "distribution") {
if (is.null(sampling_func))
stop("No sampling function provided. Cannot continue")
pmat <- matrix(0, nrow = nrow(data), ncol = nrow(data))
NN <- nrow(data)
k <- 0
kmax <- (NN*NN - N) / 2
## Initialize a progress bar
prog_bar <- txtProgressBar(min = 0, max = kmax, initial = 0, style = 3)
for (i in seq.int(nrow(data) - 1)) {
for (j in seq.int(i + 1, nrow(data))) {
setTxtProgressBar(prog_bar, k)
k <- (k + 1)
pmat[i, j] <- co_occurrence_prob_single_parallel(data[i,], data[j,], data_function = sampling_func, cluster_function = clustering_func, iter = n_iter)
}
}
close(prog_bar)
}
## Discretize to get adjacency matrix
amat <- 1 * (pmat >= (1 - alpha))
g <- graph.adjacency(adjmatrix = amat, mode = "upper", diag = TRUE)
## g_plottable <- simplify(g, remove.multiple = FALSE, remove.loops = TRUE)
## Determine largest maximal cliques inside each cluster
tmp <- find_core_clusters(g, clusters_initial)
## If we are using a clustering method that does trimming (e.g. tkmeans), we must remove the
## first cluster from the core clusters, corresponding to cluster index zero, i.e., the trimmed
## points
if (using_trim)
tmp <- tmp[-1]
clusters_core <- clusters_from_cliques(tmp, nrow(data), weak_cluster = weak_cluster)
## Return the result
list("data" = data,
"alpha" = alpha,
"iterations" = n_iter,
"cluster" = clusters_initial,
"cluster_core" = clusters_core,
"pmat" = pmat,
"amat" = amat,
"using_trim" = using_trim,
"weak_cluster" = weak_cluster)
}
## ------------------------------------------------------------------------------
## Functions for calculating the co-occurrence probabilities
## ------------------------------------------------------------------------------
#' Estimate all co-occurence probabilities Pij using bootstrap and within-bag-samples
#'
#' @param data A matrix with data points.
#' @param cluster_function A clustering function that returns a set of cluster labels.
#' @param iter The number of iterations. Optional. Default is 1000.
#'
#' @return A square matrix of size (nrow(data) x nrow(data))
#' containing the co-occurence probabilities of the items in the data
#' matrix.
#'
#' @export
co_occurrence_prob_all <- function(data, cluster_function, iter = 1000) {
n <- nrow(data)
same <- matrix((1 / n), nrow = n, ncol = n)
count <- matrix(1, nrow = n, ncol = n)
## Initialize a progress bar
prog_bar <- txtProgressBar(min = 0, max = iter, initial = 0, style = 3)
for (k in seq.int(iter)) {
setTxtProgressBar(prog_bar, k)
## loop until a clustering result is achieved
## -----------------------
bp <- 0
cl <- NULL
while ("success" != tryCatch({
idx <- sample.int(n, size = n, replace = TRUE)
cl <- cluster_function(data[idx, ])
"success"
}, error = function(e) {
print(e)
"failure"
}, finally = {
bp <- bp + 1
})) {
if (bp >= 5) {
print("break in co_occurrence_prob_all")
break
}
}
## --------------------------------------------------
## call C++ code
## --------------------------------------------------
if (! is.null(cl)) {
res <- co_occurrence_mat(idx, cl, same, count)
same <- res$same
count <- res$count
}
## --------------------------------------------------
}
close(prog_bar)
diag(same) <- diag(count)
same / count
}
#' Estimate co-occurence probability Pij of two data items x1 and x2. The function is
#' parallellized and utilizes (number-of-cores - 1) parallel computations.
#'
#' @param x1 Data item i.
#' @param x2 Data item j.
#' @param data_function A function, such that data_function() returns n-2 resampled data points.
#' @param cluster_function A clustering function that returns a set of cluster labels.
#' @param iter The number of iterations. Optional. Default is 1000.
#'
#' @return The co-occurence probability of x1 and x2; i.e., the
#' probability that data items x1 and x2 occur in the same cluster.
#'
#' @export
co_occurrence_prob_single_parallel <- function(x1, x2, data_function, cluster_function, iter = 1000) {
res <- mclapply(1:iter, function(i) co_occurrence(x1, x2, data_function, cluster_function), mc.cores = (detectCores() - 1))
res <- unlist(res)
sum(res, na.rm = TRUE) / sum(! is.na(res))
}
#' A helper function for that determines if two data items x1 and x2 occur in one
#' clustering of the data
#'
#' @param x1 Data item i.
#' @param x2 Data item j.
#' @param data_function A function, such that data_function() returns n-2 resampled data points.
#' @param cluster_function A clustering function that returns a set of cluster labels.
#'
#' @return 1 if x1 and x2 are in the same clustering, 0 if not. NULL if the clustering failed.
#'
#' @export
co_occurrence <- function(x1, x2, data_function, cluster_function) {
clu <- NULL
bp <- 0
while ("success" != tryCatch({
clu <- cluster_function(rbind(x1, x2, data_function()))
"success"
}, error = function(e) {
print(e)
"failure"
}, finally = {
bp <- bp + 1
})) {
if (bp >= 5) {
print("break in co_occurrence_prob_all")
break
}
}
## check if the clustering failed and returned a NULL
if (is.null(clu))
return(NA)
## otherwise
if (clu[1] == clu[2]) { 1 } else { 0 }
}
## ------------------------------------------------------------------------------
## Functions for finding the core clusters
## ------------------------------------------------------------------------------
#' Determine the core clusters
#'
#' @param g A graph
#' @param clusters An initial clustering
#'
#' @return The core clusters
#'
#' @export
find_core_clusters <- function(g, clusters) {
out <- vector(mode = "list", length = length(unique(clusters)))
j <- 1
for (i in sort(unique(clusters))) {
vtmp <- sort(which(clusters == i))
g2 <- induced.subgraph(g, v = vtmp)
cl <- find_largest_clique(g2)
out[[j]] <- vtmp[cl]
j <- j + 1
}
out
}
#' Find the largest clique in the graph g
#'
#' @param g A graph
#'
#' @return The indices of the items in the largest clique in g.
#'
#' @export
find_largest_clique <- function(g) {
mcl <- maximal.cliques(g)
mcl[[which.max(sapply(mcl, length))]]
}
#' Get the clusters from the cliques
#'
#' @param cliques The cliques
#' @param N The number of data items.
#' @param weak_cluster The id of the weak points. The default is 0.
#'
#' @return The clusters
#'
#' @export
clusters_from_cliques <- function(cliques, N, weak_cluster = 0) {
clusters <- rep(weak_cluster, N)
for (i in seq.int(length(cliques)))
clusters[unlist(cliques[[i]])] <- i
clusters
}
## ------------------------------------------------------------------------------
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.