R/cc_crossclustering.R

#' A partial clustering algorithm with automatic estimation of
#' the number of clusters and identification of outliers
#'
#' This function performs the CrossClustering algorithm. This method
#' combines the Ward's minimum variance and Complete-linkage (default,
#' useful for finding spherical clusters) or Single-linkage (useful for
#' finding elongated clusters) algorithms, providing automatic estimation of
#' a suitable number of clusters and identification of outlier elements.
#'
#' @param dist A dissimilarity structure as produced by the function
#'        \code{dist}
#' @param k_w_min [int] Minimum number of clusters for the Ward's minimum
#'        variance method. By default is set equal 2
#' @param k_w_max [int] Maximum number of clusters for the Ward's minimum
#'        variance method (see details)
#' @param k2_max [int] Maximum number of clusters for the
#'        Complete/Single-linkage method. It can not be equal or greater
#'        than the number of elements to cluster (see details)
#' @param out [lgl] If \code{TRUE} (default) outliers must be searched (see
#'        details)
#' @param method [chr] "complete" (default) or "single". CrossClustering
#'        combines Ward's algorithm with Complete-linkage if method is set
#'        to "complete", otherwise (if method is set to 'single')
#'        Single-linkage will be used.
#' @return A list of objects describing characteristics of the partitioning
#'         as follows:
#'           \item{Optimal_cluster}{number of clusters}
#'           \item{Cluster_list}{a list of clusters; each element of this
#'           lists contains the indices of the elements belonging to the
#'           cluster}
#'           \item{Silhouette}{the average silhouette width over all the
#'             clusters}
#'           \item{n_total}{total number of input elements}
#'           \item{n_clustered}{number of input elements that have actually
#'             been clustered}
#'
#' @export
#'
#' @details See cited document for more details.
#' @examples
#' library(CrossClustering)
#'
#' #### Example of Cross-Clustering as in reference paper
#' #### method = "complete"
#'
#' data(toy)
#'
#' ### toy is transposed as we want to cluster samples (columns of the
#' ### original matrix)
#' toy_dist <- t(toy) %>%
#'   dist(method = "euclidean")
#'
#' ### Run CrossClustering
#' cc_crossclustering(toy_dist,
#'   k_w_min = 2,
#'   k_w_max = 5,
#'   k2_max  = 6,
#'   out     = TRUE
#' )
#'
#' #### Simulated data as in reference paper
#' #### method = "complete"
#' set.seed(10)
#' sg <- c(500, 250, 700, 300, 100)
#'
#' # 5 clusters
#'
#' t <- matrix(0, nrow = 5, ncol = 5)
#' t[1, ] <- rep(6, 5)
#' t[2, ] <- c( 0,  5, 12, 13, 15)
#' t[3, ] <- c(15, 11,  9,  5,  0)
#' t[4, ] <- c( 6, 12, 15, 10,  5)
#' t[5, ] <- c(12, 17,  3,  7, 10)
#'
#' t_mat <- NULL
#' for (i in seq_len(nrow(t))){
#'   t_mat <- rbind(
#'     t_mat,
#'     matrix(rep(t[i, ], sg[i]), nrow = sg[i], byrow = TRUE)
#'   )
#' }
#'
#' data_15 <- matrix(NA, nrow = 2000, ncol = 5)
#' data_15[1:1850, ] <- matrix(abs(rnorm(sum(sg) * 5, sd = 1.5)),
#'   nrow = sum(sg),
#'   ncol = 5
#' ) + t_mat
#'
#' set.seed(100) # simulate outliers
#' data_15[1851:2000, ] <- matrix(
#'   runif(n = 150 * 5, min = 0, max = max(data_15, na.rm = TRUE)),
#'   nrow = 150,
#'   ncol = 5
#' )
#'
#' ### Run CrossClustering
#' cc_crossclustering(dist(data_15),
#'   k_w_min = 2,
#'   k_w_max = 19,
#'   k2_max  = 20,
#'   out     = TRUE
#' )
#'
#'
#' #### Correlation-based distance is often used in gene expression time-series
#' ### data analysis. Here there is an example, using the "complete" method.
#'
#' data(nb_data)
#' nb_dist <- as.dist(1 - abs(cor(t(nb_data))))
#' cc_crossclustering(dist = nb_dist, k_w_max = 20, k2_max = 19)
#'
#'
#'
#'
#' #### method = "single"
#' ### Example on a famous shape data set
#' ### Two moons data
#'
#' data(twomoons)
#'
#' moons_dist <- twomoons[, 1:2] %>%
#'   dist(method = "euclidean")
#'
#' cc_moons <- cc_crossclustering(moons_dist,
#'   k_w_max = 9,
#'   k2_max  = 10,
#'   method  = 'single'
#' )
#'
#' moons_col <- cc_get_cluster(cc_moons)
#' plot(twomoons[, 1:2], col = moons_col,
#'   pch      = 19,
#'   xlab     = "",
#'   ylab     = "",
#'   main     = "CrossClustering-Single"
#' )
#'
#' ### Worms data
#' data(worms)
#'
#' worms_dist <- worms[, 1:2] %>%
#'   dist(method = "euclidean")
#'
#' cc_worms <- cc_crossclustering(worms_dist,
#'   k_w_max = 9,
#'   k2_max  = 10,
#'   method  = 'single'
#' )
#'
#' worms_col <-  cc_get_cluster(cc_worms)
#'
#' plot(worms[, 1:2], col = worms_col,
#'   pch      = 19,
#'   xlab     = "",
#'   ylab     = "",
#'   main     = "CrossClustering-Single"
#' )
#'
#'
#' ### CrossClustering-Single is not affected to chain-effect problem
#'
#' data(chain_effect)
#'
#' chain_dist <- chain_effect %>%
#'   dist(method = "euclidean")

#' cc_chain <- cc_crossclustering(chain_dist,
#'   k_w_max = 9,
#'   k2_max  = 10,
#'   method  = 'single'
#' )
#'
#' chain_col <- cc_get_cluster(cc_chain)
#'
#' plot(chain_effect, col = chain_col,
#'   pch  = 19,
#'   xlab = "",
#'   ylab = "",
#'   main = "CrossClustering-Single"
#' )
#'
#' @author
#' Paola Tellaroli, <paola [dot] tellaroli [at] unipd [dot] it>;;
#' Marco Bazzi, <bazzi [at] stat [dot] unipd [dot] it>;
#' Michele Donato, <mdonato [at] stanford [dot] edu>
#'
#' @references
#' Tellaroli P, Bazzi M., Donato M., Brazzale A. R., Draghici S. (2016).
#' Cross-Clustering: A Partial Clustering Algorithm with Automatic
#' Estimation of the Number of Clusters. PLoS ONE 11(3):   e0152333.
#' doi:10.1371/journal.pone.0152333
#'
#' #' Tellaroli P, Bazzi M., Donato M., Brazzale A. R., Draghici S. (2017).
#' E1829: Cross-Clustering: A Partial Clustering Algorithm with Automatic
#' Estimation of the Number of Clusters. CMStatistics 2017, London 16-18
#' December, Book of Abstracts (ISBN 978-9963-2227-4-2)


cc_crossclustering <- function(dist,
  k_w_min = 2,
  k_w_max = attr(dist, 'Size') - 2,
  k2_max  = k_w_max + 1,
  out     = TRUE,
  method  = c('complete', 'single')
) {
  # imput check
  assertive::assert_is_any_of(dist, 'dist')
  assertive::assert_is_a_number(k_w_min)

  assertive::assert_all_are_positive(k_w_min)
  if (!assertive::assert_all_are_equal_to(k_w_min, as.integer(k_w_min))) {
    stop(paste0('k_w_min must be an integer. It is: ', k_w_min, '.'))
  }

  assertive::assert_is_a_number(k_w_max)
  assertive::assert_all_are_less_than(k_w_min, k_w_max)
  if (!assertive::assert_all_are_equal_to(k_w_max, as.integer(k_w_max))) {
    stop(paste0('k_w_max must be an integer. It is: ', k_w_max, '.'))
  }

  assertive::assert_is_a_number(k2_max)
  assertive::assert_all_are_less_than(k_w_min, k2_max)
  if (!assertive::assert_all_are_equal_to(k2_max, as.integer(k2_max))) {
    stop(paste0('k2_max must be an integer. It is: ', k2_max, '.'))
  }

  assertive::assert_is_a_bool(out)
  assertive::assert_is_character(method)

  method <- match.arg(method)

  n <- attr(dist, 'Size')

  cluster_ward  <- hclust(dist, method = "ward.D")
  cluster_other <- hclust(dist, method = method)

  grid <- as.matrix(expand.grid(k_w_min:k_w_max, k_w_min:k2_max))

  if (out) {
    grid <- grid[grid[, 2] >  grid[, 1], ]
  } else {
    grid <- grid[grid[, 2] >= grid[, 1], ]
  }

  n_clu <- vector('list', length = nrow(grid))

  for(i in seq_len(nrow(grid))) {
    n_clu[[i]] <- consensus_cluster(grid[i, ],
      cluster_ward  = cluster_ward,
      cluster_other = cluster_other
    )[['max_consensus']]
  }

  grid <- cbind(grid, unlist(n_clu))
  colnames(grid) <- c("Ward", method, "N. classified")

  grid_star <- which(grid == max(grid[, 3]), arr.ind = TRUE)[, 1]
  k_star    <- rbind(grid[grid_star, 1:2])

  if(is.null(dim(k_star))){
    cluster_list <- consensus_cluster(k_star,
      cluster_ward  = cluster_ward,
      cluster_other = cluster_other
    )
    clustz <- cc_get_cluster(cluster_list$elements, n)
  } else {
    cluster_list <- apply(k_star, 1, consensus_cluster,
      cluster_ward  = cluster_ward,
      cluster_other = cluster_other
    )
    clustz <- sapply(cluster_list,
      function(lasim) cc_get_cluster(lasim$elements, n)
    )
  }

  if(is.null(dim(clustz))){
    clustz <- matrix(clustz, ncol = 1)
  }

  Sil <- vector('double', length = ncol(clustz))

  for (c in seq_len(ncol(clustz))) {
    Sil[[c]] <- mean(
      cluster::silhouette(as.numeric(clustz[, c]), dist = dist)[, 3]
    )
  }

  if(is.null(dim(k_star))) {
    k_star_star <- k_star[which.max(Sil)]
  } else {
    k_star_star <- k_star[which.max(Sil), ]
  }

  Cluster_list <- cluster_list[[which.max(Sil)]]$elements
  Cluster_list <- stats::setNames(Cluster_list,
    paste("cluster", seq_along(Cluster_list), sep = "_")
  )

  n_clustered  <- length(unlist(Cluster_list))



  structure(Cluster_list,
    optimal_cluster = length(cluster_list[[which.max(Sil)]]$elements),
    silhouette      = max(unlist(Sil)),
    n_total         = n,
    n_clustered     = n_clustered,
    input           = list(
      dist    = dist,
      k_w_min = k_w_min,
      k_w_max = k_w_max,
      k2_max  = k2_max,
      out     = out,
      method  = method
    ),
    class = "crossclustering"
  )
}


#' print method for crossclustering class
#'
#' @inheritParams base::print
#' @describeIn cc_crossclustering
#'
#' @export
print.crossclustering <- function(x, ...) {
  cat(paste0("\n",
    "    CrossClustering with method ",
         crayon::green(attr(x, 'input')[['method']]), ".\n",
    "\n"
  ))
  cat(paste0("Parameter used:\n",
    "  - Interval for the number of cluster of Ward's algorithm: [",
       crayon::green(attr(x, 'input')[['k_w_min']]), ", ",
       crayon::green(attr(x, 'input')[['k_w_max']]), "].\n",
    "  - Interval for the number of cluster of the ",
       crayon::green(attr(x, 'input')[['method']]),
       " algorithm: [", crayon::green(attr(x, 'input')[['k_w_min']]), ", ",
       crayon::green(attr(x, 'input')[['k2_max']]), "].\n"
  ))
  cat(paste0("  - Outliers ", crayon::green('are '),
    if(!attr(x, 'input')[['out']]) crayon::red('NOT '), "considered.",
    "\n\n"
  ))
  cat(paste0("Number of clusters found: ",
    crayon::blue(attr(x, 'optimal_cluster')), ".\n",
    "Leading to an avarage silhouette width of: ",
     crayon::blue(round(attr(x, 'silhouette'), 4)), ".\n",
    "\n"
  ))
  cat(paste0("A total of ",
    crayon::blue(attr(x, 'n_clustered')), " elements clustered out of ",
    crayon::blue(attr(x, 'n_total')), " elements considered."
  ))
}
CorradoLanera/CrossClustering documentation built on May 12, 2019, 4:33 a.m.