cc_testr: Rigorously test cluster stability

#' Calculate PAC
#'
#' This function calculates the proportion of ambiguous clustering (PAC) for
#' each value of \emph{k} tested via consensus clustering.
#'
#' @param consensus_mats A list of consensus matrices, as created by a call to
#'   \code{\link{consensus}}, \code{\link{M3C}}, or \code{
#'   \link[ConsensusClusterPlus]{ConsensusClusterPlus}}.
#' @param pac_window Lower and upper bounds for the consensus index sub-interval
#'   over which to calculate the PAC. Must be on (0, 1).
#' @param plot Return plot of PAC scores by \emph{k}?
#'
#' @details
#' Consensus clustering is a method for testing the stability of cluster
#' membership under resampling (Monti et al., 2003). Senbabaoglu et al. (2014)
#' demonstrated that traditional methods for estimating optimal cluster number
#' fail when probes are not independent, which they rarely are in omic data. The
#' authors propose a new measure, the proportion of ambiguous clustering (PAC),
#' which represents the increase in the empirical CDF curve for each potential 
#' cluster number \emph{k} over a user-defined sub-interval of the consensus 
#' index generated by the consensus cluster algorithm. The default settings of 
#' \code{pac_window = c(0.1, 0.9)} are taken from the original PAC paper, and 
#' generally lead to stable results.
#'
#' @return A data frame with PAC scores for each value of \emph{k} in \code{
#' consensus_mats}. If \code{plot = TRUE}, then PAC scores by \emph{k} are
#' plotted.
#'
#' @references
#' Monti, S., Tamayo, P., Mesirov, J., & Golub, T. (2003).
#' \href{http://link.springer.com/article/10.1023/A:1023949509487}{Consensus
#' Clustering: A Resampling-Based Method for Class Discovery and Visualization
#' of Gene Expression Microarray Data}. \emph{Machine Learning}, \emph{52}:
#' 91-118.
#'
#' Senbabaoglu, Y., Michailidis, G. & Li, J.Z. (2014).
#' \href{http://www.nature.com/articles/srep06207}{Critical limitations of
#' consensus clustering in class discovery}. \emph{Scientific Reports}, \emph{
#' 4}:6207.
#'
#' @examples
#' mat <- matrix(rnorm(1000 * 12), nrow = 1000, ncol = 12)
#' cc <- consensus(mat)
#' pac <- PAC(cc, plot = TRUE)
#'
#' @export
#' @importFrom purrr keep
#' @import dplyr
#' @import ggplot2
#'

PAC <- function(consensus_mats,
                pac_window = c(0.1, 0.9),
                plot = FALSE) {

  # Preliminaries
  max_k <- length(consensus_mats)
  if (!is.list(consensus_mats)) {
    stop('consensus_mats must be a list object containing consensus matrices.')
  }
  if (any(names(consensus_mats) == 'consensusMatrix')) {
    consensus_mats <- lapply(seq_len(max_k), function(k) {
      if (k == 1L) {
        return(NULL)
      } else {
        return(consensus_mats[[k]]$consensusMatrix)
      }
    })
  }
  if (length(pac_window) != 2) {
    stop('pac_window must be a vector of length 2.')
  }
  if (min(pac_window) <= 0 || max(pac_window) >= 1) {
    stop('Both values of pac_window must be on (0, 1).')
  }
  
  # Calculate PAC
  suppressWarnings(
    pacs <- expand.grid(k = 2:max_k,
                        Idx = pac_window) %>%
      rowwise(.) %>%
      mutate(CDF = ecdf(consensus_mats[[k]] %>% keep(lower.tri(.)))(Idx),
             k = as.factor(k)) %>%
      group_by(k) %>%
      mutate(PAC = diff(CDF)) %>%
      distinct(k, PAC) %>%
      ungroup(.) %>%
      mutate(k = as.integer(k))
  )
  
  # Plot output
  if (plot) {
    pacs <- pacs %>% 
      mutate(Optimal = PAC == min(PAC))
    p <- ggplot(pacs, aes(k, PAC)) +
      geom_point(aes(color = Optimal), size = 3) +
      geom_line() +
      scale_x_continuous(breaks = seq(0, length(consensus_mats), 1)) +
      scale_color_manual(values = c('black', 'red')) + 
      guides(color = guide_legend(reverse = TRUE)) +
      labs(x = expression('Cluster Number'~italic(k))) +
      theme_bw() +
      theme(plot.title = element_text(hjust = 0.5))
    print(p)
  }
  
  # Export
  return(pacs)

}