diceR: Diverse Cluster Ensemble in R

Documented in ev_confmat ev_nmi

#' External validity indices
#'
#' \strong{E}xternal \strong{v}alidity indices compare a predicted clustering
#' result with a reference class or gold standard.
#'
#' `ev_nmi` calculates the normalized mutual information
#'
#' @param pred.lab predicted labels generated by classifier
#' @param ref.lab reference labels for the observations
#' @param method method of computing the entropy. Can be any one of "emp", "mm",
#'   "shrink", or "sg".
#'
#' @return `ev_nmi` returns the normalized mutual information.
#' @references Strehl A, Ghosh J. Cluster ensembles: a knowledge reuse framework
#'   for combining multiple partitions. J. Mach. Learn. Res. 2002;3:583-617.
#' @note `ev_nmi` is adapted from [infotheo::mutinformation()]
#' @author Johnson Liu, Derek Chiu
#' @name external_validity
#' @export
#'
#' @examples
#' set.seed(1)
#' E <- matrix(rep(sample(1:4, 1000, replace = TRUE)), nrow = 100, byrow =
#'               FALSE)
#' x <- sample(1:4, 100, replace = TRUE)
#' y <- sample(1:4, 100, replace = TRUE)
#' ev_nmi(x, y)
#' ev_confmat(x, y)
ev_nmi <- function(pred.lab, ref.lab, method = "emp") {
  U <- data.frame(ref.lab, pred.lab)
  Hyx <- infotheo::entropy(U, method)
  Hx <- infotheo::entropy(pred.lab, method)
  Hy <- infotheo::entropy(ref.lab, method)
  I <- ifelse(Hx + Hy - Hyx < 0, 0, Hx + Hy - Hyx)
  NMI <- I / sqrt(Hx * Hy)
  NMI
}

#' @details `ev_confmat` calculates a variety of statistics associated with
#'   confusion matrices. Accuracy, Cohen's kappa, and Matthews correlation
#'   coefficient have direct multiclass definitions, whereas all other
#'   metrics use macro-averaging.
#'
#' @return `ev_confmat` returns a tibble of the following summary statistics using [yardstick::summary.conf_mat()]:
#' * `accuracy`: Accuracy
#' * `kap`: Cohen's kappa
#' * `sens`: Sensitivity
#' * `spec`: Specificity
#' * `ppv`: Positive predictive value
#' * `npv`: Negative predictive value
#' * `mcc`: Matthews correlation coefficient
#' * `j_index`: Youden's J statistic
#' * `bal_accuracy`: Balanced accuracy
#' * `detection_prevalence`: Detection prevalence
#' * `precision`: alias for `ppv`
#' * `recall`: alias for `sens`
#' * `f_meas`: F Measure
#' @rdname external_validity
#' @export
ev_confmat <- function(pred.lab, ref.lab) {
  if (!all(unique(pred.lab) %in% unique(ref.lab))) {
    stop("Cluster labels should be the same in the predicted and reference
         classes.")
  }
  # Relabel predicted classes
  pred.relab <- relabel_class(pred.lab, ref.lab) %>%
    factor(levels = sort(unique(ref.lab)))

  # Confusion matrix and summary statistics
  CM <- table(pred.relab, ref.lab)
  summary(yardstick::conf_mat(CM))
}


#' Concordance counts for all pairs of points between two cluster partitions
#' @return A list with 4 elements
#' * yy: number of points belonging to the same cluster both in cl1 and cl2
#' * yn: number of points belonging to the same cluster both in cl1 but not in cl2
#' * ny: number of points belonging to the same cluster both in cl2 but not in cl1
#' * nn: number of points _not_ belonging to the same cluster both in cl1 and cl2
#' @noRd
concordance_counts <- function(cl1, cl2) {
  n <- length(cl1)
  yy <- yn <- ny <- nn <- 0
  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {
      if (cl1[[i]] == cl1[[j]]) {
        if (cl2[[i]] == cl2[[j]]) {
          yy <- yy + 1
        } else {
          yn <- yn + 1
        }
      } else {
        if (cl2[[i]] == cl2[[j]]) {
          ny <- ny + 1
        } else {
          nn <- nn + 1
        }
      }
    }
  }
  return(list(yy = yy, yn = yn, ny = ny, nn = nn))
}

#' Hubert index
#' @noRd
ev_hubert <- function(cl1, cl2) {
  cc <- concordance_counts(cl1, cl2)
  Nt <- Reduce(`+`, cc)
  yy <- cc[["yy"]]
  yn <- cc[["yn"]]
  ny <- cc[["ny"]]
  nn <- cc[["nn"]]
  (Nt * yy - (yy + yn) * (yy + ny)) /
    sqrt((yy + yn) * (yy + ny) * (nn + yn) * (nn + ny))
}

#' Jaccard index
#' @noRd
ev_jaccard <- function(cl1, cl2) {
  cc <- concordance_counts(cl1, cl2)
  cc[["yy"]] / c(cc[["yy"]] + cc[["yn"]] + cc[["ny"]])
}

#' McNemar index
#' @noRd
ev_mcnemar <- function(cl1, cl2) {
  cc <- concordance_counts(cl1, cl2)
  (cc[["yn"]] - cc[["ny"]]) / sqrt(cc[["yn"]] + cc[["ny"]])
}

#' Rand index
#' @noRd
ev_rand <- function(cl1, cl2) {
  cc <- concordance_counts(cl1, cl2)
  Nt <- Reduce(`+`, cc)
  (cc[["yy"]] + cc[["nn"]]) / Nt
}