R/roc_analysis.R

Defines functions roc_analysis.matrix roc_analysis.data.frame roc_analysis.default roc_analysis

Documented in roc_analysis roc_analysis.data.frame roc_analysis.default roc_analysis.matrix

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To Do:
#   [!!!] Remove dependency on mlr in parameters sections.
#
#   [!!!] DESCRIPTION MUST BE UPDATED
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#' Carry out the ROC analysis
#'
#' Do the ROC (receiver operating characteristic) analysis and calculate
#' vector of cut-off values and associated number of
#' true positives (TP),
#' false negatives (FN),
#' false positives (FP), and
#' true negatives (TN) as well as performance measures such as sensitivity,
#' specificity, etc.
#'
#' @name roc_analysis
#' @param x (\code{numeric}) \cr A numeric vector.\cr
#'          (in \code{print} function) An object to print.
#' @param ... [!!!] Passed to further methods.
#'
#' @param gr (\code{factor}) \cr A factor vector with two levels.
#' @param pos_label (\code{character(1)}) \cr A string with the name of
#'                  positive group.
#' @param pos_is_larger (\code{NULL}|\code{TRUE}|\code{FALSE}) \cr
#'        A flag indicating, if values of positive group are on avedage
#'        are expected to be larger than values of negative group.
#'        If \code{NULL}, this option is determined basing on data using
#'        group medians.
#' @param optimize_by (\code{string(1)}) \cr [!!!] Method to determine the
#'                    optimal cut-off value.
#'                    Current options: \code{"bac"},
#'                                    \code{"youden"},
#'                                    \code{"kappa"}.
#'
#' @param results (\code{character(1)}) \cr A string indicating which results
#'                should be returned: either \code{"all"} (as described in
#'                section "Values") or just \code{"optimal"}.
#'
#'
#' @return A list (which also inherits from class \code{"roc_result_list"})
#' with three fields: \code{$info}, \code{$optimal}, \code{$all_results}.
#' \itemize{
#'
#' \item \bold{\code{$info}} is a data frame with columns
#'      \code{var_name} - empty string reserved for variable name,
#'      \code{neg_label}, \code{pos_label} labels of negative and positive
#'                              groups respectively,
#'      \code{n_neg}, \code{n_pos}, \code{n_total} - number of negative
#' and positive cases as well as number of cases in total.
#'
#' \item \bold{\code{$optimal}} one row from \code{$all_results}, which was
#' determined as having optima threshold (cut-off) value. Sometimes it can be
#' several rows, if the performance is equally good. \cr\cr
#'
#' \item \bold{\code{$all_results}} is a data frame with columns
#'       \code{cutoffs} for cutoff values,
#'       \code{tp} (number of true positives),
#'       \code{fn} (number of false negatives),
#'       \code{fp} (number of false positives),
#'       \code{tn} (number of true negatives),
#'       ... [!!!]
#'
#' }\cr\cr
#'
#'
#' @export
#'
#' @note
#' This function is inspired by functions \code{predict} and
#' \code{.compute.unnormalized.roc.curve} from \pkg{ROCR} package.
#'
#' @author Vilmantas Gegzna
#' @family functions for ROC
#'
#' @examples
#' library(manyROC)
#' library(ggplot2)
#'
#' # Make some data
#'
#' set.seed(1)
#' (x <- rnorm(10))
#'
#' (gr <- gl(n = 2, k = 5, length = 10, labels = c("H", "S")))
#'
#'
#' # Explore the functions
#'
#' roc_analysis(x, gr)
#'
#' roc_analysis(x, gr, pos_label = "H")
#'
#'
#' # --- Example 2 ---
#'
#' set.seed(1)
#' x2 <- c(rnorm(50, mean = 14), rnorm(50, mean = 20))
#' gr2 <- gl(2, 50, labels = c("Neg", "Pos"))
#'
#' (roc_rez <- roc_analysis(x2, gr2))
#'
#' optimal_cutoff2 <- roc_rez$optimal[1]
#'
#' qplot(x2, fill = gr2, color = gr2,
#'   geom = c("density", "rug"), alpha = I(0.3)) +
#'   geom_vline(xintercept = optimal_cutoff2)
#'
#'
#' # --- Example 3 ---
#'
#' set.seed(1)
#' x3 <- c(rnorm(100, mean = 11), rnorm(100, mean = 14))
#' gr3 <- gl(2, 100, labels = c("Neg", "Pos"))
#'
#' (roc_rez3 <- roc_analysis(x3, gr3))
#'
#' optimal_cutoff3 <- roc_rez3$optimal[1]
#'
#' qplot(x3, fill = gr3, color = gr3,
#'   geom = c("density", "rug"), alpha = I(0.3)) +
#'   geom_vline(xintercept = roc_rez3$optimal[1])
roc_analysis <- function(x,
                         gr,
                         pos_label  = levels(gr)[2],
                         pos_is_larger = NULL,
                         optimize_by = "bac",
                         results = "all", ...) {
  UseMethod("roc_analysis")
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @rdname roc_analysis
#' @export
roc_analysis.default <- function(x,
                                 gr,
                                 pos_label  = levels(gr)[2],
                                 pos_is_larger = NULL,
                                 optimize_by = "bac",
                                 results = "all", ...) {

  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Check the input

  # [!!!] Inputs `optimize_by = NULL` and `results = "optimize"` are
  # incompatible. An assertion and warning are needed

  assert_numeric(x)
  assert_vector(x, strict = TRUE)

  assert_factor(gr, n.levels = 2)

  if (length(x) != length(gr)) {
    warning("Number of cases in `x` and `gr` must agree.")
  }
  assert_set_equal(length(x), length(gr))

  assert_string(pos_label)
  assert_choice(pos_label, levels(gr))

  assert_flag(pos_is_larger, null.ok = TRUE)

  assert_string(optimize_by, null.ok = TRUE)
  assert_choice(optimize_by, c("bac", "youden", "kappa"), null.ok = TRUE)

  assert_choice(results, c("all", "optimal"))

  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Capture data to return later
  # data <- list(x = x, gr = gr)

  # Remove missing and infinite values
  finite_bool <- is.finite(x) & is.finite(gr)
  x  <-  x[finite_bool]
  gr <- gr[finite_bool]
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ## Reorder levels to get predictabe results:
  ##  first -  label of negative group,
  ##  second - label of positive group.
  neg_label <- setdiff(levels(gr), pos_label)
  levels <- c(neg_label, pos_label)
  gr <- ordered(gr, levels = levels)
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Automatically determine, which group has  smaller `x` values on average
  medians <- tapply(x, gr, median)
  dimnames(medians) <- NULL

  if (is.null(pos_is_larger)) {
    decreasing <- medians[1] < medians[2]  # median__neg_gr < median__pos_gr
  } else {
    decreasing <- pos_is_larger
  }

  # Determine, which group is above threshond, which belod.
  # Checks if positive group is prone to have larger values
  if (decreasing) {
    below <- neg_label
    above <- pos_label
  } else {
    below <- pos_label
    above <- neg_label
  }

  # ==========================================================================
  # # Do the ROC analysis
  x_order <- order(x, decreasing = decreasing)
  x_sorted <- x[x_order]

  # [!!!] Why is `rev()` called at all? Is it necessary?
  dupls <- rev(duplicated(rev(x_sorted)))

  ##  --- Original algorithm to determine cutoffs ---
  ## cutofs are values of vector x:
  ##
  # SIGN <- if (decreasing == TRUE) 1 else -1
  # cutoffs <- c(SIGN * Inf, x_sorted[!dupls])

  ##  --- Improved algorithm to determine cutoffs ---
  ##  cutoffs are middle values between two adjacent x values

  x_s <- x_sorted[!dupls]
  n <- length(x_s)
  if (decreasing == TRUE) {
    # cutoffs<-c(max(x_s), (x_s[1:(n - 1)] + diff(x_s)/2), min(x_s))
    cutoffs <- c(+Inf,     (x_s[1:(n - 1)] + diff(x_s) / 2), -Inf)
  } else {
    # cutoffs<-c(min(x_s), (x_s[1:(n - 1)] - diff(x_s)/2), max(x_s))
    cutoffs <- c(-Inf,     (x_s[1:(n - 1)] - diff(x_s) / 2), +Inf)
  }

  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  n_neg <- sum(gr == neg_label)
  n_pos <- sum(gr == pos_label)

  # Following variables are vectors
  tp <- c(0, cumsum(gr[x_order] == pos_label)[!dupls])
  fp <- c(0, cumsum(gr[x_order] == neg_label)[!dupls])

  fn <- n_pos - tp
  tn <- n_neg - fp

  SE <- calculate_sensitivity(tp, fn)
  SP <- calculate_specificity(tn, fp)

  ppv <- calculate_ppv(tp, fp)
  npv <- calculate_npv(tn, fn)
  bac <- calculate_bac(SE, SP)
  youden <- calculate_youdens_j(SE, SP)

  # =========================================================================
  # The matrix of all numeric results

  all_results <- cbind(cutoff = cutoffs,
    tp = tp,
    fn = fn,
    fp = fp,
    tn = tn,
    sens = SE,
    spec = SP,
    ppv  = ppv,
    npv  = npv,
    bac  = bac,
    youden = youden
    # pos = tp + fp,
    # neg = tn + fn
  )
  # [!!!] class names must be reviewed, especially  "roc_results"
  all_results <- add_class_label(all_results, c("roc_results", "roc_df"))

  # =========================================================================
  # One row of results, which are considered to be optimal

  if (!is.null(optimize_by)) {
    max_ind <- switch(tolower(optimize_by),
      "bac"    = bac == max(bac),
      "youden" = youden == max(youden),
      # Calculations of kappa are relatively slow, thus if
      # not needed, it is not computed for the whole vector:
      "kappa"  = {
        kappa <-  calculate_kappa(tp, fn, fp, tn)
        kappa == max(kappa)
      }
    )

    # If several equally optimal values are present, the ties are resolved
    # by choosing the middle (rounded) index of available possibilities
    if (sum(max_ind) > 1)
      max_ind <- which(max_ind)[round(sum(max_ind) / 2)]

    # Next, the kappa and AUC values are calculated.
    optimal <- c(all_results[max_ind, ],
      kappa = calculate_kappa(tp[max_ind],
        fn[max_ind],
        fp[max_ind],
        tn[max_ind]),
      auc = calculate_auc(SE, SP),
      median_neg = medians[1],
      median_pos = medians[2]
    )
    optimal <- t(as.matrix(optimal))
    attr(optimal, "optimized_by") <- optimize_by
    optimal <- add_class_label(optimal, c("roc_opt_result", "roc_df"))

  } else {
    # If optimize_by = NULL
    optimal <- "*** Not calculated ***"
  }

  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Output if results = "optimal"
  if (results == "optimal")
    return(optimal)
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

  # ==========================================================================
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Information about the analysis
  info <- data.frame(
    var_name    = "",
    n_total     = n_neg + n_pos,
    n_neg       = n_neg,
    n_pos       = n_pos,
    neg_label   = neg_label,
    pos_label   = pos_label,
    median_neg  = medians[1],
    median_pos  = medians[2],
    below  = below,
    cutoff = optimal[1],
    above  = above,
    # pos_is_larger = decreasing,

    stringsAsFactors = FALSE
  )

  info <- add_class_label(info, c("roc_info", "roc_df"))

  # ==========================================================================
  # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  # Output

  # otherwise:
  res <- list(info = info,
    optimal = optimal,
    all_results = all_results
    # , data = data
  )

  add_class_label(res, c("roc_result_list"))

}
# =============================================================================
#' @rdname roc_analysis
#' @export
roc_analysis.data.frame <- function(x,
                                    gr,
                                    pos_label  = levels(gr)[2],
                                    pos_is_larger = NULL,
                                    optimize_by = "bac",
                                    results = "all", ...) {
  assert_data_frame(x, types = "numeric", ncols = 1)
  roc_analysis(as.vector(x),
    gr,
    pos_label = pos_label,
    pos_is_larger = pos_is_larger,
    optimize_by = optimize_by,
    results = results,
    ...)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @rdname roc_analysis
#' @export
roc_analysis.matrix <- function(x,
                                gr,
                                pos_label  = levels(gr)[2],
                                pos_is_larger = NULL,
                                optimize_by = "bac",
                                results = "all", ...) {
  assert_matrix(x, ncols = 1)
  roc_analysis(as.vector(x),
    gr,
    pos_label = pos_label,
    pos_is_larger = pos_is_larger,
    optimize_by = optimize_by,
    results = results,
    ...)
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
GegznaV/manyROC documentation built on Sept. 18, 2020, 10:28 p.m.