R/classify_risk.R
In RiskyCNV: Risk Analysis of Genomic Copy Number Variation

Documented in classify_risk

#' Classify Samples into Risk Categories
#'
#' Reads a CSV file containing sample metadata and assigns each sample to a
#' risk category based on a specified scoring column. Supports built-in
#' presets for seven major disease types, fully custom user-defined risk
#' boundaries, or automatic classification using a normalised Risk Score
#' derived from the data itself.
#'
#' @param file_path Character. Path to the input CSV file containing sample
#'   metadata.
#' @param column_name Character. Name of the column containing the grading
#'   or staging score (e.g., Gleason score, Nottingham score, TNM stage).
#' @param disease_type Character. Disease type for built-in preset risk
#'   groupings. Supported values: \code{"prostate"}, \code{"breast"},
#'   \code{"colorectal"}, \code{"lung"}, \code{"cervical"},
#'   \code{"lymphoma"}, \code{"melanoma"}. Use \code{"custom"} to supply
#'   your own groupings via the \code{risk_groups} argument. Use
#'   \code{"auto"} to automatically classify using a normalised Risk Score
#'   derived from the data. Default is \code{"auto"}.
#' @param n_groups Integer. Number of risk groups to create. Only used when
#'   \code{disease_type = "auto"}. Must be between 2 and 5. Default is
#'   \code{3}.
#'   \describe{
#'     \item{2 groups}{low_risk, high_risk}
#'     \item{3 groups}{low_risk, intermediate_risk, high_risk}
#'     \item{4 groups}{very_low_risk, low_risk, high_risk, very_high_risk}
#'     \item{5 groups}{very_low_risk, low_risk, intermediate_risk,
#'       high_risk, very_high_risk}
#'   }
#' @param score_min Numeric or NULL. Minimum possible value of the score.
#'   If NULL (default), automatically detected from the data.
#' @param score_max Numeric or NULL. Maximum possible value of the score.
#'   If NULL (default), automatically detected from the data.
#' @param risk_groups Named list of functions. Required only when
#'   \code{disease_type = "custom"}. Each element must be a function that
#'   takes a numeric or character vector and returns a logical vector. The
#'   name of each element becomes the risk group label.
#' @param output_dir Character or NULL. Directory to save the output CSV
#'   file. If NULL (default), output is saved in the same directory as the
#'   input file.
#'
#' @return A named list where each element corresponds to a risk group and
#'   contains the sample IDs belonging to that group. The number of elements
#'   matches the number of risk groups detected or specified.
#'
#' @details
#' When \code{disease_type = "auto"}, the function computes a normalised
#' Risk Score for each sample using min-max normalisation:
#'
#' \deqn{Risk Score = \frac{score - min(score)}{max(score) - min(score)}}
#'
#' The Risk Score ranges from 0 (lowest risk) to 1 (highest risk). Risk
#' group boundaries are then determined automatically:
#'
#' \itemize{
#'   \item If the score distribution is approximately symmetric (skewness
#'     between -0.5 and +0.5), equal-width boundaries are used, dividing
#'     the 0-1 range into \code{n_groups} equal intervals.
#'   \item If the score distribution is skewed (skewness outside -0.5 to
#'     +0.5), quantile-based boundaries are used, ensuring approximately
#'     equal numbers of samples per group.
#' }
#'
#' The splitting method chosen is reported via a message. Risk group labels
#' are generated automatically based on \code{n_groups}.
#'
#' Built-in presets use clinically validated risk stratification systems:
#' \describe{
#'   \item{prostate}{D'Amico classification (D'Amico et al., 1998):
#'     low_risk (<=6), intermediate_risk (7), high_risk (>=8).}
#'   \item{breast}{Nottingham Prognostic Index (Galea et al., 1992):
#'     low_risk (3-5), intermediate_risk (6-7), high_risk (8-9).}
#'   \item{colorectal}{Dukes-based risk (Dukes, 1932):
#'     low_risk (A), intermediate_risk (B/C), high_risk (D).}
#'   \item{lung}{TNM stage-based (Goldstraw et al., 2016):
#'     low_risk (I), intermediate_risk (II/III), high_risk (IV).}
#'   \item{cervical}{FIGO stage-based (Bhatla et al., 2019):
#'     low_risk (I), intermediate_risk (II/III), high_risk (IV).}
#'   \item{lymphoma}{Ann Arbor/Lugano (Cheson et al., 2014):
#'     limited (I/II), advanced (III/IV).}
#'   \item{melanoma}{Breslow depth (Breslow, 1970):
#'     low_risk (<=1.0mm), intermediate_risk (1.0-4.0mm),
#'     high_risk (>4.0mm).}
#' }
#'
#' @references
#' \itemize{
#'   \item D'Amico AV, et al. (1998). Biochemical outcome after radical
#'     prostatectomy. \emph{JAMA}, 280(11):969-974.
#'   \item Galea MH, et al. (1992). The Nottingham prognostic index.
#'     \emph{Breast Cancer Res Treat}, 22(3):207-219.
#'   \item Dukes CE. (1932). The classification of cancer of the rectum.
#'     \emph{J Pathol Bacteriol}, 35:323-332.
#'   \item Goldstraw P, et al. (2016). The IASLC Lung Cancer Staging
#'     Project. \emph{J Thorac Oncol}, 11(1):39-51.
#'   \item Bhatla N, et al. (2019). Revised FIGO staging for carcinoma of
#'     the cervix uteri. \emph{Int J Gynaecol Obstet}, 145(1):129-135.
#'   \item Cheson BD, et al. (2014). The Lugano Classification.
#'     \emph{J Clin Oncol}, 32(27):3059-3068.
#'   \item Breslow A. (1970). Thickness and depth of invasion in the
#'     prognosis of cutaneous melanoma. \emph{Ann Surg}, 172(5):902-908.
#' }
#'
#' @examples
#' # Auto mode - let the function decide risk grouping (any disease)
#' sample_file <- system.file("extdata", "sample_data.csv",
#'                             package = "RiskyCNV")
#' result <- classify_risk(
#'   file_path    = sample_file,
#'   column_name  = "gleason_score",
#'   disease_type = "auto",
#'   n_groups     = 3,
#'   output_dir   = tempdir()
#' )
#' print(names(result))
#'
#' # Prostate cancer preset
#' result_prostate <- classify_risk(
#'   file_path    = sample_file,
#'   column_name  = "gleason_score",
#'   disease_type = "prostate",
#'   output_dir   = tempdir()
#' )
#' print(result_prostate$low_risk)
#'
#' # Custom risk groups for any disease
#' \dontrun{
#' result_custom <- classify_risk(
#'   file_path    = "samples.csv",
#'   column_name  = "risk_score",
#'   disease_type = "custom",
#'   risk_groups  = list(
#'     "low_risk"  = function(x) x <= 5,
#'     "high_risk" = function(x) x > 5
#'   ),
#'   output_dir   = tempdir()
#' )
#' }
#'
#' @export
classify_risk <- function(file_path,
                          column_name,
                          disease_type = "auto",
                          n_groups     = 3,
                          score_min    = NULL,
                          score_max    = NULL,
                          risk_groups  = NULL,
                          output_dir   = NULL) {

  # --- Built-in presets ---------------------------------------------------
  presets <- list(

    prostate = list(
      "low_risk"          = function(x) x <= 6,
      "intermediate_risk" = function(x) x == 7,
      "high_risk"         = function(x) x >= 8
    ),

    breast = list(
      "low_risk"          = function(x) x >= 3 & x <= 5,
      "intermediate_risk" = function(x) x >= 6 & x <= 7,
      "high_risk"         = function(x) x >= 8 & x <= 9
    ),

    colorectal = list(
      "low_risk"          = function(x) x == "A",
      "intermediate_risk" = function(x) x == "B" | x == "C",
      "high_risk"         = function(x) x == "D"
    ),

    lung = list(
      "low_risk"          = function(x) x == 1,
      "intermediate_risk" = function(x) x == 2 | x == 3,
      "high_risk"         = function(x) x == 4
    ),

    cervical = list(
      "low_risk"          = function(x) x == 1,
      "intermediate_risk" = function(x) x == 2 | x == 3,
      "high_risk"         = function(x) x == 4
    ),

    lymphoma = list(
      "limited"  = function(x) x == 1 | x == 2,
      "advanced" = function(x) x == 3 | x == 4
    ),

    melanoma = list(
      "low_risk"          = function(x) x <= 1.0,
      "intermediate_risk" = function(x) x > 1.0 & x <= 4.0,
      "high_risk"         = function(x) x > 4.0
    )
  )

  # --- Auto risk labels ---------------------------------------------------
  .make_risk_labels <- function(n) {
    switch(as.character(n),
      "2" = c("low_risk", "high_risk"),
      "3" = c("low_risk", "intermediate_risk", "high_risk"),
      "4" = c("very_low_risk", "low_risk", "high_risk", "very_high_risk"),
      "5" = c("very_low_risk", "low_risk", "intermediate_risk",
               "high_risk", "very_high_risk"),
      paste0("risk_group_", seq_len(n))
    )
  }

  # --- Read data ----------------------------------------------------------
  df          <- utils::read.csv(file_path)
  risk_column <- "risk_category"
  df[[risk_column]] <- NA_character_

  disease_type <- tolower(disease_type)

  # --- Resolve classification approach ------------------------------------
  if (disease_type == "auto") {

    scores <- as.numeric(df[[column_name]])

    if (all(is.na(scores))) {
      stop("Column '", column_name, "' could not be converted to numeric. ",
           "For non-numeric scores, use disease_type = 'custom' or a preset.")
    }

    # Min-max normalisation
    s_min <- if (is.null(score_min)) min(scores, na.rm = TRUE) else score_min
    s_max <- if (is.null(score_max)) max(scores, na.rm = TRUE) else score_max

    if (s_min == s_max) {
      stop("All values in '", column_name, "' are identical. ",
           "Cannot compute Risk Score.")
    }

    risk_score    <- (scores - s_min) / (s_max - s_min)
    df$risk_score <- risk_score

    # Skewness check to decide splitting method
    n      <- sum(!is.na(risk_score))
    mean_r <- mean(risk_score, na.rm = TRUE)
    sd_r   <- stats::sd(risk_score, na.rm = TRUE)
    skew   <- if (sd_r == 0) 0 else
      (sum((risk_score - mean_r)^3, na.rm = TRUE) / n) / (sd_r^3)

    if (abs(skew) <= 0.5) {
      method <- "equal-width"
      breaks <- seq(0, 1, length.out = n_groups + 1)
      breaks[1]            <- -Inf
      breaks[n_groups + 1] <- Inf
    } else {
      method <- "quantile"
      probs  <- seq(0, 1, length.out = n_groups + 1)
      breaks <- stats::quantile(risk_score, probs = probs, na.rm = TRUE)
      breaks[1]            <- -Inf
      breaks[n_groups + 1] <- Inf
    }

    message("Risk Score computed. Distribution skewness: ",
            round(skew, 3), ". Using ", method, " splitting.")

    labels <- .make_risk_labels(n_groups)
    df[[risk_column]] <- as.character(
      cut(risk_score, breaks = breaks, labels = labels, include.lowest = TRUE)
    )

  } else if (disease_type == "custom") {

    if (is.null(risk_groups) || !is.list(risk_groups)) {
      stop("When disease_type = 'custom', you must supply a named list of ",
           "functions via the 'risk_groups' argument.")
    }
    active_groups <- risk_groups

    for (group_name in names(active_groups)) {
      matched <- tryCatch(
        active_groups[[group_name]](df[[column_name]]),
        error = function(e) stop("Error in risk group '", group_name,
                                  "': ", conditionMessage(e))
      )
      matched[is.na(matched)] <- FALSE
      df[[risk_column]][matched] <- group_name
    }

  } else if (disease_type %in% names(presets)) {

    active_groups <- presets[[disease_type]]

    for (group_name in names(active_groups)) {
      matched <- tryCatch(
        active_groups[[group_name]](df[[column_name]]),
        error = function(e) stop("Error in risk group '", group_name,
                                  "': ", conditionMessage(e))
      )
      matched[is.na(matched)] <- FALSE
      df[[risk_column]][matched] <- group_name
    }

  } else {
    stop("Unsupported disease_type: '", disease_type, "'. ",
         "Choose one of: ",
         paste(c(names(presets), "auto", "custom"), collapse = ", "), ".")
  }

  # --- Save output --------------------------------------------------------
  timestamp        <- format(Sys.time(), "%Y%m%d_%H%M%S")
  output_dir       <- if (is.null(output_dir)) dirname(file_path) else output_dir
  output_file_name <- sub("\\.csv$",
                          paste0("_with_risk_categories_", timestamp, ".csv"),
                          basename(file_path))
  output_file_path <- file.path(output_dir, output_file_name)

  utils::write.csv(df, file = output_file_path, row.names = FALSE)
  message("Risk categories saved to: ", output_file_path)

  # --- Return grouped sample IDs -----------------------------------------
  groups <- unique(stats::na.omit(df[[risk_column]]))
  categorized_samples        <- lapply(groups, function(g)
    df[!is.na(df[[risk_column]]) & df[[risk_column]] == g, "sample"])
  names(categorized_samples) <- groups

  return(categorized_samples)
}