R/extract_metadata.R
In RiskyCNV: Risk Analysis of Genomic Copy Number Variation

Documented in extract_metadata

#' Extract Sample Metadata and Classify into Grade or Stage Groups
#'
#' Reads a CSV file containing sample metadata and classifies each sample into
#' grade or stage groups based on a specified scoring column. Supports
#' built-in presets for seven major disease types, fully custom user-defined
#' thresholds, or automatic classification using a normalised Risk Score
#' derived from the data itself.
#'
#' For prostate cancer, an optional \code{pattern_col} parameter allows
#' accurate distinction between Grade Group 2 (Gleason 3+4=7) and Grade
#' Group 3 (Gleason 4+3=7) using the primary histological pattern column.
#'
#' @param file_path Character. Path to the input CSV file containing sample
#'   metadata.
#' @param column_name Character. Name of the column containing the grading or
#'   staging score (e.g., Gleason score, Nottingham score, TNM stage).
#' @param disease_type Character. Disease type for built-in preset thresholds.
#'   Supported values: \code{"prostate"}, \code{"breast"}, \code{"colorectal"},
#'   \code{"lung"}, \code{"cervical"}, \code{"lymphoma"}, \code{"melanoma"}.
#'   Use \code{"custom"} to supply your own thresholds via the
#'   \code{thresholds} argument. Use \code{"auto"} to automatically classify
#'   using a normalised Risk Score derived from the data. Default is
#'   \code{"auto"}.
#' @param pattern_col Character or NULL. Only used when
#'   \code{disease_type = "prostate"}. Name of the column containing the
#'   primary Gleason pattern (pattern1). When provided, Grade Group 2
#'   (Gleason 3+4=7, primary pattern 3) and Grade Group 3 (Gleason 4+3=7,
#'   primary pattern 4) are distinguished accurately. If NULL (default),
#'   all Gleason 7 samples are assigned to Grade Group 2.
#' @param n_groups Integer. Number of grade or stage groups to create. Only
#'   used when \code{disease_type = "auto"}. Must be between 2 and 5.
#'   Default is \code{3}.
#' @param group_type Character. Type of group labels to generate. Only used
#'   when \code{disease_type = "auto"}. One of \code{"grade"},
#'   \code{"stage"}, or \code{"risk"}. Default is \code{"grade"}.
#'   \describe{
#'     \item{grade}{Labels as Grade 1, Grade 2, ...}
#'     \item{stage}{Labels as Stage I, Stage II, ...}
#'     \item{risk}{Labels as low_risk, intermediate_risk, high_risk, ...}
#'   }
#' @param score_min Numeric or NULL. Minimum possible value of the score.
#'   If NULL (default), automatically detected from the data.
#' @param score_max Numeric or NULL. Maximum possible value of the score.
#'   If NULL (default), automatically detected from the data.
#' @param thresholds Named list of functions. Required only when
#'   \code{disease_type = "custom"}. Each element must be a function that
#'   takes a numeric or character vector and returns a logical vector. The
#'   name of each element becomes the grade or stage group label.
#' @param output_dir Character or NULL. Directory to save the output CSV file.
#'   If NULL (default), output is saved in the same directory as the input
#'   file.
#'
#' @return A named list where each element corresponds to a grade or stage
#'   group and contains the sample IDs belonging to that group.
#'
#' @details
#' \strong{Prostate cancer Grade Group 2 vs Grade Group 3 distinction:}
#'
#' Both Grade Group 2 (Gleason 3+4=7) and Grade Group 3 (Gleason 4+3=7)
#' have the same total Gleason score of 7, making them indistinguishable
#' from the total score alone. The primary histological pattern determines
#' the correct assignment:
#' \itemize{
#'   \item Primary pattern 3 + secondary pattern 4 → Grade Group 2
#'   \item Primary pattern 4 + secondary pattern 3 → Grade Group 3
#' }
#' Supply the name of the primary pattern column via \code{pattern_col}
#' (typically \code{"pattern1"}) to enable this distinction. If
#' \code{pattern_col} is not supplied, all Gleason 7 samples are assigned
#' to Grade Group 2 and a message is shown.
#'
#' \strong{Auto mode:}
#'
#' When \code{disease_type = "auto"}, the function computes a normalised
#' Risk Score for each sample using min-max normalisation:
#'
#' \deqn{Risk Score = \frac{score - min(score)}{max(score) - min(score)}}
#'
#' The Risk Score ranges from 0 (lowest risk) to 1 (highest risk). Group
#' boundaries are determined automatically based on distribution skewness:
#' \itemize{
#'   \item Symmetric distribution (skewness between -0.5 and +0.5):
#'     equal-width boundaries
#'   \item Skewed distribution (skewness outside -0.5 to +0.5):
#'     quantile-based boundaries
#' }
#'
#' @references
#' \itemize{
#'   \item Epstein JI, et al. (2016). The 2014 ISUP Consensus Conference on
#'     Gleason Grading. \emph{Am J Surg Pathol}, 40(2):244-252.
#'   \item Elston CW & Ellis IO. (1991). Pathological prognostic factors in
#'     breast cancer. \emph{Histopathology}, 19(5):403-410.
#'   \item Dukes CE. (1932). The classification of cancer of the rectum.
#'     \emph{J Pathol Bacteriol}, 35:323-332.
#'   \item Goldstraw P, et al. (2016). The IASLC Lung Cancer Staging Project.
#'     \emph{J Thorac Oncol}, 11(1):39-51.
#'   \item Bhatla N, et al. (2019). Revised FIGO staging for carcinoma of
#'     the cervix uteri. \emph{Int J Gynaecol Obstet}, 145(1):129-135.
#'   \item Cheson BD, et al. (2014). The Lugano Classification.
#'     \emph{J Clin Oncol}, 32(27):3059-3068.
#'   \item Breslow A. (1970). Thickness and depth of invasion in the
#'     prognosis of cutaneous melanoma. \emph{Ann Surg}, 172(5):902-908.
#' }
#'
#' @examples
#' sample_file <- system.file("extdata", "sample_data.csv",
#'                             package = "RiskyCNV")
#'
#' # Prostate preset — without pattern column (Grade Group 2 and 3 merged)
#' result <- extract_metadata(
#'   file_path    = sample_file,
#'   column_name  = "gleason_score",
#'   disease_type = "prostate",
#'   output_dir   = tempdir()
#' )
#' print(names(result))
#'
#' # Prostate preset — with pattern column (Grade Group 2 and 3 distinguished)
#' result_full <- extract_metadata(
#'   file_path    = sample_file,
#'   column_name  = "gleason_score",
#'   disease_type = "prostate",
#'   pattern_col  = "pattern1",
#'   output_dir   = tempdir()
#' )
#' print(names(result_full))
#'
#' # Auto mode
#' result_auto <- extract_metadata(
#'   file_path    = sample_file,
#'   column_name  = "gleason_score",
#'   disease_type = "auto",
#'   n_groups     = 3,
#'   group_type   = "grade",
#'   output_dir   = tempdir()
#' )
#' print(names(result_auto))
#'
#' # Custom thresholds
#' result_custom <- extract_metadata(
#'   file_path    = sample_file,
#'   column_name  = "gleason_score",
#'   disease_type = "custom",
#'   thresholds   = list(
#'     "Stage I"   = function(x) x <= 6,
#'     "Stage II"  = function(x) x == 7,
#'     "Stage III" = function(x) x == 8,
#'     "Stage IV"  = function(x) x > 8
#'   ),
#'   output_dir   = tempdir()
#' )
#' print(names(result_custom))
#'
#' @export
extract_metadata <- function(file_path,
                             column_name,
                             disease_type = "auto",
                             pattern_col  = NULL,
                             n_groups     = 3,
                             group_type   = "grade",
                             score_min    = NULL,
                             score_max    = NULL,
                             thresholds   = NULL,
                             output_dir   = NULL) {

  # --- Built-in presets ----------------------------------------------------
  presets <- list(

    prostate = list(
      "Grade Group 1" = function(x) x <= 6,
      "Grade Group 2" = function(x) x == 7,
      "Grade Group 4" = function(x) x == 8,
      "Grade Group 5" = function(x) x >= 9
    ),

    breast = list(
      "Grade 1" = function(x) x >= 3 & x <= 5,
      "Grade 2" = function(x) x >= 6 & x <= 7,
      "Grade 3" = function(x) x >= 8 & x <= 9
    ),

    colorectal = list(
      "Stage A" = function(x) x == "A",
      "Stage B" = function(x) x == "B",
      "Stage C" = function(x) x == "C",
      "Stage D" = function(x) x == "D"
    ),

    lung = list(
      "Stage I"   = function(x) x == 1,
      "Stage II"  = function(x) x == 2,
      "Stage III" = function(x) x == 3,
      "Stage IV"  = function(x) x == 4
    ),

    cervical = list(
      "Stage I"   = function(x) x == 1,
      "Stage II"  = function(x) x == 2,
      "Stage III" = function(x) x == 3,
      "Stage IV"  = function(x) x == 4
    ),

    lymphoma = list(
      "Stage I"   = function(x) x == 1,
      "Stage II"  = function(x) x == 2,
      "Stage III" = function(x) x == 3,
      "Stage IV"  = function(x) x == 4
    ),

    melanoma = list(
      "Thin"         = function(x) x <= 1.0,
      "Intermediate" = function(x) x > 1.0 & x <= 4.0,
      "Thick"        = function(x) x > 4.0
    )
  )

  # --- Auto group labels ---------------------------------------------------
  .make_labels <- function(n, type) {
    roman <- c("I", "II", "III", "IV", "V")
    switch(type,
           "stage" = paste("Stage", roman[seq_len(n)]),
           "grade" = paste("Grade", seq_len(n)),
           "risk"  = switch(as.character(n),
                            "2" = c("low_risk", "high_risk"),
                            "3" = c("low_risk", "intermediate_risk", "high_risk"),
                            "4" = c("very_low_risk", "low_risk", "high_risk", "very_high_risk"),
                            "5" = c("very_low_risk", "low_risk", "intermediate_risk",
                                    "high_risk", "very_high_risk"),
                            paste0("group_", seq_len(n))
           ),
           paste0("Group ", seq_len(n))
    )
  }

  # --- Read data -----------------------------------------------------------
  df           <- utils::read.csv(file_path)
  grade_column <- "grade_group"
  df[[grade_column]] <- NA_character_

  disease_type <- tolower(disease_type)

  # --- Resolve classification approach ------------------------------------
  if (disease_type == "auto") {

    scores <- as.numeric(df[[column_name]])

    if (all(is.na(scores))) {
      stop("Column '", column_name, "' could not be converted to numeric. ",
           "For non-numeric scores use disease_type = 'custom' or a preset.")
    }

    s_min <- if (is.null(score_min)) min(scores, na.rm = TRUE) else score_min
    s_max <- if (is.null(score_max)) max(scores, na.rm = TRUE) else score_max

    if (s_min == s_max) {
      stop("All values in '", column_name, "' are identical. ",
           "Cannot compute Risk Score.")
    }

    risk_score    <- (scores - s_min) / (s_max - s_min)
    df$risk_score <- risk_score

    n      <- sum(!is.na(risk_score))
    mean_r <- mean(risk_score, na.rm = TRUE)
    sd_r   <- stats::sd(risk_score, na.rm = TRUE)
    skew   <- if (sd_r == 0) 0 else
      (sum((risk_score - mean_r)^3, na.rm = TRUE) / n) / (sd_r^3)

    if (abs(skew) <= 0.5) {
      method <- "equal-width"
      breaks <- seq(0, 1, length.out = n_groups + 1)
      breaks[1]            <- -Inf
      breaks[n_groups + 1] <- Inf
    } else {
      method <- "quantile"
      probs  <- seq(0, 1, length.out = n_groups + 1)
      breaks <- stats::quantile(risk_score, probs = probs, na.rm = TRUE)
      breaks[1]            <- -Inf
      breaks[n_groups + 1] <- Inf
    }

    message("Risk Score computed. Distribution skewness: ",
            round(skew, 3), ". Using ", method, " splitting.")

    labels <- .make_labels(n_groups, group_type)
    df[[grade_column]] <- as.character(
      cut(risk_score, breaks = breaks, labels = labels, include.lowest = TRUE)
    )

  } else if (disease_type == "custom") {

    if (is.null(thresholds) || !is.list(thresholds)) {
      stop("When disease_type = 'custom', you must supply a named list of ",
           "functions via the 'thresholds' argument.")
    }

    for (group_name in names(thresholds)) {
      matched <- tryCatch(
        thresholds[[group_name]](df[[column_name]]),
        error = function(e) stop("Error in threshold for '", group_name,
                                 "': ", conditionMessage(e))
      )
      matched[is.na(matched)] <- FALSE
      df[[grade_column]][matched] <- group_name
    }

  } else if (disease_type == "prostate") {

    # --- Prostate: handle Grade Group 2 vs 3 distinction ------------------
    active_thresholds <- presets[["prostate"]]

    # First apply non-Gleason-7 thresholds
    for (group_name in names(active_thresholds)) {
      if (group_name == "Grade Group 2") next  # handle separately below
      matched <- tryCatch(
        active_thresholds[[group_name]](df[[column_name]]),
        error = function(e) stop("Error in threshold for '", group_name,
                                 "': ", conditionMessage(e))
      )
      matched[is.na(matched)] <- FALSE
      df[[grade_column]][matched] <- group_name
    }

    # Now handle Gleason 7 samples
    gleason7 <- df[[column_name]] == 7 & !is.na(df[[column_name]])

    if (!is.null(pattern_col)) {
      # pattern_col provided — distinguish Grade Group 2 vs 3
      if (!pattern_col %in% colnames(df)) {
        stop("Column '", pattern_col, "' not found in the data. ",
             "Please check the pattern_col argument.")
      }
      pattern <- df[[pattern_col]]

      # Primary pattern 3 → Grade Group 2 (3+4=7)
      gg2 <- gleason7 & pattern == 3 & !is.na(pattern)
      # Primary pattern 4 → Grade Group 3 (4+3=7)
      gg3 <- gleason7 & pattern == 4 & !is.na(pattern)
      # Unknown primary pattern → Grade Group 2 with warning
      gg_unknown <- gleason7 & !(pattern %in% c(3, 4)) & !is.na(pattern)

      df[[grade_column]][gg2] <- "Grade Group 2"
      df[[grade_column]][gg3] <- "Grade Group 3"

      if (any(gg_unknown)) {
        warning("Some Gleason 7 samples have an unrecognised primary pattern ",
                "and have been assigned to Grade Group 2.")
        df[[grade_column]][gg_unknown] <- "Grade Group 2"
      }

      message("Grade Group 2 (3+4=7): ", sum(gg2), " samples. ",
              "Grade Group 3 (4+3=7): ", sum(gg3), " samples.")

    } else {
      # No pattern_col — assign all Gleason 7 to Grade Group 2
      df[[grade_column]][gleason7] <- "Grade Group 2"
      message("Note: 'pattern_col' not supplied. All Gleason 7 samples are ",
              "assigned to Grade Group 2. To distinguish Grade Group 2 ",
              "(3+4=7) from Grade Group 3 (4+3=7), supply the primary ",
              "pattern column name via pattern_col (e.g., pattern_col = ",
              "'pattern1').")
    }

  } else if (disease_type %in% names(presets)) {

    active_thresholds <- presets[[disease_type]]

    for (group_name in names(active_thresholds)) {
      matched <- tryCatch(
        active_thresholds[[group_name]](df[[column_name]]),
        error = function(e) stop("Error in threshold for '", group_name,
                                 "': ", conditionMessage(e))
      )
      matched[is.na(matched)] <- FALSE
      df[[grade_column]][matched] <- group_name
    }

  } else {
    stop("Unsupported disease_type: '", disease_type, "'. ",
         "Choose one of: ",
         paste(c(names(presets), "auto", "custom"), collapse = ", "), ".")
  }

  # --- Save output ---------------------------------------------------------
  timestamp        <- format(Sys.time(), "%Y%m%d_%H%M%S")
  output_dir       <- if (is.null(output_dir)) dirname(file_path) else output_dir
  output_file_name <- sub("\\.csv$",
                          paste0("_with_grade_groups_", timestamp, ".csv"),
                          basename(file_path))
  output_file_path <- file.path(output_dir, output_file_name)

  utils::write.csv(df, file = output_file_path, row.names = FALSE)
  message("Grade groups saved to: ", output_file_path)

  # --- Return grouped sample IDs ------------------------------------------
  groups <- unique(stats::na.omit(df[[grade_column]]))
  grouped_samples        <- lapply(groups, function(g)
    df[!is.na(df[[grade_column]]) & df[[grade_column]] == g, "sample"])
  names(grouped_samples) <- groups

  return(grouped_samples)
}