Nothing
#' Extract Sample Metadata and Classify into Grade or Stage Groups
#'
#' Reads a CSV file containing sample metadata and classifies each sample into
#' grade or stage groups based on a specified scoring column. Supports
#' built-in presets for seven major disease types, fully custom user-defined
#' thresholds, or automatic classification using a normalised Risk Score
#' derived from the data itself.
#'
#' For prostate cancer, an optional \code{pattern_col} parameter allows
#' accurate distinction between Grade Group 2 (Gleason 3+4=7) and Grade
#' Group 3 (Gleason 4+3=7) using the primary histological pattern column.
#'
#' @param file_path Character. Path to the input CSV file containing sample
#' metadata.
#' @param column_name Character. Name of the column containing the grading or
#' staging score (e.g., Gleason score, Nottingham score, TNM stage).
#' @param disease_type Character. Disease type for built-in preset thresholds.
#' Supported values: \code{"prostate"}, \code{"breast"}, \code{"colorectal"},
#' \code{"lung"}, \code{"cervical"}, \code{"lymphoma"}, \code{"melanoma"}.
#' Use \code{"custom"} to supply your own thresholds via the
#' \code{thresholds} argument. Use \code{"auto"} to automatically classify
#' using a normalised Risk Score derived from the data. Default is
#' \code{"auto"}.
#' @param pattern_col Character or NULL. Only used when
#' \code{disease_type = "prostate"}. Name of the column containing the
#' primary Gleason pattern (pattern1). When provided, Grade Group 2
#' (Gleason 3+4=7, primary pattern 3) and Grade Group 3 (Gleason 4+3=7,
#' primary pattern 4) are distinguished accurately. If NULL (default),
#' all Gleason 7 samples are assigned to Grade Group 2.
#' @param n_groups Integer. Number of grade or stage groups to create. Only
#' used when \code{disease_type = "auto"}. Must be between 2 and 5.
#' Default is \code{3}.
#' @param group_type Character. Type of group labels to generate. Only used
#' when \code{disease_type = "auto"}. One of \code{"grade"},
#' \code{"stage"}, or \code{"risk"}. Default is \code{"grade"}.
#' \describe{
#' \item{grade}{Labels as Grade 1, Grade 2, ...}
#' \item{stage}{Labels as Stage I, Stage II, ...}
#' \item{risk}{Labels as low_risk, intermediate_risk, high_risk, ...}
#' }
#' @param score_min Numeric or NULL. Minimum possible value of the score.
#' If NULL (default), automatically detected from the data.
#' @param score_max Numeric or NULL. Maximum possible value of the score.
#' If NULL (default), automatically detected from the data.
#' @param thresholds Named list of functions. Required only when
#' \code{disease_type = "custom"}. Each element must be a function that
#' takes a numeric or character vector and returns a logical vector. The
#' name of each element becomes the grade or stage group label.
#' @param output_dir Character or NULL. Directory to save the output CSV file.
#' If NULL (default), output is saved in the same directory as the input
#' file.
#'
#' @return A named list where each element corresponds to a grade or stage
#' group and contains the sample IDs belonging to that group.
#'
#' @details
#' \strong{Prostate cancer Grade Group 2 vs Grade Group 3 distinction:}
#'
#' Both Grade Group 2 (Gleason 3+4=7) and Grade Group 3 (Gleason 4+3=7)
#' have the same total Gleason score of 7, making them indistinguishable
#' from the total score alone. The primary histological pattern determines
#' the correct assignment:
#' \itemize{
#' \item Primary pattern 3 + secondary pattern 4 → Grade Group 2
#' \item Primary pattern 4 + secondary pattern 3 → Grade Group 3
#' }
#' Supply the name of the primary pattern column via \code{pattern_col}
#' (typically \code{"pattern1"}) to enable this distinction. If
#' \code{pattern_col} is not supplied, all Gleason 7 samples are assigned
#' to Grade Group 2 and a message is shown.
#'
#' \strong{Auto mode:}
#'
#' When \code{disease_type = "auto"}, the function computes a normalised
#' Risk Score for each sample using min-max normalisation:
#'
#' \deqn{Risk Score = \frac{score - min(score)}{max(score) - min(score)}}
#'
#' The Risk Score ranges from 0 (lowest risk) to 1 (highest risk). Group
#' boundaries are determined automatically based on distribution skewness:
#' \itemize{
#' \item Symmetric distribution (skewness between -0.5 and +0.5):
#' equal-width boundaries
#' \item Skewed distribution (skewness outside -0.5 to +0.5):
#' quantile-based boundaries
#' }
#'
#' @references
#' \itemize{
#' \item Epstein JI, et al. (2016). The 2014 ISUP Consensus Conference on
#' Gleason Grading. \emph{Am J Surg Pathol}, 40(2):244-252.
#' \item Elston CW & Ellis IO. (1991). Pathological prognostic factors in
#' breast cancer. \emph{Histopathology}, 19(5):403-410.
#' \item Dukes CE. (1932). The classification of cancer of the rectum.
#' \emph{J Pathol Bacteriol}, 35:323-332.
#' \item Goldstraw P, et al. (2016). The IASLC Lung Cancer Staging Project.
#' \emph{J Thorac Oncol}, 11(1):39-51.
#' \item Bhatla N, et al. (2019). Revised FIGO staging for carcinoma of
#' the cervix uteri. \emph{Int J Gynaecol Obstet}, 145(1):129-135.
#' \item Cheson BD, et al. (2014). The Lugano Classification.
#' \emph{J Clin Oncol}, 32(27):3059-3068.
#' \item Breslow A. (1970). Thickness and depth of invasion in the
#' prognosis of cutaneous melanoma. \emph{Ann Surg}, 172(5):902-908.
#' }
#'
#' @examples
#' sample_file <- system.file("extdata", "sample_data.csv",
#' package = "RiskyCNV")
#'
#' # Prostate preset — without pattern column (Grade Group 2 and 3 merged)
#' result <- extract_metadata(
#' file_path = sample_file,
#' column_name = "gleason_score",
#' disease_type = "prostate",
#' output_dir = tempdir()
#' )
#' print(names(result))
#'
#' # Prostate preset — with pattern column (Grade Group 2 and 3 distinguished)
#' result_full <- extract_metadata(
#' file_path = sample_file,
#' column_name = "gleason_score",
#' disease_type = "prostate",
#' pattern_col = "pattern1",
#' output_dir = tempdir()
#' )
#' print(names(result_full))
#'
#' # Auto mode
#' result_auto <- extract_metadata(
#' file_path = sample_file,
#' column_name = "gleason_score",
#' disease_type = "auto",
#' n_groups = 3,
#' group_type = "grade",
#' output_dir = tempdir()
#' )
#' print(names(result_auto))
#'
#' # Custom thresholds
#' result_custom <- extract_metadata(
#' file_path = sample_file,
#' column_name = "gleason_score",
#' disease_type = "custom",
#' thresholds = list(
#' "Stage I" = function(x) x <= 6,
#' "Stage II" = function(x) x == 7,
#' "Stage III" = function(x) x == 8,
#' "Stage IV" = function(x) x > 8
#' ),
#' output_dir = tempdir()
#' )
#' print(names(result_custom))
#'
#' @export
extract_metadata <- function(file_path,
column_name,
disease_type = "auto",
pattern_col = NULL,
n_groups = 3,
group_type = "grade",
score_min = NULL,
score_max = NULL,
thresholds = NULL,
output_dir = NULL) {
# --- Built-in presets ----------------------------------------------------
presets <- list(
prostate = list(
"Grade Group 1" = function(x) x <= 6,
"Grade Group 2" = function(x) x == 7,
"Grade Group 4" = function(x) x == 8,
"Grade Group 5" = function(x) x >= 9
),
breast = list(
"Grade 1" = function(x) x >= 3 & x <= 5,
"Grade 2" = function(x) x >= 6 & x <= 7,
"Grade 3" = function(x) x >= 8 & x <= 9
),
colorectal = list(
"Stage A" = function(x) x == "A",
"Stage B" = function(x) x == "B",
"Stage C" = function(x) x == "C",
"Stage D" = function(x) x == "D"
),
lung = list(
"Stage I" = function(x) x == 1,
"Stage II" = function(x) x == 2,
"Stage III" = function(x) x == 3,
"Stage IV" = function(x) x == 4
),
cervical = list(
"Stage I" = function(x) x == 1,
"Stage II" = function(x) x == 2,
"Stage III" = function(x) x == 3,
"Stage IV" = function(x) x == 4
),
lymphoma = list(
"Stage I" = function(x) x == 1,
"Stage II" = function(x) x == 2,
"Stage III" = function(x) x == 3,
"Stage IV" = function(x) x == 4
),
melanoma = list(
"Thin" = function(x) x <= 1.0,
"Intermediate" = function(x) x > 1.0 & x <= 4.0,
"Thick" = function(x) x > 4.0
)
)
# --- Auto group labels ---------------------------------------------------
.make_labels <- function(n, type) {
roman <- c("I", "II", "III", "IV", "V")
switch(type,
"stage" = paste("Stage", roman[seq_len(n)]),
"grade" = paste("Grade", seq_len(n)),
"risk" = switch(as.character(n),
"2" = c("low_risk", "high_risk"),
"3" = c("low_risk", "intermediate_risk", "high_risk"),
"4" = c("very_low_risk", "low_risk", "high_risk", "very_high_risk"),
"5" = c("very_low_risk", "low_risk", "intermediate_risk",
"high_risk", "very_high_risk"),
paste0("group_", seq_len(n))
),
paste0("Group ", seq_len(n))
)
}
# --- Read data -----------------------------------------------------------
df <- utils::read.csv(file_path)
grade_column <- "grade_group"
df[[grade_column]] <- NA_character_
disease_type <- tolower(disease_type)
# --- Resolve classification approach ------------------------------------
if (disease_type == "auto") {
scores <- as.numeric(df[[column_name]])
if (all(is.na(scores))) {
stop("Column '", column_name, "' could not be converted to numeric. ",
"For non-numeric scores use disease_type = 'custom' or a preset.")
}
s_min <- if (is.null(score_min)) min(scores, na.rm = TRUE) else score_min
s_max <- if (is.null(score_max)) max(scores, na.rm = TRUE) else score_max
if (s_min == s_max) {
stop("All values in '", column_name, "' are identical. ",
"Cannot compute Risk Score.")
}
risk_score <- (scores - s_min) / (s_max - s_min)
df$risk_score <- risk_score
n <- sum(!is.na(risk_score))
mean_r <- mean(risk_score, na.rm = TRUE)
sd_r <- stats::sd(risk_score, na.rm = TRUE)
skew <- if (sd_r == 0) 0 else
(sum((risk_score - mean_r)^3, na.rm = TRUE) / n) / (sd_r^3)
if (abs(skew) <= 0.5) {
method <- "equal-width"
breaks <- seq(0, 1, length.out = n_groups + 1)
breaks[1] <- -Inf
breaks[n_groups + 1] <- Inf
} else {
method <- "quantile"
probs <- seq(0, 1, length.out = n_groups + 1)
breaks <- stats::quantile(risk_score, probs = probs, na.rm = TRUE)
breaks[1] <- -Inf
breaks[n_groups + 1] <- Inf
}
message("Risk Score computed. Distribution skewness: ",
round(skew, 3), ". Using ", method, " splitting.")
labels <- .make_labels(n_groups, group_type)
df[[grade_column]] <- as.character(
cut(risk_score, breaks = breaks, labels = labels, include.lowest = TRUE)
)
} else if (disease_type == "custom") {
if (is.null(thresholds) || !is.list(thresholds)) {
stop("When disease_type = 'custom', you must supply a named list of ",
"functions via the 'thresholds' argument.")
}
for (group_name in names(thresholds)) {
matched <- tryCatch(
thresholds[[group_name]](df[[column_name]]),
error = function(e) stop("Error in threshold for '", group_name,
"': ", conditionMessage(e))
)
matched[is.na(matched)] <- FALSE
df[[grade_column]][matched] <- group_name
}
} else if (disease_type == "prostate") {
# --- Prostate: handle Grade Group 2 vs 3 distinction ------------------
active_thresholds <- presets[["prostate"]]
# First apply non-Gleason-7 thresholds
for (group_name in names(active_thresholds)) {
if (group_name == "Grade Group 2") next # handle separately below
matched <- tryCatch(
active_thresholds[[group_name]](df[[column_name]]),
error = function(e) stop("Error in threshold for '", group_name,
"': ", conditionMessage(e))
)
matched[is.na(matched)] <- FALSE
df[[grade_column]][matched] <- group_name
}
# Now handle Gleason 7 samples
gleason7 <- df[[column_name]] == 7 & !is.na(df[[column_name]])
if (!is.null(pattern_col)) {
# pattern_col provided — distinguish Grade Group 2 vs 3
if (!pattern_col %in% colnames(df)) {
stop("Column '", pattern_col, "' not found in the data. ",
"Please check the pattern_col argument.")
}
pattern <- df[[pattern_col]]
# Primary pattern 3 → Grade Group 2 (3+4=7)
gg2 <- gleason7 & pattern == 3 & !is.na(pattern)
# Primary pattern 4 → Grade Group 3 (4+3=7)
gg3 <- gleason7 & pattern == 4 & !is.na(pattern)
# Unknown primary pattern → Grade Group 2 with warning
gg_unknown <- gleason7 & !(pattern %in% c(3, 4)) & !is.na(pattern)
df[[grade_column]][gg2] <- "Grade Group 2"
df[[grade_column]][gg3] <- "Grade Group 3"
if (any(gg_unknown)) {
warning("Some Gleason 7 samples have an unrecognised primary pattern ",
"and have been assigned to Grade Group 2.")
df[[grade_column]][gg_unknown] <- "Grade Group 2"
}
message("Grade Group 2 (3+4=7): ", sum(gg2), " samples. ",
"Grade Group 3 (4+3=7): ", sum(gg3), " samples.")
} else {
# No pattern_col — assign all Gleason 7 to Grade Group 2
df[[grade_column]][gleason7] <- "Grade Group 2"
message("Note: 'pattern_col' not supplied. All Gleason 7 samples are ",
"assigned to Grade Group 2. To distinguish Grade Group 2 ",
"(3+4=7) from Grade Group 3 (4+3=7), supply the primary ",
"pattern column name via pattern_col (e.g., pattern_col = ",
"'pattern1').")
}
} else if (disease_type %in% names(presets)) {
active_thresholds <- presets[[disease_type]]
for (group_name in names(active_thresholds)) {
matched <- tryCatch(
active_thresholds[[group_name]](df[[column_name]]),
error = function(e) stop("Error in threshold for '", group_name,
"': ", conditionMessage(e))
)
matched[is.na(matched)] <- FALSE
df[[grade_column]][matched] <- group_name
}
} else {
stop("Unsupported disease_type: '", disease_type, "'. ",
"Choose one of: ",
paste(c(names(presets), "auto", "custom"), collapse = ", "), ".")
}
# --- Save output ---------------------------------------------------------
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
output_dir <- if (is.null(output_dir)) dirname(file_path) else output_dir
output_file_name <- sub("\\.csv$",
paste0("_with_grade_groups_", timestamp, ".csv"),
basename(file_path))
output_file_path <- file.path(output_dir, output_file_name)
utils::write.csv(df, file = output_file_path, row.names = FALSE)
message("Grade groups saved to: ", output_file_path)
# --- Return grouped sample IDs ------------------------------------------
groups <- unique(stats::na.omit(df[[grade_column]]))
grouped_samples <- lapply(groups, function(g)
df[!is.na(df[[grade_column]]) & df[[grade_column]] == g, "sample"])
names(grouped_samples) <- groups
return(grouped_samples)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.