Nothing
#' Classify Samples into Risk Categories
#'
#' Reads a CSV file containing sample metadata and assigns each sample to a
#' risk category based on a specified scoring column. Supports built-in
#' presets for seven major disease types, fully custom user-defined risk
#' boundaries, or automatic classification using a normalised Risk Score
#' derived from the data itself.
#'
#' @param file_path Character. Path to the input CSV file containing sample
#' metadata.
#' @param column_name Character. Name of the column containing the grading
#' or staging score (e.g., Gleason score, Nottingham score, TNM stage).
#' @param disease_type Character. Disease type for built-in preset risk
#' groupings. Supported values: \code{"prostate"}, \code{"breast"},
#' \code{"colorectal"}, \code{"lung"}, \code{"cervical"},
#' \code{"lymphoma"}, \code{"melanoma"}. Use \code{"custom"} to supply
#' your own groupings via the \code{risk_groups} argument. Use
#' \code{"auto"} to automatically classify using a normalised Risk Score
#' derived from the data. Default is \code{"auto"}.
#' @param n_groups Integer. Number of risk groups to create. Only used when
#' \code{disease_type = "auto"}. Must be between 2 and 5. Default is
#' \code{3}.
#' \describe{
#' \item{2 groups}{low_risk, high_risk}
#' \item{3 groups}{low_risk, intermediate_risk, high_risk}
#' \item{4 groups}{very_low_risk, low_risk, high_risk, very_high_risk}
#' \item{5 groups}{very_low_risk, low_risk, intermediate_risk,
#' high_risk, very_high_risk}
#' }
#' @param score_min Numeric or NULL. Minimum possible value of the score.
#' If NULL (default), automatically detected from the data.
#' @param score_max Numeric or NULL. Maximum possible value of the score.
#' If NULL (default), automatically detected from the data.
#' @param risk_groups Named list of functions. Required only when
#' \code{disease_type = "custom"}. Each element must be a function that
#' takes a numeric or character vector and returns a logical vector. The
#' name of each element becomes the risk group label.
#' @param output_dir Character or NULL. Directory to save the output CSV
#' file. If NULL (default), output is saved in the same directory as the
#' input file.
#'
#' @return A named list where each element corresponds to a risk group and
#' contains the sample IDs belonging to that group. The number of elements
#' matches the number of risk groups detected or specified.
#'
#' @details
#' When \code{disease_type = "auto"}, the function computes a normalised
#' Risk Score for each sample using min-max normalisation:
#'
#' \deqn{Risk Score = \frac{score - min(score)}{max(score) - min(score)}}
#'
#' The Risk Score ranges from 0 (lowest risk) to 1 (highest risk). Risk
#' group boundaries are then determined automatically:
#'
#' \itemize{
#' \item If the score distribution is approximately symmetric (skewness
#' between -0.5 and +0.5), equal-width boundaries are used, dividing
#' the 0-1 range into \code{n_groups} equal intervals.
#' \item If the score distribution is skewed (skewness outside -0.5 to
#' +0.5), quantile-based boundaries are used, ensuring approximately
#' equal numbers of samples per group.
#' }
#'
#' The splitting method chosen is reported via a message. Risk group labels
#' are generated automatically based on \code{n_groups}.
#'
#' Built-in presets use clinically validated risk stratification systems:
#' \describe{
#' \item{prostate}{D'Amico classification (D'Amico et al., 1998):
#' low_risk (<=6), intermediate_risk (7), high_risk (>=8).}
#' \item{breast}{Nottingham Prognostic Index (Galea et al., 1992):
#' low_risk (3-5), intermediate_risk (6-7), high_risk (8-9).}
#' \item{colorectal}{Dukes-based risk (Dukes, 1932):
#' low_risk (A), intermediate_risk (B/C), high_risk (D).}
#' \item{lung}{TNM stage-based (Goldstraw et al., 2016):
#' low_risk (I), intermediate_risk (II/III), high_risk (IV).}
#' \item{cervical}{FIGO stage-based (Bhatla et al., 2019):
#' low_risk (I), intermediate_risk (II/III), high_risk (IV).}
#' \item{lymphoma}{Ann Arbor/Lugano (Cheson et al., 2014):
#' limited (I/II), advanced (III/IV).}
#' \item{melanoma}{Breslow depth (Breslow, 1970):
#' low_risk (<=1.0mm), intermediate_risk (1.0-4.0mm),
#' high_risk (>4.0mm).}
#' }
#'
#' @references
#' \itemize{
#' \item D'Amico AV, et al. (1998). Biochemical outcome after radical
#' prostatectomy. \emph{JAMA}, 280(11):969-974.
#' \item Galea MH, et al. (1992). The Nottingham prognostic index.
#' \emph{Breast Cancer Res Treat}, 22(3):207-219.
#' \item Dukes CE. (1932). The classification of cancer of the rectum.
#' \emph{J Pathol Bacteriol}, 35:323-332.
#' \item Goldstraw P, et al. (2016). The IASLC Lung Cancer Staging
#' Project. \emph{J Thorac Oncol}, 11(1):39-51.
#' \item Bhatla N, et al. (2019). Revised FIGO staging for carcinoma of
#' the cervix uteri. \emph{Int J Gynaecol Obstet}, 145(1):129-135.
#' \item Cheson BD, et al. (2014). The Lugano Classification.
#' \emph{J Clin Oncol}, 32(27):3059-3068.
#' \item Breslow A. (1970). Thickness and depth of invasion in the
#' prognosis of cutaneous melanoma. \emph{Ann Surg}, 172(5):902-908.
#' }
#'
#' @examples
#' # Auto mode - let the function decide risk grouping (any disease)
#' sample_file <- system.file("extdata", "sample_data.csv",
#' package = "RiskyCNV")
#' result <- classify_risk(
#' file_path = sample_file,
#' column_name = "gleason_score",
#' disease_type = "auto",
#' n_groups = 3,
#' output_dir = tempdir()
#' )
#' print(names(result))
#'
#' # Prostate cancer preset
#' result_prostate <- classify_risk(
#' file_path = sample_file,
#' column_name = "gleason_score",
#' disease_type = "prostate",
#' output_dir = tempdir()
#' )
#' print(result_prostate$low_risk)
#'
#' # Custom risk groups for any disease
#' \dontrun{
#' result_custom <- classify_risk(
#' file_path = "samples.csv",
#' column_name = "risk_score",
#' disease_type = "custom",
#' risk_groups = list(
#' "low_risk" = function(x) x <= 5,
#' "high_risk" = function(x) x > 5
#' ),
#' output_dir = tempdir()
#' )
#' }
#'
#' @export
classify_risk <- function(file_path,
column_name,
disease_type = "auto",
n_groups = 3,
score_min = NULL,
score_max = NULL,
risk_groups = NULL,
output_dir = NULL) {
# --- Built-in presets ---------------------------------------------------
presets <- list(
prostate = list(
"low_risk" = function(x) x <= 6,
"intermediate_risk" = function(x) x == 7,
"high_risk" = function(x) x >= 8
),
breast = list(
"low_risk" = function(x) x >= 3 & x <= 5,
"intermediate_risk" = function(x) x >= 6 & x <= 7,
"high_risk" = function(x) x >= 8 & x <= 9
),
colorectal = list(
"low_risk" = function(x) x == "A",
"intermediate_risk" = function(x) x == "B" | x == "C",
"high_risk" = function(x) x == "D"
),
lung = list(
"low_risk" = function(x) x == 1,
"intermediate_risk" = function(x) x == 2 | x == 3,
"high_risk" = function(x) x == 4
),
cervical = list(
"low_risk" = function(x) x == 1,
"intermediate_risk" = function(x) x == 2 | x == 3,
"high_risk" = function(x) x == 4
),
lymphoma = list(
"limited" = function(x) x == 1 | x == 2,
"advanced" = function(x) x == 3 | x == 4
),
melanoma = list(
"low_risk" = function(x) x <= 1.0,
"intermediate_risk" = function(x) x > 1.0 & x <= 4.0,
"high_risk" = function(x) x > 4.0
)
)
# --- Auto risk labels ---------------------------------------------------
.make_risk_labels <- function(n) {
switch(as.character(n),
"2" = c("low_risk", "high_risk"),
"3" = c("low_risk", "intermediate_risk", "high_risk"),
"4" = c("very_low_risk", "low_risk", "high_risk", "very_high_risk"),
"5" = c("very_low_risk", "low_risk", "intermediate_risk",
"high_risk", "very_high_risk"),
paste0("risk_group_", seq_len(n))
)
}
# --- Read data ----------------------------------------------------------
df <- utils::read.csv(file_path)
risk_column <- "risk_category"
df[[risk_column]] <- NA_character_
disease_type <- tolower(disease_type)
# --- Resolve classification approach ------------------------------------
if (disease_type == "auto") {
scores <- as.numeric(df[[column_name]])
if (all(is.na(scores))) {
stop("Column '", column_name, "' could not be converted to numeric. ",
"For non-numeric scores, use disease_type = 'custom' or a preset.")
}
# Min-max normalisation
s_min <- if (is.null(score_min)) min(scores, na.rm = TRUE) else score_min
s_max <- if (is.null(score_max)) max(scores, na.rm = TRUE) else score_max
if (s_min == s_max) {
stop("All values in '", column_name, "' are identical. ",
"Cannot compute Risk Score.")
}
risk_score <- (scores - s_min) / (s_max - s_min)
df$risk_score <- risk_score
# Skewness check to decide splitting method
n <- sum(!is.na(risk_score))
mean_r <- mean(risk_score, na.rm = TRUE)
sd_r <- stats::sd(risk_score, na.rm = TRUE)
skew <- if (sd_r == 0) 0 else
(sum((risk_score - mean_r)^3, na.rm = TRUE) / n) / (sd_r^3)
if (abs(skew) <= 0.5) {
method <- "equal-width"
breaks <- seq(0, 1, length.out = n_groups + 1)
breaks[1] <- -Inf
breaks[n_groups + 1] <- Inf
} else {
method <- "quantile"
probs <- seq(0, 1, length.out = n_groups + 1)
breaks <- stats::quantile(risk_score, probs = probs, na.rm = TRUE)
breaks[1] <- -Inf
breaks[n_groups + 1] <- Inf
}
message("Risk Score computed. Distribution skewness: ",
round(skew, 3), ". Using ", method, " splitting.")
labels <- .make_risk_labels(n_groups)
df[[risk_column]] <- as.character(
cut(risk_score, breaks = breaks, labels = labels, include.lowest = TRUE)
)
} else if (disease_type == "custom") {
if (is.null(risk_groups) || !is.list(risk_groups)) {
stop("When disease_type = 'custom', you must supply a named list of ",
"functions via the 'risk_groups' argument.")
}
active_groups <- risk_groups
for (group_name in names(active_groups)) {
matched <- tryCatch(
active_groups[[group_name]](df[[column_name]]),
error = function(e) stop("Error in risk group '", group_name,
"': ", conditionMessage(e))
)
matched[is.na(matched)] <- FALSE
df[[risk_column]][matched] <- group_name
}
} else if (disease_type %in% names(presets)) {
active_groups <- presets[[disease_type]]
for (group_name in names(active_groups)) {
matched <- tryCatch(
active_groups[[group_name]](df[[column_name]]),
error = function(e) stop("Error in risk group '", group_name,
"': ", conditionMessage(e))
)
matched[is.na(matched)] <- FALSE
df[[risk_column]][matched] <- group_name
}
} else {
stop("Unsupported disease_type: '", disease_type, "'. ",
"Choose one of: ",
paste(c(names(presets), "auto", "custom"), collapse = ", "), ".")
}
# --- Save output --------------------------------------------------------
timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
output_dir <- if (is.null(output_dir)) dirname(file_path) else output_dir
output_file_name <- sub("\\.csv$",
paste0("_with_risk_categories_", timestamp, ".csv"),
basename(file_path))
output_file_path <- file.path(output_dir, output_file_name)
utils::write.csv(df, file = output_file_path, row.names = FALSE)
message("Risk categories saved to: ", output_file_path)
# --- Return grouped sample IDs -----------------------------------------
groups <- unique(stats::na.omit(df[[risk_column]]))
categorized_samples <- lapply(groups, function(g)
df[!is.na(df[[risk_column]]) & df[[risk_column]] == g, "sample"])
names(categorized_samples) <- groups
return(categorized_samples)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.