# R/recurrent.R
# In RiskyCNV: Risk Analysis of Genomic Copy Number Variation
#
# Documented in recurrent

#' Identify Recurrent Copy Number Variations by Risk Group
#'
#' Filters a CNV data file for samples belonging to a specified risk group
#' and identifies genomic regions (same chromosome, start and end) that
#' recur across at least \code{threshold} distinct samples. Results are
#' saved as a CSV file.
#'
#' @param x A named list of sample ID vectors, as returned by
#'   \code{\link{classify_risk}}. Each element name corresponds to a risk
#'   group label (e.g., \code{"low_risk"}, \code{"intermediate_risk"},
#'   \code{"high_risk"}).
#' @param risk_level Character. The risk group to analyse. Must be a name
#'   present in \code{x}.
#' @param cnv_data_file Character. Path to the CNV data file
#'   (whitespace-delimited, with a header). Must contain columns:
#'   \code{Sample}, \code{Chromosome}, \code{Start}, \code{End},
#'   \code{Num_Probes}, \code{Segment_Mean}.
#' @param threshold Numeric. Minimum number of distinct samples a CNV region
#'   must appear in to be considered recurrent. Default is \code{2}.
#'
#' @return Character. The file path of the saved CSV file containing the
#'   recurrent CNV regions for the specified risk group.
#'
#' @details
#' Sample IDs in the CNV file and in \code{x[[risk_level]]} are trimmed to
#' 12 characters and hyphens are replaced with dots before matching, so
#' both hyphenated and dotted TCGA-style identifiers are accepted.
#'
#' Because trimming can map several rows of the CNV file to the same sample
#' ID, each sample is counted at most once per region when comparing against
#' \code{threshold}. All rows belonging to a recurrent region are still
#' written to the output.
#'
#' The output CSV is saved inside a timestamped subdirectory under
#' \code{recurrent_cnv/} in the temporary directory. The timestamp has
#' one-second resolution, so two calls for the same risk group within the
#' same second write to the same file. This function is cancer-type
#' agnostic.
#'
#' An error is raised if \code{risk_level} is not a name in \code{x}, if a
#' required column is missing from the CNV file, or if no CNV rows match
#' the samples of the requested risk group.
#'
#' @examples
#' sample_file <- system.file("extdata", "sample_data.csv", package = "RiskyCNV")
#' cnv_file    <- system.file("extdata", "cnv_data.txt",    package = "RiskyCNV")
#' risk_result <- classify_risk(
#'   file_path    = sample_file,
#'   column_name  = "gleason_score",
#'   disease_type = "prostate",
#'   output_dir   = tempdir()
#' )
#' output_path <- recurrent(
#'   x             = risk_result,
#'   risk_level    = "low_risk",
#'   cnv_data_file = cnv_file,
#'   threshold     = 2
#' )
#' print(output_path)
#'
#' @export
recurrent <- function(x, risk_level, cnv_data_file, threshold = 2) {

  # Validate arguments up front so a typo in `risk_level` is reported as
  # such instead of surfacing later as "no matching samples".
  if (!is.character(risk_level) || length(risk_level) != 1L ||
      !risk_level %in% names(x)) {
    stop("`risk_level` must be a single name present in `x`; got: ",
         paste(risk_level, collapse = ", "),
         " (available: ", paste(names(x), collapse = ", "), ")",
         call. = FALSE)
  }
  if (!is.numeric(threshold) || length(threshold) != 1L ||
      is.na(threshold)) {
    stop("`threshold` must be a single non-missing number.", call. = FALSE)
  }

  output_cols <- c("Sample", "Chromosome", "Start", "End",
                   "Num_Probes", "Segment_Mean")

  cnv_data     <- utils::read.table(cnv_data_file, header = TRUE)
  missing_cols <- setdiff(output_cols, names(cnv_data))
  if (length(missing_cols) > 0) {
    stop("CNV data file is missing required column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  # Apply the same normalisation (12-character prefix, "-" -> ".") to both
  # sides of the match; otherwise hyphenated IDs in `x` can never match.
  normalise_id <- function(ids) {
    gsub("-", ".", substr(as.character(ids), 1, 12), fixed = TRUE)
  }
  cnv_data$Sample <- normalise_id(cnv_data$Sample)
  group_ids       <- normalise_id(x[[risk_level]])

  sample_cnv_data <- cnv_data[cnv_data$Sample %in% group_ids, ]

  if (nrow(sample_cnv_data) == 0) {
    stop("No matching samples found in CNV data for risk level: ", risk_level)
  }

  # A region is identified by its chromosome, start and end.
  region_key <- paste(sample_cnv_data$Chromosome,
                      sample_cnv_data$Start,
                      sample_cnv_data$End)

  # Count each sample at most once per region: after trimming, several rows
  # of the input can share one sample ID, and `threshold` is a number of
  # samples, not a number of rows.
  sample_region <- unique(data.frame(sample = sample_cnv_data$Sample,
                                     region = region_key,
                                     stringsAsFactors = FALSE))
  cnv_counts     <- table(sample_region$region)
  recurrent_cnvs <- names(cnv_counts)[cnv_counts >= threshold]

  recurrent_df <- sample_cnv_data[region_key %in% recurrent_cnvs, output_cols]

  timestamp  <- format(Sys.time(), "%Y%m%d_%H%M%S")
  output_dir <- file.path(tempdir(), "recurrent_cnv", timestamp)
  if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)

  output_file <- file.path(output_dir,
                           paste0("recurrent_cnvs_", risk_level, ".csv"))
  utils::write.csv(recurrent_df, file = output_file, row.names = FALSE)
  message("Recurrent CNVs for ", risk_level, " saved to: ", output_file)

  output_file
}