R/annotate.R
In RiskyCNV: Risk Analysis of Genomic Copy Number Variation

Documented in annotate

#' Annotate CNV Regions with Gene Symbols
#'
#' Finds the overlap between a gene annotation file and a recurrent CNV
#' file using genomic ranges, and annotates each CNV region with the
#' corresponding gene symbol. Requires the \pkg{GenomicRanges} package.
#'
#' @param genes_file Character. Path to the gene annotation CSV file.
#'   Must contain chromosome, start, end, and gene symbol columns
#'   (see parameters below for defaults).
#' @param risk_file Character. Path to the recurrent CNV CSV file (e.g.,
#'   the file path returned by \code{\link{recurrent}}). Must contain
#'   sample, chromosome, start, end, and segment mean columns.
#' @param output_dir Character. Directory where the annotated CSV will be
#'   saved. Default is the current directory (\code{"."}). The directory
#'   is created (recursively) if it does not exist yet.
#' @param seqnames_field_genes Character. Column name for chromosome in
#'   the gene file. Default is \code{"Chr"}.
#' @param start_field_genes Character. Column name for start position in
#'   the gene file. Default is \code{"Start"}.
#' @param end_field_genes Character. Column name for end position in the
#'   gene file. Default is \code{"End"}.
#' @param gene_symbol_field Character. Column name for gene symbols in
#'   the gene file. Default is \code{"GeneSymbol"}.
#' @param seqnames_field_risk Character. Column name for chromosome in
#'   the CNV file. Default is \code{"Chr"}.
#' @param start_field_risk Character. Column name for start position in
#'   the CNV file. Default is \code{"Start"}.
#' @param end_field_risk Character. Column name for end position in the
#'   CNV file. Default is \code{"End"}.
#' @param sample_field Character. Column name for sample IDs in the CNV
#'   file. Default is \code{"Sample"}.
#' @param segment_mean_field Character. Column name for segment mean
#'   values in the CNV file. Default is \code{"Segment_Mean"}.
#'
#' @return A data frame containing annotated CNV regions with columns:
#'   \code{Sample}, \code{GeneSymbol}, \code{Segment_Mean}, \code{Chr},
#'   \code{Start}, \code{End}, one row per gene/CNV-region pair, with
#'   sequential row names. The result is also written to a timestamped
#'   CSV file in \code{output_dir}.
#'
#' @details
#' This function uses \code{GenomicRanges::findOverlaps} with
#' \code{type = "within"} to find genes that fall entirely within each
#' CNV region. This function is cancer-type agnostic and can be applied
#' to CNV data from any solid tumour with a compatible gene annotation
#' reference file.
#'
#' CNV regions are sorted by chromosome and then by start position before
#' the overlap is computed. Both input files are read with
#' \code{utils::read.csv}, so column names are subject to its default
#' name checking.
#'
#' An error naming the absent column(s) is raised when the CNV file lacks
#' any of the requested columns or the gene file lacks the gene symbol
#' column. A warning is issued when the two files share no chromosome
#' names at all (for example \code{"chr1"} versus \code{"1"}), because no
#' gene can then be assigned to any CNV region.
#'
#' @examples
#' genes_file <- system.file("extdata", "gene_annotation.csv",
#'                            package = "RiskyCNV")
#' cnv_file   <- system.file("extdata", "annotated_cnv.csv",
#'                            package = "RiskyCNV")
#' annotated  <- annotate(
#'   genes_file = genes_file,
#'   risk_file  = cnv_file,
#'   output_dir = tempdir()
#' )
#' head(annotated)
#'
#' @importFrom GenomicRanges makeGRangesFromDataFrame findOverlaps
#' @importFrom S4Vectors subjectHits queryHits
#' @export
annotate <- function(genes_file,
                     risk_file,
                     output_dir          = ".",
                     seqnames_field_genes = "Chr",
                     start_field_genes    = "Start",
                     end_field_genes      = "End",
                     gene_symbol_field    = "GeneSymbol",
                     seqnames_field_risk  = "Chr",
                     start_field_risk     = "Start",
                     end_field_risk       = "End",
                     sample_field         = "Sample",
                     segment_mean_field   = "Segment_Mean") {

  # Stop with a message that names the absent columns, instead of the
  # generic "undefined columns selected" raised by `[.data.frame`.
  require_columns <- function(data, fields, label) {
    absent <- setdiff(fields, colnames(data))
    if (length(absent) > 0) {
      stop("The ", label, " file is missing required column(s): ",
           paste(absent, collapse = ", "), call. = FALSE)
    }
  }

  Genes  <- utils::read.csv(genes_file)
  RecCNV <- utils::read.csv(risk_file)

  risk_fields <- c(sample_field, seqnames_field_risk,
                   start_field_risk, end_field_risk, segment_mean_field)
  require_columns(RecCNV, risk_fields, "CNV")
  # Only the gene symbol column is indexed directly by this function; the
  # gene coordinate columns are resolved by makeGRangesFromDataFrame().
  require_columns(Genes, gene_symbol_field, "gene annotation")

  # Keep only the needed columns, sorted by chromosome then start, and
  # rename them to the fixed names used in the output.
  sCNV <- RecCNV[, risk_fields]
  sCNV <- sCNV[order(sCNV[[seqnames_field_risk]],
                     sCNV[[start_field_risk]]), ]
  colnames(sCNV) <- c("Sample", "Chr", "Start", "End", "Segment_Mean")

  # Warnings from findOverlaps() are suppressed below, so a chromosome
  # naming mismatch would otherwise produce an empty result silently.
  if (seqnames_field_genes %in% colnames(Genes) &&
      nrow(Genes) > 0 && nrow(sCNV) > 0) {
    shared_chr <- intersect(as.character(Genes[[seqnames_field_genes]]),
                            as.character(sCNV[["Chr"]]))
    if (length(shared_chr) == 0) {
      warning("The gene annotation and CNV files share no chromosome ",
              "names; no CNV region can be annotated. Check that both ",
              "files use the same chromosome naming style.",
              call. = FALSE)
    }
  }

  genes_GR <- GenomicRanges::makeGRangesFromDataFrame(
    Genes,
    seqnames.field     = seqnames_field_genes,
    start.field        = start_field_genes,
    end.field          = end_field_genes,
    keep.extra.columns = TRUE
  )

  sCNV_GR <- GenomicRanges::makeGRangesFromDataFrame(
    sCNV,
    seqnames.field     = "Chr",
    start.field        = "Start",
    end.field          = "End",
    keep.extra.columns = TRUE
  )

  # Query = genes, subject = CNV regions: a hit means the gene lies
  # entirely inside the region. Warnings are suppressed as in the original
  # implementation (presumably those about chromosomes present in only
  # one of the two files).
  hits <- suppressWarnings(
    GenomicRanges::findOverlaps(genes_GR, sCNV_GR, type = "within")
  )

  # One output row per hit: the CNV region plus the symbol of the gene.
  sCNV_ann <- sCNV[S4Vectors::subjectHits(hits), , drop = FALSE]
  sCNV_ann[["GeneSymbol"]] <-
    Genes[[gene_symbol_field]][S4Vectors::queryHits(hits)]

  output <- sCNV_ann[, c("Sample", "GeneSymbol", "Segment_Mean",
                          "Chr", "Start", "End")]
  # Repeated row indexing leaves row names such as "3", "3.1"; reset them.
  rownames(output) <- NULL

  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }

  risk_file_name <- tools::file_path_sans_ext(basename(risk_file))
  timestamp      <- format(Sys.time(), "%Y%m%d_%H%M%S")
  output_file    <- file.path(output_dir,
                               paste0(risk_file_name, "_annotated_",
                                      timestamp, ".csv"))
  utils::write.csv(output, output_file, row.names = FALSE)
  message("Annotated CNV saved to: ", output_file)

  output
}