Nothing
#' Annotate CNV Regions with Gene Symbols
#'
#' Finds the overlap between a gene annotation file and a recurrent CNV
#' file using genomic ranges, and annotates each CNV region with the
#' corresponding gene symbol. Requires the \pkg{GenomicRanges} package.
#'
#' @param genes_file Character. Path to the gene annotation CSV file.
#'   Must contain chromosome, start, end, and gene symbol columns
#'   (see parameters below for defaults).
#' @param risk_file Character. Path to the recurrent CNV CSV file (e.g.,
#'   the file path returned by \code{\link{recurrent}}). Must contain
#'   sample, chromosome, start, end, and segment mean columns.
#' @param output_dir Character. Directory where the annotated CSV will be
#'   saved. Default is the current directory (\code{"."}). The directory
#'   is created (recursively) if it does not exist yet.
#' @param seqnames_field_genes Character. Column name for chromosome in
#'   the gene file. Default is \code{"Chr"}.
#' @param start_field_genes Character. Column name for start position in
#'   the gene file. Default is \code{"Start"}.
#' @param end_field_genes Character. Column name for end position in the
#'   gene file. Default is \code{"End"}.
#' @param gene_symbol_field Character. Column name for gene symbols in
#'   the gene file. Default is \code{"GeneSymbol"}.
#' @param seqnames_field_risk Character. Column name for chromosome in
#'   the CNV file. Default is \code{"Chr"}.
#' @param start_field_risk Character. Column name for start position in
#'   the CNV file. Default is \code{"Start"}.
#' @param end_field_risk Character. Column name for end position in the
#'   CNV file. Default is \code{"End"}.
#' @param sample_field Character. Column name for sample IDs in the CNV
#'   file. Default is \code{"Sample"}.
#' @param segment_mean_field Character. Column name for segment mean
#'   values in the CNV file. Default is \code{"Segment_Mean"}.
#'
#' @return A data frame containing annotated CNV regions with columns:
#'   \code{Sample}, \code{GeneSymbol}, \code{Segment_Mean}, \code{Chr},
#'   \code{Start}, \code{End} (one row per gene/CNV-region hit; zero rows
#'   when nothing overlaps). The result is also written to a timestamped
#'   CSV file in \code{output_dir}.
#'
#' @details
#' This function uses \code{GenomicRanges::findOverlaps} with
#' \code{type = "within"} to find genes that fall entirely within each
#' CNV region. This function is cancer-type agnostic and can be applied
#' to CNV data from any solid tumour with a compatible gene annotation
#' reference file.
#'
#' Both input files are checked for the required columns right after they
#' are read; a missing column raises an error that names the file and the
#' absent column(s). CNV rows are sorted by chromosome and then by start
#' position before the overlap is computed. Warnings raised by
#' \code{findOverlaps} are suppressed; instead, a single warning is
#' issued when no gene falls within any CNV region.
#'
#' @examples
#' genes_file <- system.file("extdata", "gene_annotation.csv",
#'                           package = "RiskyCNV")
#' cnv_file <- system.file("extdata", "annotated_cnv.csv",
#'                         package = "RiskyCNV")
#' annotated <- annotate(
#'   genes_file = genes_file,
#'   risk_file  = cnv_file,
#'   output_dir = tempdir()
#' )
#' head(annotated)
#'
#' @importFrom GenomicRanges makeGRangesFromDataFrame findOverlaps
#' @importFrom S4Vectors subjectHits queryHits
#' @export
annotate <- function(genes_file,
                     risk_file,
                     output_dir = ".",
                     seqnames_field_genes = "Chr",
                     start_field_genes = "Start",
                     end_field_genes = "End",
                     gene_symbol_field = "GeneSymbol",
                     seqnames_field_risk = "Chr",
                     start_field_risk = "Start",
                     end_field_risk = "End",
                     sample_field = "Sample",
                     segment_mean_field = "Segment_Mean") {
  # Stop with a message naming the file and the absent columns instead of
  # letting `[.data.frame` fail with "undefined columns selected".
  require_columns <- function(df, columns, path) {
    absent <- setdiff(columns, colnames(df))
    if (length(absent) > 0L) {
      stop(
        "File '", path, "' is missing required column(s): ",
        paste(absent, collapse = ", "),
        call. = FALSE
      )
    }
    invisible(df)
  }

  Genes <- utils::read.csv(genes_file)
  require_columns(
    Genes,
    c(seqnames_field_genes, start_field_genes, end_field_genes,
      gene_symbol_field),
    genes_file
  )

  RecCNV <- utils::read.csv(risk_file)
  risk_columns <- c(sample_field, seqnames_field_risk, start_field_risk,
                    end_field_risk, segment_mean_field)
  require_columns(RecCNV, risk_columns, risk_file)

  # Keep only the needed CNV columns, ordered by chromosome then start.
  # A single two-key order() yields the same row order as sorting by
  # start first and then (stably) by chromosome.
  sCNV <- RecCNV[, risk_columns]
  sCNV <- sCNV[order(sCNV[[seqnames_field_risk]],
                     sCNV[[start_field_risk]]), ]
  # Normalise to fixed names so the rest of the function (and the output)
  # does not depend on the caller's column naming.
  colnames(sCNV) <- c("Sample", "Chr", "Start", "End", "Segment_Mean")

  genes_GR <- GenomicRanges::makeGRangesFromDataFrame(
    Genes,
    seqnames.field = seqnames_field_genes,
    start.field = start_field_genes,
    end.field = end_field_genes,
    keep.extra.columns = TRUE
  )
  sCNV_GR <- GenomicRanges::makeGRangesFromDataFrame(
    sCNV,
    seqnames.field = "Chr",
    start.field = "Start",
    end.field = "End",
    keep.extra.columns = TRUE
  )

  # Query = genes, subject = CNV regions: with type = "within" a hit means
  # the gene lies entirely inside the CNV region.
  # NOTE(review): warnings are suppressed here as in the previous
  # implementation -- presumably to silence sequence-level mismatch
  # warnings; confirm. The zero-hit warning below keeps an empty result
  # from going unnoticed.
  hits <- suppressWarnings(
    GenomicRanges::findOverlaps(genes_GR, sCNV_GR, type = "within")
  )
  if (length(hits) == 0L) {
    warning(
      "No genes fall within any CNV region; check that both files use ",
      "the same chromosome naming and coordinate system.",
      call. = FALSE
    )
  }

  # Row indices of `hits` refer to `Genes` (query) and the sorted `sCNV`
  # (subject), which are the exact data frames the ranges were built from.
  sCNV_ann <- sCNV[S4Vectors::subjectHits(hits), ]
  sCNV_ann$GeneSymbol <-
    Genes[[gene_symbol_field]][S4Vectors::queryHits(hits)]
  output <- sCNV_ann[, c("Sample", "GeneSymbol", "Segment_Mean",
                         "Chr", "Start", "End")]

  # Create the target directory on demand so the write cannot fail just
  # because the directory has not been made yet.
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }
  risk_file_name <- tools::file_path_sans_ext(basename(risk_file))
  timestamp <- format(Sys.time(), "%Y%m%d_%H%M%S")
  output_file <- file.path(
    output_dir,
    paste0(risk_file_name, "_annotated_", timestamp, ".csv")
  )
  utils::write.csv(output, output_file, row.names = FALSE)
  message("Annotated CNV saved to: ", output_file)

  output
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.