# R/get_overlap_seg_with_mask_ind.R

#' Find Overlapping Segments with a CNV Mask
#'
#' Given a TitanCNA segment data.frame and a mask segment data.frame, this
#' function finds every TitanCNA segment that overlaps a CNV mask segment by
#' at least \code{overlap.prop} of the TitanCNA segment's own width. It
#' returns the row numbers of the segments that do.
#'
#' Each segment/mask pair is evaluated on its own: the overlaps of several
#' mask segments with the same TitanCNA segment are not added together.
#'
#' @param segs.df TitanCNA segments in a data.frame. This can be the
#'   data.frame loaded from the \code{load_titan_seg} function. Or at the very
#'   least the data.frame should contain the columns: Chromosome,
#'   Start_Position, End_Position
#' @param mask.df Mask segments in a data.frame. Must contain the columns:
#'   chr, start, end
#' @param overlap.prop Proportion (a single number between 0 and 1) of the
#'   TitanCNA segment width that must be covered by a single mask segment for
#'   the TitanCNA segment to be considered overlapping
#' @return Vector of unique row numbers of \code{segs.df} whose segment
#'   overlaps at least one \code{mask.df} segment by \code{overlap.prop} or
#'   more. Empty when no segment qualifies.
#' @export
#' @examples
#' segs.df <- data.frame(Chromosome = c("chr1", "chr2", "chr1", "chr3"),
#'                       Start_Position = 1:4, End_Position = 7:10,
#'                       stringsAsFactors = FALSE)
#' mask.df <- data.frame(chr = "chr1", start = 6, end = 8,
#'                       stringsAsFactors = FALSE)
#' get_overlap_seg_with_mask_ind(segs.df, mask.df)
get_overlap_seg_with_mask_ind <- function(segs.df, mask.df,
                                          overlap.prop = 0.25) {

  # Validate the threshold up front so that a bad value fails with a clear
  # message instead of an unrelated error (or a silent "match everything").
  if (!is.numeric(overlap.prop) || length(overlap.prop) != 1L ||
      is.na(overlap.prop)) {
    stop("overlap.prop must be a single non-missing numeric value",
         call. = FALSE)
  }

  if (overlap.prop > 1) {
    stop("overlap.prop cannot be greater than 1", call. = FALSE)
  }

  if (overlap.prop < 0) {
    stop("overlap.prop cannot be less than 0", call. = FALSE)
  }

  # Build GRanges Objects
  segs.gr <-
    GenomicRanges::makeGRangesFromDataFrame(segs.df,
                                            seqnames.field = "Chromosome",
                                            start.field = "Start_Position",
                                            end.field = "End_Position")

  cnv.mask.gr <-
    GenomicRanges::makeGRangesFromDataFrame(mask.df,
                                            seqnames.field = "chr",
                                            start.field = "start",
                                            end.field = "end")

  # Find Overlaps
  # There is one hit per (segment, mask) pair, so a segment that overlaps
  # several mask segments appears several times. The hit values are indices
  # into the original inputs, which is what lets us map back to segs.df rows.
  overlap.res <- GenomicRanges::findOverlaps(segs.gr, cnv.mask.gr)
  gr.hit <- S4Vectors::queryHits(overlap.res)
  subject.hit <- S4Vectors::subjectHits(overlap.res)

  # Width of the intersection of each pair relative to the width of the
  # TitanCNA segment (not the mask segment)
  hit.segs.gr <- segs.gr[gr.hit]
  segs.cnv.overlap.prop <-
    IRanges::width(IRanges::pintersect(hit.segs.gr,
                                       cnv.mask.gr[subject.hit])) /
    IRanges::width(hit.segs.gr)

  segs.overlap.cnv.mask.ind <- which(segs.cnv.overlap.prop >= overlap.prop)

  # Map the indices to the original input data; unique() collapses segments
  # that passed the threshold against more than one mask segment
  unique(gr.hit[segs.overlap.cnv.mask.ind])
}
# tinyheero/titanCNAutils documentation built on May 31, 2019, 3:37 p.m.