R/filter_out.R

Defines functions filter_out

Documented in filter_out

#' Filter out events located in heterogeneity areas
#'
#' Removes from \code{event_data} the events that overlap regions listed in
#' \code{het_regions}. Overlaps are only considered between ranges located on
#' the same chromosome.
#'
#' @param event_data a \code{data.frame} of events such as generated by function
#'   \code{\link{getdels}}. The columns \code{chr}, \code{start} and \code{end}
#'   are used to locate each event.
#' @param het_regions a \code{data.frame} of genomic ranges to be used for
#'   filtering out events located in these regions. The columns \code{chr},
#'   \code{start} and \code{stop} are used to locate each region.
#' @param min_overlap a single numeric value between 0 and 1. The minimal
#'   proportion of the length of the event that must overlap with a single
#'   region listed in \code{het_regions} for this event to be filtered out. A
#'   value of 0 results in events overlapping a region by even a single
#'   nucleotide being removed, whereas a value of 1 results in only events
#'   entirely located in a specified region being removed.
#'
#' @return a \code{data.frame} of events similar to that given as input,
#'   but with events removed according to the specified filters.
#' @export
#'
#' @examples
#' NULL
filter_out <- function(event_data, het_regions, min_overlap = 0) {

  # min_overlap is a proportion; reject anything else early so that a
  # mis-specified threshold does not silently disable the filter
  if (!is.numeric(min_overlap) || length(min_overlap) != 1L ||
      is.na(min_overlap) || min_overlap < 0 || min_overlap > 1) {
    stop("min_overlap must be a single numeric value between 0 and 1",
         call. = FALSE)
  }

  # Converting the event_data object to a GRanges object
  g_events <- GenomicRanges::GRanges(
    seqnames = event_data$chr,
    ranges = IRanges::IRanges(start = event_data$start, end = event_data$end)
  )

  # Converting the het_regions object to a GRanges object
  # (note that the end coordinate of the regions is read from column "stop")
  g_regions <- GenomicRanges::GRanges(
    seqnames = het_regions$chr,
    ranges = IRanges::IRanges(start = het_regions$start, end = het_regions$stop)
  )

  # Checking which of g_events overlap with any region in g_regions
  overlaps <- IRanges::overlapsAny(g_events, g_regions)

  # The event data is returned directly if no overlap is found
  if (!any(overlaps)) return(event_data)

  # If min_overlap is not 0, the proportion of each event covered by each
  # region it overlaps is computed, and only the events for which at least one
  # region reaches min_overlap are removed
  if (min_overlap != 0) {
    hits <- IRanges::findOverlaps(g_events, g_regions)

    # One entry per (event, region) pair; an event overlapping several regions
    # appears several times, so results must be mapped back through event_idx
    # rather than through which(overlaps)
    event_idx  <- hits@from
    region_idx <- hits@to

    # Width of the intersection of each pair (coordinates are inclusive)
    shared_start <- pmax(event_data$start[event_idx],
                         het_regions$start[region_idx])
    shared_end   <- pmin(event_data$end[event_idx],
                         het_regions$stop[region_idx])
    event_width  <- event_data$end[event_idx] - event_data$start[event_idx] + 1

    overlap_proportions <- (shared_end - shared_start + 1) / event_width

    # An event is flagged as soon as one of its overlaps reaches the threshold
    flagged_events <- event_idx[overlap_proportions >= min_overlap]
    overlaps <- seq_along(overlaps) %in% flagged_events
  }

  event_data[!overlaps, ]
}
malemay/delgbs documentation built on Feb. 1, 2024, 8:38 a.m.