R/utils.R

Defines functions removeReadPileupSpikes removeDoubleSCEs insertchr transCoord collapseBins

Documented in collapseBins insertchr removeDoubleSCEs removeReadPileupSpikes transCoord

#' Collapse consecutive bins with the same ID value
#'
#' Collapse consecutive bins with the same value defined in 'id.field'. 
#'
#' @param gr A \code{\link{GRanges-class}} object.
#' @param id.field A number of metadata column to use for region merging.
#' @return A \code{\link{GRanges-class}} object.
collapseBins <- function(gr, id.field=3) {
    ##get indices of last range in a consecutive(RLE) run of the same value
    ind.last <- cumsum(S4Vectors::runLength(S4Vectors::Rle(mcols(gr)[,id.field])))
    ##get indices of first range in a consecutive(RLE) run of the same value
    ind.first <- c(1,cumsum(S4Vectors::runLength(S4Vectors::Rle(mcols(gr)[,id.field]))) + 1)
    ind.first <- ind.first[-length(ind.first)]  ##erase last index from first range indices 
    collapsed.gr <- GenomicRanges::GRanges(seqnames=seqnames(gr[ind.first]), ranges=IRanges(start=start(gr[ind.first]), end=end(gr[ind.last])), mcols=mcols(gr[ind.first]))
    names(mcols(collapsed.gr)) <- names(mcols(gr[ind.first]))
    return(collapsed.gr)
}

#' Transform genomic coordinates
#'
#' Add two columns with transformed genomic coordinates to the \code{\link{GRanges-class}} object. This is useful for making genomewide plots.
#'
#' @param gr A \code{\link{GRanges-class}} object.
#' @return The input \code{\link{GRanges-class}} with two additional metadata columns 'start.genome' and 'end.genome'.
transCoord <- function(gr) {
    cum.seqlengths <- cumsum(as.numeric(seqlengths(gr)))
    cum.seqlengths.0 <- c(0,cum.seqlengths[-length(cum.seqlengths)])
    names(cum.seqlengths.0) <- GenomeInfoDb::seqlevels(gr)
    gr$start.genome <- start(gr) + cum.seqlengths.0[as.character(seqnames(gr))]
    gr$end.genome <- end(gr) + cum.seqlengths.0[as.character(seqnames(gr))]
    return(gr)
}

#' Insert chromosome for in case it's missing
#' 
#' Add two columns with transformed genomic coordinates to the \code{\link{GRanges-class}} object. This is useful for making genomewide plots.
#'
#' @param gr A \code{\link{GRanges-class}} object.
#' @return The input \code{\link{GRanges-class}} object with an additional metadata column containing chromosome name with 'chr'. 
insertchr <- function(gr) {
    mask <- which(!grepl('chr', seqnames(gr)))
    mcols(gr)$chromosome <- as.character(seqnames(gr))
    mcols(gr)$chromosome[mask] <- sub(pattern='^', replacement='chr', mcols(gr)$chromosome[mask])
    mcols(gr)$chromosome <- as.factor(mcols(gr)$chromosome)
    return(gr)
}

#' Process double SCE chromosomes: with internal WC region.
#' 
#' This function will take from a double SCE chromosome only WW or CC region (Longer region is taken).
#'
#' @param gr A \code{\link{GRanges-class}} object.
#' @inheritParams synchronizeReadDir
#' @return The input \code{\link{GRanges-class}} object with only WW or CC region retained. 
removeDoubleSCEs <- function(gr, collapseWidth=5000000) {
    ## Helper function ##
    fillGaps <- function(gr, collapseWidth=5000000) {
        ## Collapse gaps smaller or equal to collapseWidth
        gaps.gr <- suppressWarnings( GenomicRanges::gaps(gr, start = min(start(gr)), end = max(end(gr))) )
        gaps.gr <- gaps.gr[strand(gaps.gr) == '*']
        gr.new <- suppressWarnings( c(gr[,0], gaps.gr[width(gaps.gr) <= collapseWidth]) )
        gr.new <- GenomicRanges::reduce(gr.new)
        return(gr.new)
    }    
  
    gr <- GenomeInfoDb::keepSeqlevels(gr, value = unique(seqnames(gr)), pruning.mode = 'coarse')
    gr <- collapseBins(gr)
    if (any(gr$states == 'wc')) {
        wc.idx <- which(gr$states == 'wc')
        if (any(wc.idx > 1) & any(wc.idx < length(gr))) {
            ## Remove wc regions
            gr.new <- gr[-wc.idx]
            ## Take strand state covering largest region
            gr.new.byState <- split(gr.new, gr.new$states)
            state.widths <- lapply(gr.new.byState, function(x) sum(width(x)))
            max.state <- names(which.max(state.widths))
            gr.new$states[width(gr.new) <= collapseWidth] <- max.state
            gr.new <- gr.new[gr.new$states == names(which.max(state.widths))]
            ## Collapse gaps smaller or equal to collapseWidth
            gr.new <- fillGaps(gr.new, collapseWidth=collapseWidth)
            return(gr.new)
        } else {
            ## Remove wc regions
            gr.new <- gr[-wc.idx]
            ## Collapse gaps smaller or equal to collapseWidth
            gr.new <- fillGaps(gr.new, collapseWidth=collapseWidth)
            return(gr.new)
        }
    } else {
        gr.new <- gr
        ## Collapse gaps smaller or equal to collapseWidth
        gr.new <- fillGaps(gr.new, collapseWidth=collapseWidth)
        return(gr)
    }
}


#' Remove large spikes in short reads coverage
#' 
#' This function takes a \code{\link{GRanges-class}} object of aligned short reads
#' and removes pockets of reads that are stacked on top of each other based on the 
#' maximum number of reads allowed to pileup in 'max.pileup' parameter.
#' 
#' @param gr A \code{\link{GRanges-class}} object.
#' @param max.pileup A maximum number of reads overlapping each other to be kept.
#' @return A \code{\link{GRanges-class}} object.
#' @author David Porubsky
#' @export
#' @examples
#'## Get some files that you want to load
#'exampleFolder <- system.file("extdata", "example_results", package="breakpointRdata")
#'infile <- list.files(exampleFolder, full.names=TRUE)[1]
#'## Read in the reads
#'breakP.obj <- get(load(infile))
#'frags <- breakP.obj$fragments 
#'## Remove read spikes
#'frags <- removeReadPileupSpikes(gr=frags)
#'
removeReadPileupSpikes <- function(gr=NULL, max.pileup=30) {
  ## Get overlapping read levels
  gr.levels <- GenomicRanges::disjointBins(gr, ignore.strand=TRUE)
  gr$gr.levels <- gr.levels
  ## Get regions above set pileup threshold
  gr.spikes <- gr[gr$gr.levels >= max.pileup]
  ## Get all reads/ranges overlapping with defined pileup spikes
  gr.filt.regions <- IRanges::subsetByOverlaps(gr, reduce(gr.spikes))
  ## Return filtered ranges
  gr <- IRanges::subsetByOverlaps(gr, gr.filt.regions, invert = TRUE)
  return(gr)
}

Try the breakpointR package in your browser

Any scripts or data that you put into this service are public.

breakpointR documentation built on Nov. 8, 2020, 8:04 p.m.