superintronic: Tools for Working with Intron Signal

Documented in join_design

#' Compute long-form coverage as a GRanges object
#' 
#' @param spec a `data.frame` containing an experimental design
#' @param source a column in the data identifying the name of BAM files
#' @param .which an optional GRanges object (default = NULL) to compute coverage
#' over a region (requires the BAM file to be indexed).
#' @param .genome_info a GRanges object containing reference annotation (default = NULL)
#' @param .drop_empty Filter ranges if they have zero coverage over an entire contig/chromosome (default TRUE)
#' @param .parallel a BiocParallel object (default = `BiocParallel::bpparam()`)
#' 
#' @details This function computes coverage as a GRanges object, with
#' a `source` column containing the name of the input BAM file(s), and a
#' `score` column containing the coverage score. 
#' 
#' The `.genome_info` argument takes a GRanges object containing 
#' contig/chromosome information. If supplied the resulting GRanges will be 
#' properly annotated with a `seqinfo` slot. This is important for ensure the
#' integrity of any downstream overlap operations. 
#' 
#' @return a GRanges object
#' 
#' @importFrom BiocParallel bpparam bplapply
#' @importFrom Rsamtools BamFile BamFileList
#' @importClassesFrom Rsamtools BamFileList BamFile
#' @importFrom GenomeInfoDb seqinfo seqinfo<- keepSeqlevels
#' @export
#' @rdname compute_coverage_long
setGeneric("compute_coverage_long",
                    signature = c("spec", "source"),
                    function(spec, source, .which = NULL, .genome_info = NULL, .drop_empty = TRUE, .parallel = BiocParallel::bpparam()) {
                      standardGeneric("compute_coverage_long")
                    })


#'@export
#'@rdname compute_coverage_long 
setMethod("compute_coverage_long", 
                   signature = c("character", "missing"),
                   function(spec, source, .which = NULL, .genome_info = NULL, .drop_empty = TRUE, .parallel = BiocParallel::bpparam()) {
                     bfl <- BamFileList(spec)
                     .bamlist_coverage(bfl, .which, .genome_info, .drop_empty, .parallel)
                   })

#'@export 
#'@rdname compute_coverage_long
setMethod("compute_coverage_long",
                   signature = c("DataFrame", "character"),
                   function(spec, source, .which = NULL, .genome_info = NULL, .drop_empty = TRUE, .parallel = BiocParallel::bpparam()) {
                     bfl <- BamFileList(as.character(spec[[source]]))
                     spec[[source]] <- as(spec[[source]], "Rle")
                     S4Vectors::mcols(bfl) <- spec
                     ans <- .bamlist_coverage(bfl, 
                                              .which, 
                                              .genome_info, 
                                              .drop_empty, 
                                              .parallel)
                     # S4Vectors::mcols(ans)[[source]] <-BiocGenerics::basename(S4Vectors::mcols(ans)[[source]])
                     ans
                   })
#'@export 
#'@rdname compute_coverage_long
setMethod("compute_coverage_long",
                   signature = c("data.frame", "character"), 
                   function(spec, source, .which = NULL, .genome_info = NULL, .drop_empty = TRUE, .parallel = BiocParallel::bpparam()) {
                     spec <- methods::as(spec, "DataFrame")
                     compute_coverage_long(spec, 
                                           source, 
                                           .which, 
                                           .genome_info, 
                                           .drop_empty, 
                                           .parallel)
                   }
)

# work horse function
.bamlist_coverage <- function(bfl, .which, .genome_info, .drop_empty, .parallel) {
  
  sb <- Rsamtools::ScanBamParam()

  if (!is.null(.which)) {
    sb <- Rsamtools::ScanBamParam(which = .which)
  }
  
  cvg  <- BiocParallel::bplapply(
    bfl,
    FUN = function(x) {
        compute_coverage(x, param = sb)
      },
    BPPARAM = .parallel
    ) 
  
  cvg <- GenomicRanges::GRangesList(cvg)
  
  if (!is.null(S4Vectors::mcols(bfl))) {
    md  <- S4Vectors::mcols(bfl)
  } else {
    md <- S4Vectors::DataFrame(source = S4Vectors::Rle(names(cvg)))
  }
  
  if (!is.null(.genome_info)) {
    cvg <- GenomeInfoDb::keepSeqlevels(cvg, 
                                       GenomeInfoDb::seqnames(.genome_info), 
                                       "tidy")
    seqinfo(cvg) <- seqinfo(.genome_info)
  }
  
  if (.drop_empty) {
    .drop_rng <- methods::as(seqinfo(cvg), "GRanges")
    cvg <- S4Vectors::endoapply(cvg,
                  function(x) {
                    IRanges::subsetByOverlaps(x, .drop_rng, 
                                              type = "equal", invert = TRUE)
                  }
    )
  }
  
  inx <- S4Vectors::rep.int(seq_along(cvg), S4Vectors::elementNROWS(cvg))
  cvg <- unlist(cvg, use.names = FALSE)
  md <- md[inx,, drop = FALSE]
  mcols(cvg) <- cbind(md, mcols(cvg))
  cvg
}

#' Merge a design matrix onto a GRanges object
#' 
#' @param ranges a GRanges object 
#' @param design a DataFrame object with 
#' @param by the key column either a character vector of the common columns
#' between `ranges` and `design` or a named character vector (default = NULL).
#' 
#' @details 
#' To summarise ranges features over variables in the experimental `design`,
#' we can join the `design`` matrix to a `ranges` object. 
#' This function computes an inner join on the common key between the two 
#' objects and sorts by the key.
#' 
#' @seealso `dplyr::inner_join()`
#'
#' @export
join_design <- function(ranges, design, by = NULL) {
  # for some reason though base::merge is really slow
  # so have gone for a kludgy indexing approach
  stopifnot(any(names(mcols(ranges)) %in% "source"))
  stopifnot(is(design, "data.frame") | is(design, "DataFrame"))
  
  by <- common_mcols(ranges, design, by = by)
  if (length(by) > 1) stop("Multiple keys not supported.")
  
  key_x <- by
  key_y <- by
  is_named_by <- length(names(by)) == 1
  
  col_x <- methods::as(mcols(ranges)[[key_x]], "Rle")
  col_y <- design[[key_y]]
  
  if (is_named_by) key_x <- names(by)
  diff <- BiocGenerics::setdiff(
    S4Vectors::runValue(col_x),
    col_y
  )
  
  if (length(diff) > 0) {
    ranges <- plyranges::filter(ranges, !(!!key_x %in% !!diff))
  }
  
  ranges <- ranges[BiocGenerics::order(col_x), ]
  
  nr <- seq_len(S4Vectors::nrun(col_x))
  rl <- S4Vectors::runLength(col_x)
  
  design <- design[BiocGenerics::order(col_y), 
                   !key_y,
                   drop = FALSE]
  
  design <- design[BiocGenerics::rep.int(nr, rl), , drop = FALSE]
  
  
  mcols(ranges) <- BiocGenerics::cbind(
    mcols(ranges),
    design
  )
  ranges
}


# Helper function for keys
common_mcols <- function(x, y, by = NULL) {
  x_names <- names(mcols(x))
  y_names <- names(y)
  if (is.null(by)) {
    common_mcols <- intersect(x_names,  y_names)
    if (length(common_mcols) == 0) {
      stop("No common columns between x & y", call. = FALSE)
    }
    return(common_mcols)
  } else {
    named_by <- names(by)
    if (length(named_by) > 0) {
      stopifnot(named_by %in% x_names || by %in% y_names)
      by
      
    } else {
      stopifnot(by %in% x_names || by %in% y_names)
      by
    }
  }
}