R/aggregateStates.R

Defines functions .Rle aggregateStates

Documented in aggregateStates

#' merge segments that are all in a row, but avoid slopping across chroms
#' 
#' @param HMM     a ChromHMM GRanges
#' @param only3   keep only `name`, `thick`, `itemRgb` as mcols? (FALSE)
#' @param verbose prattle endlessly about which contig we're on? (FALSE) 
#' @param BPPARAM BiocParallel object (default is SerialParam()) 
#' 
#' @return        HMM but without unnecessary segmentations (or mcols)
#' 
#' @import        GenomicRanges
#' @import        BiocParallel
#'
#' @export
aggregateStates <- function(HMM, only3=FALSE, verbose=FALSE, BPPARAM=SerialParam()) {

  grl <- split(HMM, seqnames(HMM))[unique(seqnames(HMM))]
  unname(unlist(GRangesList(bplapply(grl, 
                                     .Rle, 
                                     only3=only3, 
                                     verbose=verbose,
                                     BPPARAM=BPPARAM))))

}


# use an Rle to encode runs of states
# this function is imperfect but works OK
.Rle <- function(HMM, only3=FALSE, verbose=FALSE) { 

  chrom <- unique(seqnames(HMM)) 
  if (length(chrom) > 1) stop("This function should be run per-contig!")
  if (verbose) message("Aggregating states on ", chrom, "... ", appendLF=FALSE)
  columns <- c("name", "thick", "itemRgb")

  ungapped <- HMM
  mc <- names(mcols(HMM))
  mcd <- setdiff(columns, mc)
  mci <- intersect(columns, mc) 
  for (m in mcd) mcols(ungapped)[, m] <- NA
  names(ungapped) <- rep("HMM", length(ungapped))
  if (only3) mcols(ungapped) <- mcols(ungapped)[, columns]
  HHMM <- ungapped
  
  gapped <- gaps(HMM)
  # apparently this must be enforced per-contig!
  gapped <- subset(gapped, seqnames(gapped) == chrom)

  ucd <- setdiff(mc, columns)
  if (length(gapped) > 0) {
    gapped$name <- "Gap" 
    gapped$thick <- ranges(gapped)
    gapped$itemRgb <- .toHex("0,0,0")
    names(gapped) <- rep("gaps", length(gapped))
    if (!only3) {
      for (u in ucd) {
        mcols(gapped)[, u] <- NA
      }
    }
    mcols(gapped) <- mcols(gapped)[, names(mcols(HHMM))]
    HHMM <- subset(sort(c(ungapped, gapped)), strand == "*")
  }

  asRle <- Rle(factor(HHMM$name))
  ends <- end(HHMM[end(asRle)])
  gr <- sort(HHMM[start(asRle)])
  stopifnot(all(start(gr) <= ends))
  end(gr) <- ends
  gr$name <- as.character(gr$name)
  gr$thick <- ranges(gr)

  gr <- unname(subset(gr, name != "Gap"))
  if (only3) {
    mcols(gr) <- mcols(gr)[, columns]
  }
  if (verbose) message("Done.")
  return(gr)

}
ttriche/chromophobe documentation built on Oct. 14, 2024, 11:59 a.m.