R/BamDataFile.R

##########################################################################/**
# @RdocClass BamDataFile
#
# @title "The BamDataFile class"
#
# \description{
#  @classhierarchy
#
#  A BamDataFile object represents a BAM file.
# }
#
# @synopsis
#
# \arguments{
#   \item{...}{Arguments passed to @see "R.filesets::GenericDataFile".}
# }
#
# \section{Fields and Methods}{
#  @allmethods "public"
# }
#
# @author "HB"
#
# \references{
#  [1] The SAM Format Specification Working Group,
#      \emph{The SAM Format Specification}, Sept 7, 2011.\cr
# }
#
# \seealso{
#   An object of this class is typically part of an
#   @see "BamDataSet".
# }
#*/###########################################################################
setConstructorS3("BamDataFile", function(...) {
  extend(GenericDataFile(...), c("BamDataFile", uses("AromaSeqDataFile"), uses("SequenceContigsInterface")))
})


setMethodS3("as.character", "BamDataFile", function(x, ...) {
  this <- x

  s <- NextMethod("as.character")

  if (isFile(this)) {
    s <- c(s, sprintf("Has index file (*.bai): %s", hasIndex(this)))
    s <- c(s, sprintf("Is sorted: %s", isSorted(this)))
    if (hasIndex(this)) {
      s <- c(s, getSeqGenericSummary(x, ...))
      counts <- getReadCounts(this)
      counts <- c(counts, total=sum(counts, na.rm=TRUE))
      frac <- 100*counts/counts["total"]
      counts <- pi3(counts)
      s <- c(s, sprintf("Number of mapped reads: %s (%.1f%%) out of %s", counts["mapped"], frac["mapped"], counts["total"]))
      s <- c(s, sprintf("Number of unmapped reads: %s (%.1f%%) out of %s", counts["unmapped"], frac["unmapped"], counts["total"]))
    }
    s <- c(s, sprintf("Generated by: %s", getProgramString(this)))
  }

  s
}, protected=TRUE)


setMethodS3("buildIndex", "BamDataFile", function(this, ..., skip=!overwrite, overwrite=FALSE) {
  pathname <- getPathname(this)
  pathnameBAI <- sprintf("%s.bai", pathname)

  if (hasIndex(this)) {
    if (skip) {
      return(invisible(getIndexFile(this, create = FALSE)))
    }
    if (!overwrite) {
      throw("Cannot build index file (*.bai). File already exists: ", pathnameBAI)
    }
  }

  pathname <- Arguments$getReadablePathname(pathname)
  pathnameT <- Rsamtools::indexBam(pathname)

  getIndexFile(this, create = FALSE)
})


setMethodS3("getIndexFile", "BamDataFile", function(this, create = TRUE, clean = TRUE, ...) {
  pathname <- getPathname(this)
  pathnameIDX <- sprintf("%s.bai", pathname)
  pathnameIDX <- Arguments$getReadablePathname(pathnameIDX, mustExist = FALSE)
  
  if (isFile(pathnameIDX)) {
    ## If index file is outdated, deleted
    if (clean && file_test("-ot", pathnameIDX, pathname)) {
      warning("Detected outdated index file and recreated it: ", pathnameIDX)
      file.remove(pathnameIDX)
      ## Will try to recreate it
      create <- TRUE
    }
  }

  if (!isFile(pathnameIDX)) {
    if (!create) return(NULL)
    pathnameT <- Rsamtools::indexBam(pathname)
  }

  BamIndexDataFile(pathnameIDX)
})


setMethodS3("hasIndex", "BamDataFile", function(this, ...) {
  !is.null(getIndexFile(this, create = FALSE))
})



setMethodS3("getIndexStats", "BamDataFile", function(this, ..., force=FALSE) {
  stats <- this$.idxStats

  if (force || is.null(stats)) {
    pathname <- getPathname(this)
    if (!hasIndex(this)) {
      throw("Cannot get index statistics. Index does not exists: ", pathname)
    }
    # Quote pathnames
    bfr <- systemSamtools("idxstats", shQuote(pathname), stdout=TRUE, ...)
    bfr <- strsplit(bfr, split="\t", fixed=TRUE)
    seqName <- sapply(bfr, FUN=.subset, 1L)
    seqLength <- as.integer(sapply(bfr, FUN=.subset, 2L))
    countMapped <- as.numeric(sapply(bfr, FUN=.subset, 3L))
    countUnmapped <- as.numeric(sapply(bfr, FUN=.subset, 4L))

    # NOTE: samtools idxstats can return ridicolously(!) large read
    # counts.  Let's assume they are errors and set to NAs. /HB 2010-10-02
    countMapped[countMapped >= .Machine$integer.max] <- NA
    countUnmapped[countUnmapped >= .Machine$integer.max] <- NA
    countMapped <- as.integer(countMapped)
    countUnmapped <- as.integer(countUnmapped)

    stats <- data.frame(length=seqLength, mapped=countMapped, unmapped=countUnmapped)
    rownames(stats) <- seqName
    this$.idxStats <- stats
  }

  stats
})


setMethodS3("getReadCounts", "BamDataFile", function(this, ...) {
  if (isCapableOf(aroma.seq, "samtools")) {
    # Alt 1. samtools indexstats
    stats <- getIndexStats(this, ...)
    counts <- colSums(stats[,c("mapped", "unmapped")], na.rm=TRUE)
  } else {
    # Alt 1. Rsamtools
    use("Rsamtools")
    pathname <- getPathname(this)
    stats <- captureOutput({
      quickBamFlagSummary(pathname, index=pathname, main.groups.only=FALSE)
    })
    stats <- grep("record is", stats, value=TRUE)
    .stop_if_not(length(stats) == 2L)
    stats <- strsplit(stats, split="|", fixed=TRUE)
    stats <- sapply(stats, FUN=function(x) x[2L])
    counts <- as.integer(stats)
    names(counts) <- c("mapped", "unmapped")
  }

  # Sanity check
  .stop_if_not(length(counts) == 2L)

  counts
})


setMethodS3("nbrOfReads", "BamDataFile", function(this, ...) {
  ## Does always work. /HB 2014-04-18
  ## Maybe 'samtools flagstat' is better?
  counts <- getReadCounts(this, ...)
  ## Very slow?!? /HB 2014-04-19
##  counts <- Rsamtools::countBam(getPathname(this))$records
  sum(counts, na.rm=TRUE)
})

setMethodS3("nbrOfMappedReads", "BamDataFile", function(this, ...) {
  counts <- getReadCounts(this, ...)
  counts["mapped"]
})

setMethodS3("nbrOfUnmappedReads", "BamDataFile", function(this, ...) {
  counts <- getReadCounts(this, ...)
  counts["unmapped"]
})


# \details{
#   BAM headers typically contain an \code{"@HD VN:1.0 SO:<value>"} entry,
#   where \code{<value>} indicates whether the aligned reads are sorted
#   or not.  Unfortunately, this entry is neither enforced nor has it to
#   be correct [1,2].
#
#   Instead, we consider a BAM file to be sorted if and only if it has
#   an index file.  The rationale is that it is not possible to index
#   a BAM file unless it is sorted first.
# }
#
# \references{
#   [1] Question: is my BAM file sorted?, Biostar, 2011,
#   \url{http://www.biostars.org/post/show/5256/is-my-bam-file-sorted/}\cr
#   [2] Asking for suggestiona on samtools bug fixing, SEQanswers, 2010,
#   \url{http://seqanswers.com/forums/showthread.php?t=3739}\cr
# }
setMethodS3("isSorted", "BamDataFile", function(this, ...) {
  isTRUE(hasIndex(this))
})


# Argument '...' must be 2nd to match the generic base::sort() function.
setMethodS3("sort", "BamDataFile", function(x, ..., force=FALSE) {
  # To please R CMD check
  this <- x

  # Nothing todo?
  if (!force && isSorted(this)) {
    return(this)
  }

  throw("Not yet implemented!")
})

setMethodS3("getTargets", "BamDataFile", function(this, ...) {
  hdr <- getHeader(this)
  targets <- hdr$targets
  targets
})


setMethodS3("getSeqLengths", "BamDataFile", function(this, ...) {
  getTargets(this)
})

## BEGIN: DEPRECATE?
setMethodS3("nbrOfTargets", "BamDataFile", function(this, ...) {
  nbrOfSeqs(this, ...)
})

setMethodS3("getTargetNames", "BamDataFile", function(this, ...) {
  getSeqNames(this, ...)
})

setMethodS3("getTargetLengths", "BamDataFile", function(this, ...) {
  getSeqLengths(this, ...)
})

setMethodS3("getTotalTargetLength", "BamDataFile", function(this, ...) {
  getTotalSeqLength(this, ...)
})
## END: DEPRECATE?


setMethodS3("getHeader", "BamDataFile", function(this, force=FALSE, ...) {
  header <- this$.header
  if (force || is.null(header)) {
    header <- readHeader(this, ...)
    this$.header <- header
  }
  header
})

setMethodS3("readHeader", "BamDataFile", function(this, ...) {
  pathname <- getPathname(this)
  pathname <- Arguments$getReadablePathname(pathname)
  bf <- Rsamtools::BamFile(pathname, index=character(0L))
  hdr <- Rsamtools::scanBamHeader(bf)
  hdr
}, private=TRUE)


setMethodS3("getPrograms", "BamDataFile", function(this, ...) {
  hdr <- getHeader(this, ...)
  pgs <- hdr$text[names(hdr$text) == "@PG"]
  pgs <- lapply(pgs, FUN=function(pg) {
    pg <- trim(pg)
    pg <- strsplit(pg, split=":", fixed=TRUE)
    keys <- sapply(pg, FUN=function(x) x[1L])
    values <- sapply(pg, FUN=function(x) x[-1L])
    names(values) <- keys
    values
  })
  ids <- sapply(pgs, FUN=function(x) x[[1L]])
  pgs <- lapply(pgs, FUN=function(x) x[-1L])
  names(pgs) <- ids
  pgs
}, protected=TRUE)

setMethodS3("getProgram", "BamDataFile", function(this, ...) {
  pgList <- getPrograms(this, ...)
  if (length(pgList) >= 1L) {
    pgList <- pgList[[1L]]
  } else {
    pgList <- list()
  }
  pgList
}, protected=TRUE)

setMethodS3("getProgramString", "BamDataFile", function(this, full=FALSE, ...) {
  pgs <- getPrograms(this, ...)
  res <- c()
  for (kk in seq_along(pgs)) {
    pg <- pgs[[kk]]
    name <- names(pgs)[kk]
    resK <- sprintf("(%d)", kk)

    # Program name
    pn <- c()
    if (!is.null(name)) pn <- c(pn, gsub("[.][0-9]+$", "", name))
    if (is.element("PN", names(pg))) pn <- c(pn, pg["PN"])
    if (length(pn) == 2L) {
      pattern <- sprintf("^%s", pn[2])
      pn[1] <- gsub(pattern, "", pn[1])
      pn <- pn[nzchar(pn)]
    }
    pn <- paste(pn, collapse=" ")
    resK <- c(resK, pn)

    if (is.element("VN", names(pg))) resK <- c(resK, sprintf("v%s", pg["VN"]))
    if (full) {
      if (is.element("CL", names(pg))) resK <- c(resK, sprintf("(call: %s)", pg["CL"]))
      if (is.element("DS", names(pg))) resK <- c(resK, sprintf("[%s]", pg["DS"]))
    }
    resK <- paste(resK, collapse=" ")
    res <- c(res, resK)
  }
  res <- paste(res, collapse="; ")
  res
}, protected=TRUE)


# \section{Read Groups (RG) specification}{
#   The RG fields populated are ID, SM, PL and LB, where
#   ID=identifier, SM=sample, PL=platform/technology, and
#   LB=library (DNA library prep identifier).
#   ID: Each RG must have a unique ID.
#   SM: Sample. Use pool name where a pool is being sequenced.
#   PM: Platform/technology used to produce the reads. Valid values:
#       CAPILLARY, HELICOS, ILLUMINA, IONTORRENT, LS454, PACBIO, and SOLID.
#   LB: Library.
#
#   Note that tools such as GATK (Broad Institute) requires read groups
#   'ID' and 'SM'. [3].
# }
#
# \references{
#  [1] The SAM Format Specification Working Group,
#      \emph{The SAM Format Specification}, Sept 7, 2011.\cr
#  [2] "What is '@RG ID' versus '@RG SM'"
#      \url{http://seqanswers.com/forums/showthread.php?t=9784}\cr
#  [3] Geraldine van der Auwera,
#      \emph{Collected FAQs about BAM files},
#      The Broad Institute, Aug 2012.
#      \url{http://gatkforums.broadinstitute.org/discussion/1317/collected-faqs-about-bam-files}\cr
# }
setMethodS3("getReadGroups", "BamDataFile", function(this, ...) {
  hdr <- getHeader(this, ...)
  SamReadGroup$byScanBamHeader(hdr, ...)
})

setMethodS3("getReadGroup", "BamDataFile", function(this, ...) {
  rgList <- getReadGroups(this, ...)
  if (length(rgList) >= 1L) {
    rgList <- rgList[[1L]]
  } else {
    rgList <- SamReadGroup()
  }
  rgList
})


# @RdocMethod replaceAllReadGroups
# @title "Writes a new BAM file with all existing read groups replaced by one new read group"
#
# \description{
#   @get "title".
# }
#
# \arguments{
#  \item{sample}{Specifies the \code{SM} read group.}
#  \item{library}{Specifies the \code{LB} read group.}
#  \item{platform}{Specifies the \code{PL} read group.}
#  \item{platformUnit}{Specifies the \code{PU} read group.}
# }
setMethodS3("replaceAllReadGroups", "BamDataFile", function(this, rg="*", ..., validate=TRUE, skip=!overwrite, overwrite=FALSE, verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'rg':
  if (is.character(rg) && rg == "*") {
    rg <- SamReadGroup()
    keys <- names(asSamList(rg, drop=FALSE))
    for (key in keys) rg[[key]] <- "*"
  } else {
    rg <- Arguments$getInstanceOf(rg, "SamReadGroup")
  }

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }

  verbose && enter(verbose, "Updating Read Groups")


  pathname <- getPathname(this)

  # Output pathname
  pathnameD <- gsub("[.]bam$", ",RG.bam", pathname)
  pathnameD <- Arguments$getWritablePathname(pathnameD, mustNotExist=(!skip && !overwrite))

  if (isFile(pathnameD)) {
    if (skip) {
      verbose && cat(verbose, "Already exists. Skipping.")
      bf <- newInstance(this, pathnameD)
      buildIndex(bf, skip=TRUE, verbose=less(verbose, 10))
      verbose && exit(verbose)
      return(bf)
    }
    if (!overwrite) {
      throw("Cannot replace SAM read groups. File already exists: ", pathnameD)
    }
  }


  verbose && cat(verbose, "Arguments:")
  verbose && print(verbose, rg)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Use default values from existing read groups of the input file?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  rgList <- asSamList(rg)
  keep <- sapply(rgList, FUN=function(x) identical(x, "*"))
  keys <- names(rgList)[keep]
  if (length(keys) > 0L) {
    rgDefault <- getReadGroups(this)
    rgDefault <- Reduce(merge, rgDefault)
    # Sanity check
    .stop_if_not(inherits(rgDefault, "SamReadGroup"))

    for (key in keys) {
      rg[[key]] <- rgDefault[[key]]
    }
  } # for (key ...)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  validate(rg)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Assert mandatory fields
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  rgList <- asSamList(rg)
  mandatory <- c("SM", "LB", "PL", "PU")
  missing <- setdiff(mandatory, names(rgList))
  if (length(missing)) {
    throw("Cannot write read groups. Mandatory fields are missing: ", hpaste(missing))
  }

  verbose && cat(verbose, "Arguments:")
  verbose && str(verbose, rgList)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Write new BAM file
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  args <- list("AddOrReplaceReadGroups", I=pathname, O=pathnameD)
  args <- c(args, asString(rg, fmtstr="RG%s=%s"))
  verbose && str(verbose, args)
  args$verbose <- less(verbose, 10)

  res <- do.call(systemPicard, args)
  status <- attr(res, "status"); if (is.null(status)) status <- 0L
  verbose && cat(verbose, "Results:")
  verbose && str(verbose, res)
  verbose && cat(verbose, "Status:")
  verbose && str(verbose, status)

  bf <- newInstance(this, pathnameD)
  buildIndex(bf, overwrite=TRUE, verbose=less(verbose, 10))

  verbose && exit(verbose)

  bf
}) # replaceAllReadGroups()


# See help("scanBam", package="Rsamtools") for definition of 'pos'.
setMethodS3("extractReadStartPositions", "BamDataFile", function(this, param=ScanBamParam(what=c("rname", "pos")), flag=scanBamFlag(isUnmappedQuery=FALSE, isDuplicate=FALSE), ..., verbose=FALSE) {
  use("Rsamtools")

  pathname <- getPathname(this)

  data <- scanBam(pathname, flag=flag, param=param, ...)
  data <- data[[1L]]
  data <- as.data.frame(data)
  data$rname <- as.character(data$rname)
  data
})


setMethodS3("readDataFrame", "BamDataFile", function(this, fields=NULL, flag=scanBamFlag(isUnmappedQuery=FALSE, isDuplicate=FALSE), which=which, ..., verbose=FALSE) {
  use("Rsamtools")

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'fields':
  knownFields <- c("qname", "flag", "rname", "strand", "pos", "qwidth", "mapq", "cigar", "mrnm", "mpos", "isize")
  if (is.null(fields)) {
    fields <- knownFields
  }
  fields <- match.arg(fields, choices=knownFields, several.ok=TRUE)

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }


  verbose && enter(verbose, "Reading data frame")

  pathname <- getPathname(this)
  verbose && cat(verbose, "Pathname: ", pathname)

  param <- ScanBamParam(what=fields, which=which)
  verbose && cat(verbose, "scanBam() parameters:")
  verbose && print(verbose, param)

  data <- scanBam(pathname, param=param, ...)

  # Sanity check
  .stop_if_not(is.list(data) && length(data) == 1L)
  data <- data[[1L]]
  .stop_if_not(is.list(data))
  verbose && str(verbose, data)

  verbose && enter(verbose, "Coercing list to data frame")
  # Sanity check
  ns <- sapply(data, FUN=length)
  .stop_if_not(all(ns == ns[1L]))

  data <- as.data.frame(data, stringsAsFactors=TRUE)
  verbose && str(verbose, data)
  verbose && exit(verbose)

  verbose && exit(verbose)

  data
})


setMethodS3("readReadPositions", "BamDataFile", function(this, ..., flag=scanBamFlag(isUnmappedQuery=FALSE, isDuplicate=FALSE), verbose=FALSE) {
  use("Rsamtools")

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }

  verbose && enter(verbose, "Reading (rname,pos)")

  verbose && cat(verbose, "scanBam() flags:")
  verbose && print(verbose, flag)

  data <- readDataFrame(this, fields=c("rname", "pos"), flag=flag, ..., verbose=less(verbose, 5))

  verbose && exit(verbose)

  data
}) # readReadPositions()


###########################################################################/**
# @RdocMethod validate
# @alias validate.SamDataFile
#
# @title "Validates a BAM (or SAM) file"
#
# \description{
#   @get "title".
# }
#
# @synopsis
#
# \arguments{
#  \item{method}{A @character string specifying which validation method to use.}
#  \item{...}{Additional arguments passed to the internal validation method.}
#  \item{verbose}{See @see "R.utils::Verbose".}
# }
#
# \value{
#   Returns (invisibly) what the internal validation method returns,
#   or throws an exception if an error is detected.
# }
#
# \seealso{
#   Internally \code{picardValidateSamFile()} is used.
# }
#
# @author "HB"
#
# @keyword internal
#*/###########################################################################
setMethodS3("validate", "BamDataFile", function(this, method=c("picard"), ..., verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'method':
  method <- match.arg(method)

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }

  verbose && enter(verbose, sprintf("Validating %s", class(this)[1L]))
  verbose && cat(verbose, "Validation method: ", method)

  if (method == "picard") {
    .stop_if_not(isCapableOf(aroma.seq, "picard"))
    res <- picardValidateSamFile(getPathname(this), ..., verbose=verbose)
  }

  verbose && exit(verbose)

  invisible(res)
}, protected=TRUE)



setMethodS3("writeSample", "BamDataFile", function(this, pathname, n, seed=NULL, ..., verbose=FALSE) {
  use("Rsamtools")

  # Argument 'pathname':
  pathname <- Arguments$getWritablePathname(pathname, mustNotExist=TRUE)

  # Argument 'n':
  n <- Arguments$getInteger(n, range=c(1,Inf))
  nMax <- nbrOfReads(this)
  if (n > nMax) {
    throw("Requested more reads than available. Cannot sample with replacement: ", n, " > ", nMax)
  }

  # Argument 'seed':
  if (!is.null(seed)) seed <- Arguments$getInteger(seed)

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }


  verbose && enter(verbose, sprintf("Writing sample of %s", class(this)[1L]))
  verbose && print(verbose, this)
  verbose && cat(verbose, "Sample size: ", n)
  verbose && cat(verbose, "seed: ", seed)

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Set the random seed
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (!is.null(seed)) {
    verbose && enter(verbose, "Setting (temporary) random seed")
    oldRandomSeed <- NULL
    if (exists(".Random.seed", mode="integer")) {
      oldRandomSeed <- get(".Random.seed", mode="integer")
    }
    on.exit({
      if (!is.null(oldRandomSeed)) {
        .Random.seed <<- oldRandomSeed
      }
    }, add=TRUE)
    verbose && cat(verbose, "The random seed will be reset to its original state afterward.")
    verbose && cat(verbose, "Seed: ", seed)
    set.seed(seed)
    verbose && exit(verbose)
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Sample what to keep
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  yieldSize <- 1e6

  progress <- isVisible(verbose)
  verbose && cat(verbose, "Displaying progress: ", progress)
  cprogress <- 0L
  dprogress <- ceiling(nMax/yieldSize/100)
  sprogress <- paste(rep(".", times=max(min(100/dprogress, 100),1)), collapse="")

  verbose && enter(verbose, "Sampling read indexed to keep")
  if (n < nMax) {
    # FIXME: For a BAM file with, say, 2*10^9 reads, here we
    # allocate a vector of 8GB just to keep track of which
    # reads to keep. /HB 2014-04-19
    verbose && printf(verbose, "Allocating logical vector of length %d: %g GB\n", nMax, (4*nMax)/1024^3)
    keep <- logical(length=nMax)
    keep[sample(nMax, size=n, replace=FALSE)] <- TRUE
##    verbose && print(verbose, table(keep))

    # Offset
    offset <- 0L

    # TODO: Added ram/buffer size option. /HB 2013-07-01
    filter <- FilterRules(list(sampler=function(x) {
      # Display progress?
      if (progress) {
        if (cprogress %% dprogress == 0L) {
          message(sprogress, appendLF=FALSE)
          cprogress <<- cprogress + 1L
        }
      }

      n <- nrow(x)
##      message(sprintf("n=%d",n))

      # Nothing to do?
      if (n == 0L) return(logical(0L))

      # Available indices
      res <- keep[offset + seq_len(n)]
##      message(sprintf("n2=%d",length(res)))
      # Sanity check
      .stop_if_not(length(res) == n)

      # Update offset
      offset <<- offset + n

      if (progress) {
        if (cprogress == 100L) {
          message(" [100%]", appendLF=TRUE)
          cprogress <<- 0L
        }
      }

      res
    }))
  } else {
    verbose && cat(verbose, "Nothing to filter; keeping all reads")
    # Nothing to filter; keep everything
    filter <- NULL
  }
  verbose && exit(verbose)


  verbose && enter(verbose, "Filtering")
  # Input file
  pathnameBAM <- getPathname(this)

  # Index file
  pathnameI <- sprintf("%s.bai", pathname)

  # Write to temporary file
  pathnameT <- pushTemporaryFile(pathname)
  pathnameIT <- sprintf("%s.bai", pathnameT)

  bam <- BamFile(pathnameBAM, yieldSize=yieldSize)

  if (progress) message("[0%] ", appendLF=FALSE)
  pathname2 <- filterBam(bam, destination=pathnameT, filter=filter, indexDestination=TRUE)
  keep <- NULL # Not needed anymore
  if (progress && cprogress < 100L) message(" [100%]", appendLF=TRUE)

  verbose && cat(verbose, "Created: ", pathname2)

  bam2 <- newInstance(this, pathname2)
  .stop_if_not(nbrOfReads(bam2) == n)

  # Undo temporary files
  file.rename(pathnameIT, pathnameI)
  pathname <- popTemporaryFile(pathnameT)

  verbose && exit(verbose)

  bam <- newInstance(this, pathname)
  verbose && print(verbose, bam)

  verbose && exit(verbose)

  bam
}, protected=TRUE)


setMethodS3("tview", "BamDataFile", function(this, reference=NULL, chromosome=NULL, position=1L, ..., verbose=FALSE) {
  # Argument 'reference':
  if (!is.null(reference)) {
    if (inherits(reference, "FastaReferenceFile")) {
      pathnameR <- getPathname(reference)
      pathnameR <- Arguments$getReadablePathname(pathnameR)
    } else {
      pathnameR <- Arguments$getReadablePathname(reference)
    }
  }

  # Argument 'chromosome' & 'position':
  if (!is.null(chromosome)) {
    chromosome <- Arguments$getCharacter(chromosome)
    position <- Arguments$getInteger(position, range=c(1L, Inf))
  }

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }

  verbose && enter(verbose, "Interactive display of BAM file using samtools tview")
  verbose && print(verbose, this)
  pathname <- getPathname(this)
  pathname <- Arguments$getReadablePathname(pathname)
  args <- list("tview", pathname)

  if (!is.null(reference)) {
    verbose && cat(verbose, "Using reference:")
    verbose && print(verbose, reference)
    args <- c(args, pathnameR)

    if (inherits(reference, "FastaReferenceFile")) {
      buildIndex(reference, verbose=verbose)
    }
  }

  # Initial genomic position
  if (!is.null(chromosome)) {
    pos <- sprintf("%s:%d", chromosome, position)
    verbose && cat(verbose, "Initial position: %s", pos)
    args <- c(args, sprintf("-p %s", pos))
  }

  # Additional options
  args <- c(args, list(...))

  verbose && enter(verbose, "Calling samtools tview")
  verbose && str(verbose, args)
  res <- do.call(systemSamtools, args=args)
  verbose && exit(verbose)

  verbose && exit(verbose)

  invisible(res)
})


setMethodS3("getFlagStat", "BamDataFile", function(this, ..., force=FALSE) {
  stats <- this$.flagStat

  if (force || is.null(stats)) {
    pathname <- getPathname(this)
    if (!hasIndex(this)) {
      throw("Cannot get index statistics. Index does not exists: ", pathname)
    }
    bfr <- systemSamtools("flagstat", shQuote(pathname), stdout=TRUE, ...)
    bfr <- trim(bfr)

    # Parse
    pattern <- "([0-9]+)[ ]*[+][ ]*([0-9]+)[ ]+(.*)"
    stats <- lapply(bfr, FUN=function(x) {
      counts <- c(gsub(pattern, "\\1", x), gsub(pattern, "\\2", x))
      counts <- as.integer(counts)
      description <- gsub(pattern, "\\3", x)
      data.frame(description=description, passed=counts[1], failed=counts[2], stringsAsFactors=FALSE)
    })
    stats <- Reduce(rbind, stats)

    this$.flagStat <- stats
  }

  stats
})


setMethodS3("isPaired", "BamDataFile", function(this, ..., force=FALSE) {
  isPaired <- this$.isPaired
  if (is.null(isPaired) || force) {
    bai <- buildIndex(this)
    isPaired <- testPairedEndBam(getPathname(this), index = getPathname(bai))
    this$.isPaired <- isPaired
  }
  isPaired
})
HenrikBengtsson/aroma.seq documentation built on Feb. 15, 2021, 2:21 a.m.