R/htseqCount.R

###########################################################################/**
# @RdocDefault htseqCount
#
# @title "Calls the htseq-count executable to count input reads on features"
#
# \description{
#  @get "title".
# }
#
# @synopsis
#
# \arguments{
#   \item{pathnameS}{An input BAM or SAM file containing aligned reads.}
#   \item{gff}{The gene feature file, in GFF/GTF format.}
#   \item{orderedBy}{A @character string specifying how the input file has
#    been sorted, if at all.}
#   \item{sortByName}{A @character string specifying when the BAM/SAM file
#    should be sorted by name.}
#   \item{optionsVec}{A named @character @vector of options to htseq-count.}
#   \item{...}{(Not used)}
#   \item{pathnameD}{(optional) destination file to save htseq-count output.}
#   \item{command}{A @character string specifying the name of the executable.}
#   \item{verbose}{See @see "R.utils::Verbose".}
# }
#
# \value{
#   Returns what @see "systemHTSeqCount" returns.
# }
#
# \section{Backward compatibility}{
#   \code{htseq-count} (< 0.6.0) requires (i) a SAM file as input
#   that (ii) is sorted by name.  However, this method will take care of
#   that internally, iff needed.  That is, it will created a temporary SAM
#   file that is sorted by query name before passing it to \code{htseq-count}.
# }
#
# \references{
#  [1] S Anders, TP Pyl, W Huber,
#      HTSeq - A Python framework to work with high-throughput sequencing data.
#      bioRxiv 2014. doi: 10.1101/002824.\cr
# }
#
# @author "TT,HB"
#
# @keyword internal
#*/###########################################################################
setMethodS3("htseqCount", "default", function(pathnameS, gff, orderedBy=c("none", "position", "name"),
                                              sortByName=c("always", "auto"),
                                              optionsVec=c('-s'='no', '-a'='10'),  ## Anders et al default
                                              ...,
                                              pathnameD=NULL, command='htseq-count', verbose=FALSE) {
  ## ( Support a call like this: "htseq-count -s no -a 10 lib_sn.sam gff > countFile")
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Local functions
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  R.utils::use("Rsamtools")

  isSAM <- function(pathname, ...) {
    (regexpr(".*[.]sam$", pathname, ignore.case=TRUE) != -1L)
  } # isSAM()


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'pathnameS'
  pathnameS <- Arguments$getReadablePathname(pathnameS)
  # Assert BAM or SAM file
  if (regexpr(".*[.](bam|sam)$", pathnameS, ignore.case=TRUE) == -1L) {
    throw("Not a BAM or SAM file: ", pathnameS)
  }

  # Argument 'gff'
  gff <- Arguments$getReadablePathname(gff)

  # Argument 'orderedBy':
  orderedBy <- match.arg(orderedBy)

  # Argument 'sortByName':
  sortByName <- match.arg(sortByName)

  # Count file
  if (is.null(pathnameD)) {
    pathnameD <- sub("[.](bam|sam)$", ".count", pathnameS, ignore.case=TRUE)
  }
  pathnameD <- Arguments$getWritablePathname(pathnameD, mustNotExist=TRUE)

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
     pushState(verbose)
     on.exit(popState(verbose), add=TRUE)
  }


  verbose && enter(verbose, "Running htseqCount()")

  # Log file
  pathnameL <- sprintf("%s.log", pathnameD)
  pathnameL <- Arguments$getWritablePathname(pathnameL, mustNotExist=TRUE)


  verbose && cat(verbose, "Input BAM/SAM file: ", sQuote(pathnameS))
  verbose && cat(verbose, "Genome transcript file: ", sQuote(gff))
  verbose && cat(verbose, "Output count file: ", sQuote(pathnameD))
  verbose && cat(verbose, "Output log file: ", sQuote(pathnameL))
  verbose && cat(verbose, "htseq-count executable: ", command)

  # Locates the htseq-count executable
  bin <- findHTSeq(command=command, verbose=less(verbose, 50))
  verbose && cat(verbose, "Executable: ", bin)
  ver <- attr(bin, "version")
  ver <- numeric_version(ver)
  verbose && cat(verbose, "Version: ", ver)
  if (is.na(ver)) ver <- numeric_version("0.0.0")


  # BACKWARD COMPATIBILITY for htseq-count (< 0.6.0)
  convertToSam <- "auto"
  if (convertToSam == "auto") {
    if (ver < "0.6.0") {
      verbose && cat(verbose, "Backward compatibility: Detected htseq-count (< 0.6.0): Will convert to SAM [EXPENSIVE WORKAROUND]")
      convertToSam <- "always"
    } else {
      convertToSam <- "never"
    }
  }
  convertToSam <- match.arg(convertToSam, c("always", "never"))

  # BACKWARD COMPATIBILITY for htseq-count (< 0.6.0)
  if (sortByName == "auto") {
    if (orderedBy == "name") {
      sortByName <- "never"
    } else if (ver < "0.6.0") {
      verbose && cat(verbose, "Backward compatibility: Detected htseq-count (< 0.6.0): Will sort BAM/SAM by name [EXPENSIVE WORKAROUND]")
      sortByName <- "always"
    } else {
      if (orderedBy == "none") {
        sortByName <- "always"
      } else {
        sortByName <- "never"
      }
    }
  }
  sortByName <- match.arg(sortByName, c("always", "never"))


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Sort by query name?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (sortByName == "always") {
    verbose && enter(verbose, "Making sure to provide HTSeq with a BAM/SAM file sorted by query name [EXPENSIVE WORKAROUND]")

    # (a) Create/setup BAM input for sorting, iff SAM file is passed
    if (isSAM(pathnameS)) {
      verbose && enter(verbose, "Converting BAM to SAM")
      pathnameBAM <- sub("[.]sam$", ",tmp.bam", pathnameS, ignore.case=TRUE)
      pathnameBAM <- Arguments$getWritablePathname(pathnameBAM, mustNotExist=TRUE)
      on.exit({
        if (isFile(pathnameBAM)) file.remove(pathnameBAM)
      }, add=TRUE)
      # Convert SAM-to-BAM
      asSam(pathnameS, pathnameBAM)
      # Sanity check
      pathnameBAM <- Arguments$getReadablePathname(pathnameBAM)
      verbose && cat(verbose, "Temporary BAM file: ", pathnameBAM)
      # Update input pathname
      pathnameS <- pathnameBAM
      verbose && exit(verbose)
    } else {
      pathnameBAM <- NULL
    }

    # (b) Sort input BAM by name
    verbose && enter(verbose, "Sorting BAM by query name")
    pathnameBAMs <- sub("(|,tmp)[.]bam$", ",byName,tmp.bam", pathnameS)
    pathnameBAMs <- Arguments$getWritablePathname(pathnameBAMs, mustNotExist=TRUE)
    on.exit({
      if (isFile(pathnameBAMs)) file.remove(pathnameBAMs)
    }, add=TRUE)
    # Sort BAM
    sortBam(pathnameS, sub("[.]bam$", "", pathnameBAMs), byQname=TRUE)
    # Sanity check
    pathnameBAMs <- Arguments$getReadablePathname(pathnameBAMs)
    verbose && cat(verbose, "Sorted temporary BAM file: ", pathnameBAMs)
    # Remove temporary non-sorted BAM file
    if (isFile(pathnameBAM)) file.remove(pathnameBAM)
    # Update input pathname
    pathnameS <- pathnameBAMs
    orderedBy <- "name"
    verbose && exit(verbose)

    verbose && exit(verbose)
  } else {
    pathnameBAMs <- NULL
  } # if (sortByName == "always")

  # Sanity check
  pathnameS <- Arguments$getReadablePathname(pathnameS)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Convert BAM to SAM?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (convertToSam == "always" && !isSAM(pathnameS)) {
    verbose && enter(verbose, "Converting sorted BAM to SAM")

    pathnameSAMs <- sub("[.]bam$", ".sam", pathnameS)
    pathnameSAMs <- Arguments$getWritablePathname(pathnameSAMs)
    on.exit({
      if (isFile(pathnameSAMs)) file.remove(pathnameSAMs)
    }, add=TRUE)
    # Convert BAM-to-SAM
    samtoolsView(pathnameS, pathnameSAMs)
    # Sanity checks
    pathnameSAMs <- Arguments$getReadablePathname(pathnameSAMs)
    verbose && cat(verbose, "Sorted temporary SAM file: ", pathnameSAMs)
    # Update input pathname
    pathnameS <- pathnameBAMs

    # Remove temporary sorted BAM file
    if (isFile(pathnameBAMs)) file.remove(pathnameBAMs)

    verbose && exit(verbose)
  }

  # Sanity check
  pathnameS <- Arguments$getReadablePathname(pathnameS)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Call htseq-count
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Calling systemHTSeqCount()")
  opts <- NULL
  if (ver >= "0.6.0") {
    # SAM or BAM format?
    if (isSAM(pathnameS)) {
      opts <- c(opts, "--format=sam")
    } else {
      opts <- c(opts, "--format=bam")
    }

    # Ordered by position or by name?
    if (orderedBy == "position") {
      opts <- c(opts, "--order=pos")
    } else if (orderedBy == "name") {
      opts <- c(opts, "--order=name")
    } else {
      throw("Invalid value on argument 'orderedBy': ", orderedBy)
    }
  } # if (ver >= "0.6.0")

  # Add user options
  opts <- c(opts, optionsVec)

  # All htseq-count arguments including pathnames
  args <- c(opts, shQuote(pathnameS), shQuote(gff))

  verbose && cat(verbose, "Arguments:")
  verbose && str(verbose, args)

  # htseq-count sends results to standard output, so make sure
  # to redirect to output file.

  # Write results and log to temporary files
  pathnameDT <- pushTemporaryFile(pathnameD, verbose=verbose)
  pathnameLT <- pushTemporaryFile(pathnameL, verbose=verbose)

  status <- systemHTSeqCount(..., args=args, stdout=pathnameDT, stderr=pathnameLT, command=command, verbose=less(verbose, 1))
  verbose && cat(verbose, "Exit status: ", status)

  # Renaming temporary log file
  pathnameL <- popTemporaryFile(pathnameLT, verbose=verbose)
  pathnameL <- Arguments$getReadablePathname(pathnameL)


  # Was there a non-zero exit status?
  if (is.numeric(status) && status != 0L) {
    verbose && enter(verbose, "Non-zero exit status: ", status)

    # Remove empty count file
    fi <- file.info(pathnameDT)
    if (fi$size == 0L) {
      file.remove(pathnameDT)
      verbose && cat(verbose, "Removed empty result file: ", pathnameDT)
      pathnameDT <- NULL
    }

    # Parse log file
    log <- readLines(pathnameL)

    # Did an error occur?  Then throw an R error
    idx <- grep("^Error", log, value=FALSE, ignore.case=TRUE)[1L]
    if (!is.na(idx)) {
      msg <- log[seq(from=idx,to=length(log))]
      msg <- paste(msg, collapse="\n")
      msg <- sprintf("Error detected while running %s (v%s) on %s [exit code %d]: %s", command, ver, shQuote(pathnameS), status, msg)
      throw(msg)
    }

    # Was a warning generated?  The translate it into an R warning
    idx <- grep("^Warning", log, value=FALSE, ignore.case=TRUE)[1L]
    if (!is.na(idx)) {
      msg <- log[seq(from=idx,to=length(log))]
      msg <- paste(msg, collapse="\n")
      msg <- sprintf("Warning detected while running %s (v%s) on %s [exit code %d]: %s", command, ver, shQuote(pathnameS), status, msg)
      warning(msg)
    }
    verbose && exit(verbose)
  }

  # Renaming temporary count file
  if (!is.null(pathnameDT)) {
    pathnameD <- popTemporaryFile(pathnameDT, verbose=verbose)
    pathnameD <- Arguments$getReadablePathname(pathnameD)
  }

  res <- character(0L)
  attr(res, "status") <- status
  attr(res, "countFile") <- pathnameD
  attr(res, "logFile") <- pathnameL

  verbose && exit(verbose)

  verbose && exit(verbose)

  res
}) # htseqCount()
HenrikBengtsson/aroma.seq documentation built on Feb. 15, 2021, 2:21 a.m.