R/FastQCReporter.R

###########################################################################/**
# @RdocClass FastQCReporter
#
# @title "The FastQCReporter class"
#
# \description{
#  @classhierarchy
#
#  A FastQCReporter takes a @see "FastqDataSet" as input, possibly groups
#  the samples, and generates FastQC [1] reports.
#  How the grouping is done, can be controlled by a parameter.
# }
#
# @synopsis
#
# \arguments{
#  \item{dataSet}{An @see "FastqDataSet".}
#  \item{groupBy}{A @character string or an explicit named @list,
#   specifying which input files should be processed together.}
#  \item{...}{Additional arguments passed to @see "AromaSeqTransform".}
#  \item{.className}{A @character string specifying what class of
#   data sets to accept.}
# }
#
# \section{Fields and Methods}{
#  @allmethods "public"
# }
#
# \section{Supported operating systems}{
#   This method is available on Linux, macOS, and Windows [1].
# }
#
# @author HB
#
# \references{
#  [1] Simon Andrews,
#      FastQC - A quality control tool for high throughput sequence data,
#      March 2014.
#      \url{http://www.bioinformatics.babraham.ac.uk/projects/fastqc/}
# }
#
# \seealso{
#   Internally @see "Rsamtools::mergeBam" is used for merging, sorting,
#   and indexing.
# }
#*/###########################################################################
setConstructorS3("FastQCReporter", function(dataSet=NULL, groupBy=NULL, ..., .className="FastqDataSet") {
  if (!is.null(dataSet)) {
    # Argument 'groupBy':
    if (is.character(groupBy)) {
      groupBy <- match.arg(groupBy, choices=c("name"))
    } else if (is.list(groupBy)) {
      # Validated below
    } else {
      throw("Invalid argument 'groupBy': ", mode(groupBy))
    }
  }

  this <- extend(AromaSeqTransform(dataSet, groupBy=groupBy, ..., .className=.className), c("FastQCReporter", uses("FileGroupsInterface")))

  # Argument 'groupBy':
  if (is.list(groupBy)) {
    validateGroups(this, groups=groupBy)
  }

  this
})

setMethodS3("getRootPath", "FastQCReporter", function(this, ...) {
  "reports"
}, protected=TRUE)

setMethodS3("getAsteriskTags", "FastQCReporter", function(this, ...) {
  # The 'groupBy' tag
  params <- getParameters(this)
  groupBy <- params$groupBy
  if (is.character(groupBy)) {
    groupBy <- sprintf("by%s", capitalize(groupBy))
  } else {
    # Generate a short checksum
    groupBy <- getChecksum(groupBy, algo="crc32")
  }

  c("FastQC", groupBy)
}, protected=TRUE)


setMethodS3("getOutputDataSet", "FastQCReporter", function(this, onMissing=c("drop", "NA", "error"), ...) {
  # Argument 'onMissing':
  onMissing <- match.arg(onMissing)


  ## Find all existing output data files
  path <- getPath(this)
  fqcs <- FastQCDataFileSet$byPath(path, ...)

  ## Order according to grouped input data set
  names <- getGroupNames(this)
  fqcs <- extract(fqcs, names, onMissing=onMissing)

  fqcs
}) # getOutputDataSet() for FastQCReporter



setMethodS3("process", "FastQCReporter", function(this, ..., skip=TRUE, force=FALSE, verbose=FALSE) {
  # Requirements
  .stop_if_not(isCapableOf(aroma.seq, "fastqc"))

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'force':
  force <- Arguments$getLogical(force)

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }


  verbose && enter(verbose, "Generating FastQC reports")
  ds <- getInputDataSet(this)
  verbose && cat(verbose, "Input data set:")
  verbose && print(verbose, ds)

  groups <- getGroups(this)
  verbose && printf(verbose, "Merging into %d groups: %s\n", length(groups), hpaste(names(groups)))
  verbose && str(verbose, head(groups))

  if (force) {
    todo <- seq_along(groups)
  } else {
    todo <- findFilesTodo(this, verbose=less(verbose, 1))
    # Already done?
    if (length(todo) == 0L) {
      verbose && cat(verbose, "Already done. Skipping.")
      res <- getOutputDataSet(this, onMissing="error", verbose=less(verbose, 1))
      verbose && exit(verbose)
      return(invisible(res))
    }
  }

  verbose && cat(verbose, "Number of files: ", length(ds))
  verbose && cat(verbose, "Number of groups: ", length(groups))

  path <- getPath(this)
  verbose && cat(verbose, "Output path: ", path)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Apply FastQC reporting to each group of FASTQ files
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  IDXS <- groups[todo]

  res <- listenv()
  for (gg in seq_along(IDXS)) {
    idxs <- IDXS[[gg]]
    dfList <- as.list(ds[idxs])
    name <- names(IDXS)[gg]

    verbose && enter(verbose, sprintf("Generating FastQC report for sample #%d ('%s') of %d", gg, name, length(IDXS)))

    verbose && cat(verbose, "Number of FASTQ files to be analyzed: ", length(dfList))

    pathnames <- unlist(sapply(dfList, FUN=getPathname), use.names=FALSE)
    name <- names(dfList)[1L]
    pathD <- file.path(path, name)
    filenameD <- "fastqc_data.txt"
    filenameD <- Arguments$getFilename(filenameD)
    pathnameD <- file.path(pathD, filenameD)
    verbose && cat(verbose, "Output file: ", pathnameD)

    # Already done?
    if (!force && isFile(pathnameD)) {
      verbose && cat(verbose, "Already processed. Skipping.")
      res[[gg]] <- pathnameD
      verbose && exit(verbose)
      next
    }


    res[[gg]] %<-% {
      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      # BEGIN: ATOMIC PROCESSING
      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      verbose && enter(verbose, "Atomic processing")
      pathDT <- sprintf("%s.tmp", pathD)
      verbose && cat(verbose, "Temporary output path: ", pathDT)

      # FIXME: Make this method smarter about passing the --casava option
      # From http://seqanswers.com/forums/showpost.php?p=58152&postcount=200:
      # "Those names don't look like the names generated by Casava. According
      #  to the docs I've got the fastq file names should follow the pattern:
      #
      #  <sample name>_<barcode sequence>_L<lane (0-padded to 3 digits)>_R<read number>_<set number (0-padded to 3 digits>.fastq.gz
      #
      # Which is what FastQC looks for. ..."
      opts <- "--casava"
      verbose && cat(verbose, "FastQC options: ", paste(opts, collapse=", "))
      res <- fastQC(pathnames=pathnames, outPath=pathDT, opts, verbose=less(verbose, 10))
      verbose && cat(verbose, "Output log:")
      verbose && print(verbose, res)

      # Sanity check
      .stop_if_not(isDirectory(pathDT))


      # FIXME: Do we need to pool here?  Does fastQC() return before output
      # directory is created/available on the file system? /HB 2014-07-18.

      # Scan for output files or output directories (depending on version)

      # (a) FastQC (>= 0.11.1; 2014-06-02)
      #     output is now a self-contained HTML file
      filesT <- list.files(path=pathDT, pattern="_fastqc.html$",
                           ignore.case=TRUE, full.names=FALSE)
      if (length(filesT) > 0L) {
        # Sanity check
        .stop_if_not(length(filesT) == 1L)

        # FastQC (>= 0.11.1)
        # Unzip *_fastqc.zip file
        filenameDTZ <- gsub(".html", ".zip", filesT, fixed=TRUE)
        pathnameDTZ <- Arguments$getReadablePathname(filenameDTZ, path=pathDT)

        dirT <- gsub(".html", "", filesT, fixed=TRUE)
        pathDT <- Arguments$getWritablePath(pathDT)
        unzip(pathnameDTZ, exdir=pathDT)

        # CLEANUP: Remove self-contained HTML files
        file.remove(file.path(pathDT, filesT))
      } else {
        # (b) FastQC (<= 0.10.1; 2012-05-03)
        #     output is a multi-file directory
        dirT <- list.files(path=pathDT, pattern="_fastqc$",
                           ignore.case=TRUE, full.names=FALSE)
        if (length(dirT) == 0L) {
          throw(sprintf("Directory %s contains neither a *_fastqc.html file nor a *_fastqc/ directory: %s", sQuote(pathDT), paste(sQuote(list.files(path=pathDT, full.names=FALSE)), collapse=", ")))
        }
      }

      # Sanity check
      .stop_if_not(length(dirT) > 0L)
      .stop_if_not(length(dirT) == 1L)

      pathT <- file.path(pathDT, dirT)
      .stop_if_not(isDirectory(pathT))

      # Sanity check
      pathnameDT <- file.path(pathT, filenameD)
      Arguments$getReadablePathname(pathnameDT)

      # CLEANUP
      filenameDTZ <- sprintf("%s.zip", dirT)
      pathnameDTZ <- file.path(pathDT, filenameDTZ)
      if (isFile(pathnameDTZ)) {
        file.remove(pathnameDTZ)
      }

      # Move output subdirectory
      file.rename(pathT, pathD)

      # Sanity check
      .stop_if_not(!isDirectory(pathT))

      # CLEANUP
      removeDirectory(pathDT)

      # Sanity check
      .stop_if_not(isDirectory(pathD))
      .stop_if_not(!isDirectory(pathDT))

      verbose && exit(verbose)
      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      # END: ATOMIC PROCESSING
      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

      verbose && cat(verbose, "Generated FASTQ report: ", pathD)
      # Sanity checks
      pathD <- Arguments$getReadablePath(pathD)
      pathnameD <- Arguments$getReadablePathname(pathnameD)

      verbose && exit(verbose)

      pathnameD
    } ## %<-%
  } ## for (gg ...)

  ## Resolve futures
  res <- resolve(res)

  res <- getOutputDataSet(this, onMissing="error", verbose=less(verbose, 50))

  verbose && exit(verbose)

  invisible(res)
}) # process()



# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# TO DROP
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
setMethodS3("validateGroups", "FastQCReporter", function(this, groups, ...) {
  # Input data set
  ds <- getInputDataSet(this)
  nbrOfFiles <- length(ds)

  # Sanity checks
  idxs <- unlist(groups, use.names=FALSE)
  idxs <- Arguments$getIndices(idxs, max=nbrOfFiles)
  if (length(idxs) < nbrOfFiles) {
    throw("One or more input FASTQ files is not part of any group.")
  } else if (length(idxs) > nbrOfFiles) {
    throw("One or more input FASTQ files is part of more than one group.")
  }

  if (is.null(names(groups))) {
    throw("The list of groups does not have names.")
  }

  invisible(groups)
}, protected=TRUE)
HenrikBengtsson/aroma.seq documentation built on Feb. 15, 2021, 2:21 a.m.