R/segmentByCBS.R

###########################################################################/**
# @RdocDefault segmentByCBS
# @alias segmentByCBS.data.frame
# @alias segmentByCBS.CBS
# @alias segmentByCBS.CNA
# @alias segmentByCBS
#
# @title "Segment genomic signals using the CBS method"
#
# \description{
#  @get "title" of the \pkg{DNAcopy} package.
#  This is a convenient low-level wrapper for the \code{DNAcopy::segment()}
#  method.  It is intended to be applied to a sample at the time.
#  For more details on the Circular Binary Segmentation (CBS) method
#  see [1,2].
# }
#
# @synopsis
#
# \arguments{
#   \item{y}{A @numeric @vector of J genomic signals to be segmented.}
#   \item{chromosome}{Optional @numeric @vector of length J, specifying
#       the chromosome of each loci.  If a scalar, it is expanded to
#       a vector of length J.}
#   \item{x}{Optional @numeric @vector of J genomic locations.
#            If @NULL, index locations \code{1:J} are used.}
#   \item{index}{An optional @integer @vector of length J specifying
#     the genomewide indices of the loci.}
#   \item{w}{Optional @numeric @vector in [0,1] of J weights.}
#   \item{undo}{A non-negative @numeric.  If greater than zero, then
#       arguments \code{undo.splits="sdundo"} and \code{undo.SD=undo}
#       are passed to \code{DNAcopy::segment()}.
#       In the special case when \code{undo} is +@Inf, the segmentation
#       result will not contain any changepoints (in addition to what
#       is specified by argument \code{knownSegments}).}
#   \item{avg}{A @character string specifying how to calculating
#         segment mean levels \emph{after} change points have been
#         identified.}
#   \item{...}{Additional arguments passed to the \code{DNAcopy::segment()}
#       segmentation function.}
#   \item{joinSegments}{If @TRUE, there are no gaps between neighboring
#     segments.
#     If @FALSE, the boundaries of a segment are defined by the support
#     that the loci in the segments provides, i.e. there exist a locus
#     at each end point of each segment.  This also means that there
#     is a gap between any neighboring segments, unless the change point
#     is in the middle of multiple loci with the same position.
#     The latter is what \code{DNAcopy::segment()} returns.
#   }
#   \item{knownSegments}{Optional @data.frame specifying
#     \emph{non-overlapping} known segments.  These segments must
#     not share loci.  See @see "findLargeGaps" and @see "gapsToSegments".}
#   \item{seed}{An (optional) @integer specifying the random seed to be
#     set before calling the segmentation method.  The random seed is
#     set to its original state when exiting.  If @NULL, it is not set.}
#   \item{verbose}{See @see "R.utils::Verbose".}
# }
#
# \value{
#   Returns a @see "CBS" object.
# }
#
# \details{
#   Internally @see "DNAcopy::segment" of \pkg{DNAcopy} is used to
#   segment the signals.
#   This segmentation method support weighted segmentation.
# }
#
# \section{Reproducibility}{
#   The \code{DNAcopy::segment()} implementation of CBS uses approximation
#   through random sampling for some estimates.  Because of this,
#   repeated calls using the same signals may result in slightly
#   different results, unless the random seed is set/fixed.
# }
#
# \section{Missing and non-finite values}{
#   Signals may contain missing values (@NA or @NaN), but not
#   infinite values (+/-@Inf).  Loci with missing-value signals
#   are preserved and keep in the result.
#
#   Likewise, genomic positions may contain missing values.
#   However, if they do, such loci are silently excluded before
#   performing the segmentation, and are not kept in the results.
#   The mapping between the input locus-level data and ditto of
#   the result can be inferred from the \code{index} column of
#   the locus-level data of the result.
#
#   None of the input data may have infinite values,
#   i.e. -@Inf or +@Inf. If so, an informative error is thrown.
# }
#
# \examples{
#   @include "../incl/segmentByCBS.Rex"
#   @include "../incl/segmentByCBS,plot.Rex"
#   @include "../incl/segmentByCBS,tests.Rex"
# }
#
# @author "HB"
#
# \references{
#  [1] @include "../incl/OlshenVenkatraman_2004.Rd" \cr
#  [2] @include "../incl/VenkatramanOlshen_2007.Rd" \cr
# }
#
# \seealso{
#   To segment allele-specific tumor copy-number signals from a tumor
#   \emph{with} a matched normal, see @see "segmentByPairedPSCBS".
#   For the same \emph{without} a matched normal,
#   see @see "segmentByNonPairedPSCBS".
#
#   It is also possible to prune change points after segmentation (with
#   identical results) using
#   \code{\link[PSCBS:pruneBySdUndo.CBS]{pruneBySdUndo}()}.
# }
# @keyword IO
#*/###########################################################################
setMethodS3("segmentByCBS", "default", function(y, chromosome=0L, x=NULL, index=seq_along(y), w=NULL, undo=0, avg=c("mean", "median"), ..., joinSegments=TRUE, knownSegments=NULL, seed=NULL, verbose=FALSE) {
  R_SANITY_CHECK <- TRUE

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Local functions
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # DNAcopy::getbdry() is slow for now default settings.  Below we
  # implement a memoized version of this function.
  getbdry2 <- function(eta, nperm, alpha, tol=0.01, verbose=FALSE) {
    # Explictly setup cache root here, since it's only done by 'R.cache'
    # if that package is attached.  Here we only load it. /HB 2013-09-27
    .setupCacheRootPath()

    key <- list(method="segmentByCBS",
                eta=eta, nperm=as.integer(nperm), alpha=alpha, tol=tol,
                version="0.16.1")
    dirs <- c("PSCBS", "segmentByCBS", "sbdry")
    bdry <- loadCache(key=key, dirs=dirs)
    if (!is.null(bdry)) return(bdry)

    max.ones <- floor(nperm * alpha) + 1L
    bdry <- getbdry(eta=eta, nperm=nperm, max.ones=max.ones, tol=tol)

    saveCache(bdry, key=key, dirs=dirs)

    bdry
  } # getbdry2()


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'y':
  disallow <- c("Inf")
  y <- Arguments$getDoubles(y, disallow=disallow)
  nbrOfLoci <- length(y)

  length2 <- rep(nbrOfLoci, times=2)

  # Argument 'chromosome':
  if (is.null(chromosome)) {
    chromosome <- 0L
  } else {
    disallow <- c("Inf")
    chromosome <- Arguments$getIntegers(chromosome, range=c(0,Inf), disallow=disallow)
    if (length(chromosome) > 1) {
      chromosome <- Arguments$getIntegers(chromosome, length=length2, disallow=disallow)
  ##    # If 'chromosome' is a vector of length J, then it must contain
  ##    # a unique chromosome.
  ##    chromosomes <- sort(unique(chromosome))
  ##    if (length(chromosomes) > 1) {
  ##      stop("Argument 'chromosome' specifies more than one unique chromosome: ", paste(seqToHumanReadable(chromosomes), collapse=", "))
  ##    }
  ##    chromosome <- chromosomes
    }
  }

  # For future usage
  chrom <- rep(chromosome, length.out=nbrOfLoci)

  # Argument 'x':
  if (is.null(x)) {
    x <- seq_len(nbrOfLoci)
  } else {
    disallow <- c("Inf")
    x <- Arguments$getDoubles(x, length=length2, disallow=disallow)
  }

  # Argument 'index':
  if (is.null(index)) {
    index <- seq_along(y)
  } else {
    index <- Arguments$getIndices(index)
  }

  # Argument 'w':
  hasWeights <- !is.null(w)
  if (hasWeights) {
    disallow <- c("NA", "NaN", "Inf")
    w <- Arguments$getDoubles(w, range=c(0,1), length=length2, disallow=disallow)
  }

  # Argument 'undo':
  undo <- Arguments$getDouble(undo, range=c(0,Inf))

  # Argument 'avg':
  avg <- match.arg(avg)

  # Argument 'cpFlavor':
  joinSegments <- Arguments$getLogical(joinSegments)

  # Argument 'knownSegments':
  if (is.null(knownSegments)) {
    knownSegments <- data.frame(chromosome=integer(0), start=integer(0), end=integer(0))
  } else {
#    if (!joinSegments) {
#      stop("Argument 'knownSegments' should only be specified if argument 'joinSegments' is TRUE.")
#    }
  }

  if (!is.data.frame(knownSegments)) {
    stop("Argument 'knownSegments' is not a data.frame: ", class(knownSegments)[1])
  }

  if (!all(is.element(c("chromosome", "start", "end"), colnames(knownSegments)))) {
    stop("Argument 'knownSegments' does not have the required column names: ", hpaste(colnames(knownSegments)))
  }

  # Detailed validation of 'knownSegments'.
  for (chr in sort(unique(knownSegments$chromosome))) {
    dd <- subset(knownSegments, chromosome == chr)

    # Order segments by 'start'.
    o <- order(dd$start)
    dd <- dd[o,]

    # Known segments must not share 'start' or 'end' loci
    for (field in c("start", "end")) {
      xs <- dd[[field]]
      xs <- xs[!is.na(xs)]
      if (anyDuplicated(xs) > 0) {
        print(knownSegments)
        stop(sprintf("Detected segments on chromosome %s with non-unique '%s' positions in argument 'knownSegments'", chr, field))
      }
    } # for (field ...)

    # Known segments must not overlap
    if (!all(dd$start[-1] >= dd$end[-nrow(dd)], na.rm=TRUE)) {
      print(knownSegments)
      stop("Detected overlapping segments on chromosome ", chr, " in argument 'knownSegments'.")
    }
  }

  # Argument 'seed':
  if (!is.null(seed)) {
    seed <- Arguments$getIntegers(seed)
  }

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }


  verbose && enter(verbose, "Segmenting by CBS")

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Setup data
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Setup up data", level=-10)
  data <- data.frame(chrom=chrom, x=x, y=y, index=index)
  if (hasWeights) {
    verbose && cat(verbose, "Adding locus-specific weights", level=-10)
    data$w <- w
  }
  verbose && str(verbose, data, level=-10)
  # Not needed anymore
  chrom <- x <- index <- y <- w <- NULL
  verbose && exit(verbose)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Drop data points without known genomic positions, because that
  # is what DNAcopy::CNA() will do otherwise.
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  ok <- (!is.na(data$chrom) & !is.na(data$x))
  if (any(!ok)) {
    verbose && enter(verbose, "Dropping loci with unknown locations", level=-10)
    verbose && cat(verbose, "Number of loci dropped: ", sum(!ok), level=-10)
    data <- data[ok,,drop=FALSE]
    verbose && exit(verbose)
  }
  ok <- NULL # Not needed anymore


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Reorder data points along the genome, because that is what
  # DNAcopy::segment() will return.  At the end, we will undo
  # the sort such that the returned 'data' object is always in
  # the same order and number of loci as the input data.
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Ordering data along genome", level=-50)
  o <- order(data$chrom, data$x, decreasing=FALSE, na.last=TRUE)
  # Any change?
  if (any(o != seq_along(o))) {
    data <- data[o,,drop=FALSE]
  }
  o <- NULL # Not needed anymore
  verbose && str(verbose, data, level=-50)
  verbose && exit(verbose)

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Multiple chromosomes?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Identify all chromosomes, excluding missing values
  chromosomes <- sort(unique(data$chrom), na.last=NA)
  nbrOfChromosomes <- length(chromosomes)
  if (nbrOfChromosomes > 1) {
    verbose && enter(verbose, "Segmenting multiple chromosomes")
    verbose && cat(verbose, "Number of chromosomes: ", nbrOfChromosomes)

    # Generate random seeds?
    seeds <- NULL
    if (!is.null(seed)) {
      randomSeed("set", seed=seed, kind="L'Ecuyer-CMRG")
      on.exit(randomSeed("reset"), add=TRUE)
      verbose && printf(verbose, "Random seed temporarily set (seed=c(%s), kind=\"L'Ecuyer-CMRG\")\n", paste(seed, collapse=", "))
      seeds <- randomSeed("advance", n=nbrOfChromosomes)
      verbose && printf(verbose, "Produced %d seeds from this stream for future usage\n", length(seeds))
    }

    fitList <- listenv()
    for (kk in seq_len(nbrOfChromosomes)) {
      chromosomeKK <- chromosomes[kk]
      chrTag <- sprintf("Chr%02d", chromosomeKK)
      verbose && enter(verbose, sprintf("Chromosome #%d ('%s') of %d", kk, chrTag, nbrOfChromosomes))

      seedKK <- seeds[[kk]]

      # Extract subset of data and parameters for this chromosome
      dataKK <- subset(data, chrom == chromosomeKK)
      verbose && str(verbose, dataKK, level=-10)
      chrom <- x <- index <- y <- w <- NULL
      fields <- attachLocally(dataKK, fields=c("chrom", "x", "index", "y", "w"))
      dataKK <- NULL # Not needed anymore

      knownSegmentsKK <- NULL
      if (!is.null(knownSegments)) {
        knownSegmentsKK <- subset(knownSegments, chromosome == chromosomeKK)
        if (nrow(knownSegmentsKK) == 0L) {
          knownSegmentsKK <- data.frame(chromosome=chromosomeKK, start=-Inf, end=+Inf)
        }
        verbose && cat(verbose, "Known segments:", level=-5)
        verbose && print(verbose, knownSegmentsKK, level=-5)
      }

      fitList[[chrTag]] %<-% {
        fit <- segmentByCBS(y=y,
                  chromosome=chrom, x=x,
                  w=w,
                  index=index,
                  undo=undo,
                  avg=avg,
                  joinSegments=joinSegments,
                  knownSegments=knownSegmentsKK,
                  ...,
                  seed=seedKK,
                  verbose=verbose)

        # Sanity checks
        if (R_SANITY_CHECK) {
          if (nrow(knownSegmentsKK) == 0) {
            # Since all missing data have been dropped...
            .stop_if_not(nrow(fit$data) == length(y))
            # ...and ordered along the genome already.
            .stop_if_not(all.equal(fit$data$y, y))
          }

          # Assert weights were used
          .stop_if_not(!hasWeights || !is.null(fit$data$w))
        } # if (R_SANITY_CHECK)

        verbose && print(verbose, head(as.data.frame(fit)), level=-10)
        verbose && print(verbose, tail(as.data.frame(fit)), level=-10)

        fit
      } %seed% TRUE %label% sprintf("segmentByCBS-%s", chrTag)  ## fitList[[chrTag]] <- ...

      rm(list=fields) # Not needed anymore
      verbose && exit(verbose)
    } # for (kk ...)

    verbose && enter(verbose, "Merging (independently) segmented chromosome", level=-50)
    fitList <- as.list(fitList)
    ## former Reduce() w/ append(..., addSplit = TRUE)
    fit <- do.call(c, args = c(fitList, addSplit = TRUE))
    # Not needed anymore
    fitList <- NULL

    # Update parameters that otherwise may be incorrect
    fit$params$seed <- seed

    verbose && str(verbose, fit, level=-10)
    verbose && exit(verbose)

    segs <- as.data.frame(fit)
    if (nrow(segs) < 6) {
      verbose && print(verbose, segs, level=-10)
    } else {
      verbose && print(verbose, head(segs), level=-10)
      verbose && print(verbose, tail(segs), level=-10)
    }

    verbose && exit(verbose)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Return results
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    return(fit)
  } # if (nbrOfChromosomes > 1)

  verbose && cat(verbose, "Chromosome: ", data$chrom[1L])

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Subset 'knownSegments'
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Keeping only current chromosome for 'knownSegments'", level=-10)

  # Assume no missing values
  currChromosome <- data$chrom[1]
  verbose && cat(verbose, "Chromosome: ", currChromosome, level=-10)

  knownSegments <- subset(knownSegments, chromosome == currChromosome)
  if (nrow(knownSegments) == 0L) {
    knownSegments <- data.frame(chromosome=currChromosome, start=-Inf, end=+Inf)
  }
  nbrOfSegments <- nrow(knownSegments)

  verbose && cat(verbose, "Known segments for this chromosome:", level=-10)
  verbose && print(verbose, knownSegments, level=-10)

  verbose && exit(verbose)


  # Sanity checks
  if (R_SANITY_CHECK) {
    # Here 'knownSegments' should specify at most a single chromosome
    uChromosomes <- sort(unique(knownSegments$chromosome))
    if (length(uChromosomes) > 1) {
      stop("INTERNAL ERROR: Argument 'knownSegments' specifies more than one chromosome: ", hpaste(uChromosomes))
    }
  } # if (R_SANITY_CHECK)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Multiple segments?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Sanity check of limitation  /HB 2011-10-19
  if (nbrOfSegments > 1) {
    verbose && enter(verbose, "Segmenting multiple segments on current chromosome", level=-5)
    verbose && cat(verbose, "Number of segments: ", nbrOfSegments, level=-5)

    # Create a splitter-only CBS object
    dataS <- data.frame(y=c(0,0), chromosome=c(1,2), x=c(0,0))
    if (hasWeights) dataS$w <- 1
    splitter <- segmentByCBS(dataS)
    dataS <- NULL
    suppressWarnings({
      splitter <- extractSegment(splitter, 2)
      # Sanity check
      if (R_SANITY_CHECK) {
        .stop_if_not(nbrOfSegments(splitter, splitters=TRUE) == 1)
      } # if (R_SANITY_CHECK)
    })


    # Generate random seeds?
    seeds <- NULL
    if (!is.null(seed)) {
      randomSeed("set", seed=seed, kind="L'Ecuyer-CMRG")
      on.exit(randomSeed("reset"), add=TRUE)
      verbose && printf(verbose, "Random seed temporarily set (seed=c(%s), kind=\"L'Ecuyer-CMRG\")\n", paste(seed, collapse=", "))
      seeds <- randomSeed("advance", n=nbrOfSegments)
      verbose && printf(verbose, "Produced %d seeds from this stream for future usage\n", length(seeds))
    }

    fitList <- listenv()
    for (jj in seq_len(nbrOfSegments)) {
      seg <- knownSegments[jj,]
      chromosomeJJ <- seg$chromosome
      xStart <- seg$start
      xEnd <- seg$end
      segTag <- sprintf("chr%s:(%s,%s)", chromosomeJJ, xStart, xEnd)
      verbose && enter(verbose, sprintf("Segment #%d ('%s') of %d", jj, segTag, nbrOfSegments), level=-10)

      ## Nothing to do?
      isSplitter <- (is.na(xStart) && is.na(xEnd))
      if (isSplitter) {
        fit <- splitter
        verbose && cat(verbose, "Nothing to segment. Inserting an explicit splitter.", level=-10)
        fitList[[segTag]] <- fit
        fit <- NULL
        verbose && exit(verbose)
        next
      }

      # Extract subset of data and parameters for this segment
      dataJJ <- subset(data, chrom == chromosomeJJ & xStart <= x & x <= xEnd)
      verbose && str(verbose, dataJJ, level=-50)
      chrom <- x <- index <- y <- w <- NULL
      fields <- attachLocally(dataJJ, fields=c("chrom", "x", "index", "y", "w"))
      dataJJ <- NULL # Not needed anymore
      nbrOfLoci <- length(y)

      # Empty segment?
      # [AD HOC. Should be done by segmentCBS(). /HB 2011-10-21]
      if(nbrOfLoci == 0) {
        fit <- splitter
        fit$output$chromosome <- chromosomeJJ
        fit$output$start <- xStart
        fit$output$end <- xEnd
        fit$output$nbrOfLoci <- nbrOfLoci
        fitList[[segTag]] <- fit
        fit <- NULL
        verbose && exit(verbose)
        next
      }

      seedJJ <- seeds[[jj]]

      fitList[[segTag]] %<-% {
        fit <- segmentByCBS(y=y,
                  chromosome=chrom, x=x,
                  w=w,
                  index=index,
                  undo=undo,
                  avg=avg,
                  joinSegments=joinSegments,
                  knownSegments=seg,
                  ...,
                  seed=seedJJ,
                  verbose=less(verbose,1))

        # Sanity checks
        if (R_SANITY_CHECK) {
          .stop_if_not(nrow(fit$data) == nbrOfLoci)
          .stop_if_not(all.equal(fit$data$y, y))

          # Assert weights were used
          .stop_if_not(!hasWeights || !is.null(fit$data$w))
        } # if (R_SANITY_CHECK)

        segs <- as.data.frame(fit)
        if (nrow(segs) < 6) {
          verbose && print(verbose, segs, level=-10)
        } else {
          verbose && print(verbose, head(segs), level=-10)
          verbose && print(verbose, tail(segs), level=-10)
        }

        # Sanity check
        if (R_SANITY_CHECK) {
          .stop_if_not(TRUE && nbrOfSegments(fit, splitters=TRUE) > 0)
        } # if (R_SANITY_CHECK)

        fit
      } %seed% TRUE %label% sprintf("segmentByCBS-%s", segTag)  ## fitList[[segTag]] <- ...

      rm(list=fields) # Not needed anymore

      verbose && exit(verbose)
    } # for (jj ...)


    verbose && enter(verbose, "Merging (independently) segmented known segments", level=-10)
    verbose && cat(verbose, "Number of segments: ", length(fitList), level=-10)
    fitList <- as.list(fitList)
    verbose && str(verbose, fitList, level=-50)
    ## former Reduce() w/ append(..., addSplit = FALSE)
    fit <- do.call(c, args = c(fitList, addSplit = FALSE))
    # Not needed anymore
    fitList <- NULL

    # Update parameters that otherwise may be incorrect
    fit$params$seed <- seed

    verbose && str(verbose, fit, level=-10)
    verbose && exit(verbose)

    segs <- getSegments(fit)
    if (nrow(segs) > 6) {
      verbose && print(verbose, head(segs), level=-10)
      verbose && print(verbose, tail(segs), level=-10)
    } else {
      verbose && print(verbose, segs, level=-10)
    }

    # Sanity checks
    if (R_SANITY_CHECK) {
      segs <- getSegments(fit)
      .stop_if_not(all(segs$start[-1] >= segs$end[-nrow(segs)], na.rm=TRUE))
      .stop_if_not(all(diff(segs$start) >= 0, na.rm=TRUE)) ## FIXME: > 0
      .stop_if_not(all(diff(segs$end) >= 0, na.rm=TRUE)) ## FIXME: > 0

  #    if (nrow(fit$data) != length(y)) {
  #      print(c(nrow(fit$data), nrow(data)))
  #    }
  #    .stop_if_not(nrow(fit$data) == nrow(data))
  #    .stop_if_not(all(fit$data$chromosome == data$chromosome))
  #    .stop_if_not(all(fit$data$x == data$x))
  #    .stop_if_not(all(fit$data$index == data$index))
  #    .stop_if_not(all.equal(fit$data$y, data$y))
    } # if (R_SANITY_CHECK)

    verbose && exit(verbose)

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Return results
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    return(fit)
  } # if (nbrOfSegments > 1)

  nbrOfSegments <- nrow(knownSegments)

  # Sanity check
  if (R_SANITY_CHECK) {
    .stop_if_not(nbrOfSegments <= 1)
  } # if (R_SANITY_CHECK)

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Specific segment?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (nbrOfSegments > 0) {
    knownSegments <- subset(knownSegments, chromosome == chromosome)
    nbrOfSegments <- nrow(knownSegments)
    # Sanity check
    if (R_SANITY_CHECK) {
      .stop_if_not(nbrOfSegments <= 1)
    } # if (R_SANITY_CHECK)
  }

  if (nbrOfSegments == 1) {
    seg <- knownSegments[1,]
    chromosomeJJ <- seg$chromosome
    xStart <- seg$start
    xEnd <- seg$end
    segTag <- sprintf("chr%s:(%s,%s)", chromosomeJJ, xStart, xEnd)
    verbose && printf(verbose, "Extracting segment '%s'", segTag, level=-50)

    # Extract subset of data and parameters for this segment
    data <- subset(data, chrom == chromosomeJJ & xStart <= x & x <= xEnd)
    verbose && str(verbose, data, level=-50)
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Retrieving segmentation function
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Retrieving the fit function", level=-50)
  # We need to attach the 'DNAcopy' package
  pkgName <- "DNAcopy"
  use(pkgName)
  pkg <- packageDescription(pkgName)
  pkgVer <- pkg$Version
  pkgDetails <- sprintf("%s v%s", pkgName, pkgVer)

  methodName <- "segment"
  verbose && cat(verbose, "Method: ", methodName, level=-50)
  verbose && cat(verbose, "Package: ", pkgDetails, level=-50)

  # Get the fit function for the segmentation method
#  fitFcn <- getExportedValue(pkgName, methodName)
  fitFcn <- getFromNamespace(methodName, pkgName)
  verbose && str(verbose, "Function: ", fitFcn, level=-50)
  formals <- formals(fitFcn)
  verbose && cat(verbose, "Formals:", level=-50)
  verbose && str(verbose, formals, level=-50)
  verbose && exit(verbose)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Setting up arguments to pass to segmentation function
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Setting up method arguments", level=-50)

  verbose && enter(verbose, "Setting up ", pkgName, " data structure", level=-50)

  sampleName <- "y" # This is going to be the name of the data field

  # Supress all warnings, in order to avoid warnings by DNAcopy::CNA()
  # on "array has repeated maploc positions".  Ideally we should filter
  # just those out. /HB 2013-10-22
  suppressWarnings({
    cnData <- CNA(
      genomdat  = data$y,
      chrom     = data$chrom,
      data.type = "logratio",
      maploc    = data$x,
      sampleid  = sampleName,
      presorted = TRUE
    )
  })
  verbose && str(verbose, cnData, level=-50)
  names(cnData)[3] <- sampleName
  verbose && str(verbose, cnData, level=-50)
  verbose && exit(verbose)

  # Sanity check
  if (R_SANITY_CHECK) {
    # (because all loci with unknown locations have already been dropped)
    .stop_if_not(nrow(cnData) == nrow(data))
  } # if (R_SANITY_CHECK)


  userArgs <- list(...)
  if (length(userArgs) > 0) {
    verbose && cat(verbose, "User arguments:", level=-50)
    verbose && str(verbose, userArgs, level=-50)
  }

  # Check if 'sbdry' can/should be precalculated.  This uses memoization
  # so that next time you segment with same 'nperm', 'alpha' and 'eta'
  # parameters, there will be much less startup overhead.
  if (length(userArgs) > 0 && !is.element("sbdry", names(userArgs))) {
    keys <- c("nperm", "alpha", "eta")
    keep <- is.element(keys, names(userArgs))
    if (any(keep)) {
      verbose && enter(verbose, "Precalculating argument 'sbdry' (with memoization)", level=-50)
      # Precalculate boundaries
      argsT <- formals[keys]
      keys <- keys[keep]
      argsT[keys] <- userArgs[keys]
      argsT$verbose <- less(verbose, 5)
      sbdry <- do.call(getbdry2, args=argsT)
      userArgs$sbdry <- sbdry
      verbose && exit(verbose)
    }
  }

  params <- list()

  if (hasWeights) {
    params$weights <- data$w
  }

  if (undo > 0) {
    params$undo.splits <- "sdundo"
    params$undo.SD <- undo
  }

  verbose && cat(verbose, "Segmentation parameters:", level=-50)
  verbose && str(verbose, params, level=-50)

  # Assign/overwrite by user arguments
  if (length(userArgs) > 0) {
    for (ff in names(userArgs)) {
      params[[ff]] <- userArgs[[ff]]
    }
  }

  verbose && cat(verbose, "Segmentation and user parameters:", level=-50)
  verbose && str(verbose, params, level=-50)

  # Cleaning out unknown parameters
  keep <- (names(params) %in% names(formals))
  params <- params[keep]

  args <- c(list(cnData), params, verbose=as.logical(verbose))
  verbose && cat(verbose, "Final arguments:", level=-50)
  verbose && str(verbose, args, level=-50)

  verbose && exit(verbose)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Calling segmentation function
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, sprintf("Calling %s() of %s", methodName, pkgName), level=-50)

  # There are a few cases where we can/need to do a dummy segmentation
  # based on a single data points:
  # (a) WORKAROUND for the case when there are no data points.
  # (b) SPEEDUP: When undo=+Inf we don't really have to segment.
  nbrOfNonMissingLoci <- sum(!is.na(cnData$y))
  if (nbrOfNonMissingLoci == 0) {
    args[[1]] <- CNA(genomdat=0, chrom=0, maploc=0)
    if (hasWeights) args$weights <- 1.0
  } else if (undo == +Inf) {
    args[[1]] <- CNA(genomdat=0, chrom=0, maploc=0)
    if (hasWeights) args$weights <- 1.0
    verbose && cat(verbose, "Skipping identification of new change points (undo=+Inf)", level=-50)
  }


  ## Set random seed?
  if (!is.null(seed)) {
    randomSeed("set", seed=seed, kind="L'Ecuyer-CMRG")
    on.exit(randomSeed("reset"), add=TRUE)
    verbose && printf(verbose, "Random seed temporarily set (seed=c(%s), kind=\"L'Ecuyer-CMRG\")\n", paste(seed, collapse=", "))
  }

  # In case the method writes to stdout, we capture it
  # Note: DNAcopy::segment() *does* this.
  stdout <- capture.output({
    # Does not work, because some internal function of the fit function
    # may only be accessible from within the namespace
    # How to do this for DNAcopy::segment()? /HB
##    fit <- do.call(fitFcn, args)
    # This works, but requires that one loads the package and that the
    # function is not masked in the search() path.
    t <- system.time({
      fit <- do.call(methodName, args)
    }, gcFirst = FALSE)
    # Drop the 'call' (because it will be huge due to the do.call() call)
    fit$call <- NULL
  })
  attr(fit, "processingTime") <- t
  attr(fit, "pkgDetails") <- pkgDetails
  attr(fit, "randomSeed") <- seed

  # WORKAROUND for the case when there are no data points.
  if (nbrOfNonMissingLoci == 0) {
    # Drop dummy data point...
    fit$data <- cnData ## fit$data[-1,,drop=FALSE]
    # ...dummy region found
    output <- fit$output
    segRows <- fit$segRows

    # Sanity check
    if (R_SANITY_CHECK) {
      .stop_if_not(nrow(output) == 1)
    } # if (R_SANITY_CHECK)

    # Was a region specified?
    if (nbrOfSegments == 1) {
      seg <- knownSegments[1,]
      output$ID <- sampleName
      output$chrom <- seg$chromosome
      if (is.finite(seg$start)) {
        output$loc.start <- seg$start
      }
      if (is.finite(seg$end)) {
        output$loc.end <- seg$end
      }
      output$num.mark <- 0L
      output$seg.mean <- NA_real_
      segRows[1,] <- NA_integer_
    } else {
      output <- output[-1,,drop=FALSE]
      segRows <- segRows[-1,,drop=FALSE]
    }
    fit$output <- output
    fit$segRows <- segRows
  } else if (undo == +Inf) {
    # Drop dummy data point...
    fit$data <- cnData ## fit$data[-1,,drop=FALSE]
    # ...dummy region found
    output <- fit$output
    segRows <- fit$segRows

    # Sanity check
    if (R_SANITY_CHECK) {
      .stop_if_not(nrow(output) == 1)
    } # if (R_SANITY_CHECK)

    # Was a region specified?
    if (nbrOfSegments == 1) {
      seg <- knownSegments[1,]
      output$ID <- sampleName
      output$chrom <- seg$chromosome
      if (is.finite(seg$start)) {
        output$loc.start <- seg$start
      } else {
        output$loc.start <- min(cnData$maploc, na.rm=TRUE)
      }
      if (is.finite(seg$end)) {
        output$loc.end <- seg$end
      } else {
        output$loc.end <- max(cnData$maploc, na.rm=TRUE)
      }
    }
    output$num.mark <- nrow(fit$data)
    output$seg.mean <- mean(fit$data$y, na.rm=TRUE)
    segRows$endRow <- nrow(fit$data)

    fit$output <- output
    fit$segRows <- segRows
  } # if (undo == +Inf)

  verbose && cat(verbose, "Captured output that was sent to stdout:", level=-50)
  stdout <- paste(stdout, collapse="\n")
  verbose && cat(verbose, stdout, level=-50)

  verbose && cat(verbose, "Fitting time (in seconds):", level=-50)
  verbose && print(verbose, t, level=-50)

  verbose && cat(verbose, "Fitting time per 1000 loci (in seconds):", level=-50)
  verbose && print(verbose, 1000*t/nbrOfLoci, level=-50)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Restructure
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Restructuring results", level=-50)

  # Coerce
  fit$output$num.mark <- as.integer(fit$output$num.mark)

  # Coerce 'chrom' to a plain integer
  fit$data$chrom <- unclass(fit$data$chrom)

  # Store genomewide index
  fit$data$index <- data$index

  # Store weights
  fit$data$w <- data$w

  # Not needed anymore
  data <- NULL

  verbose && exit(verbose)


  # Store also interesting parameters to DNAcopy::segment()
  keys <- setdiff(names(formals), c("x", "weights", "sbdry", "verbose"))
  keys <- c(keys, "undo", "seed")
  keep <- is.element(names(params), keys)
  keep <- names(params)[keep]
  params <- params[keep]
  params$undo <- undo
  params$joinSegments <- joinSegments
  params$knownSegments <- knownSegments
  params$seed <- seed
  fit$params <- params

#  class(fit) <- c("CBS", class(fit))
  class(fit) <- c("CBS", "AbstractCBS")

  # Sanity checks
  if (R_SANITY_CHECK) {
    segRows <- fit$segRows
    .stop_if_not(all(segRows[,1] <= segRows[,2], na.rm=TRUE))
    .stop_if_not(all(segRows[-nrow(segRows),2] < segRows[-1,1], na.rm=TRUE))
  } # if (R_SANITY_CHECK)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Renaming column names
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  data <- getLocusData(fit)
  names <- colnames(data)
  names <- gsub("chrom", "chromosome", names, fixed=TRUE)
  names <- gsub("maploc", "x", names, fixed=TRUE)
  colnames(data) <- names

  # Drop 'CNA' class and DNAcopy attributes
  class(data) <- c("data.frame")
  attr(data, "data.type") <- NULL

  fit$data <- data

  segs <- fit$output

  names <- colnames(segs)
  names <- gsub("ID", "sampleName", names, fixed=TRUE)
  names <- gsub("seg.mean", "mean", names, fixed=TRUE)
  names <- gsub("chrom", "chromosome", names, fixed=TRUE)
  names <- gsub("num.mark", "nbrOfLoci", names, fixed=TRUE)
  names <- gsub("loc.", "", names, fixed=TRUE) # loc.start, loc.end
  colnames(segs) <- names
  fit$output <- segs


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Join segments?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (joinSegments) {
    if (nbrOfSegments == 1) {
      starts <- knownSegments$start
      ends <- knownSegments$end
      if (is.infinite(starts)) starts <- segs$start
      if (is.infinite(ends)) ends <- segs$end
      range <- range(c(starts, ends), na.rm=TRUE)
    } else {
      range <- NULL
    }

    fit <- joinSegments(fit, range=range, verbose=less(verbose, 10))

    # Sanity checks
    if (R_SANITY_CHECK) {
      segRows <- fit$segRows
      .stop_if_not(all(segRows[,1] <= segRows[,2], na.rm=TRUE))
      .stop_if_not(all(segRows[-nrow(segRows),2] < segRows[-1,1], na.rm=TRUE))
    } # if (R_SANITY_CHECK)
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Update?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (avg != "mean") {
    verbose && enter(verbose, "Updating mean level using different estimator")
    verbose && cat(verbose, "Estimator: ", avg)
    fit <- updateMeans(fit, avg=avg, verbose=less(verbose, 20))
    verbose && exit(verbose)
  }


  verbose && cat(verbose, "Results object:", level=-10)
  verbose && str(verbose, fit, level=-10)

  verbose && exit(verbose)


  verbose && exit(verbose)

  fit
}) # segmentByCBS()


setMethodS3("segmentByCBS", "data.frame", function(y, ...) {
  # To please R CMD check
  data <- y

  y <- data$y
  if (is.null(y)) {
    y <- data$cn
    if (is.null(y)) {
      y <- data$CT
    }
  }

  segmentByCBS(y=y, chromosome=data$chromosome, x=data$x, index=data$index, w=data$w, ...)
})


setMethodS3("segmentByCBS", "CBS", function(...) {
  resegment(...)
}) # segmentByCBS()
HenrikBengtsson/PSCBS documentation built on Feb. 20, 2024, 9:01 p.m.