R/FragmentLengthNormalization.R

###########################################################################/**
# @RdocClass FragmentLengthNormalization
#
# @title "The FragmentLengthNormalization class"
#
# \description{
#  @classhierarchy
#
#  This class represents a normalization method that corrects for PCR
#  fragment length effects on copy-number chip-effect estimates.
# }
#
# @synopsis
#
# \arguments{
#   \item{dataSet}{A @see "SnpChipEffectSet".}
#   \item{...}{Additional arguments passed to the constructor of
#     @see "ChipEffectTransform".}
#   \item{target}{(Optional) A @character string or a list of @functions
#     specifying what to normalize toward.
#     For each enzyme there is one target function to which all arrays
#     should be normalized to.}
#   \item{subsetToFit}{The units from which the normalization curve should
#     be estimated.  If @NULL, all are considered.}
#   \item{lengthRange}{If given, a @numeric @vector of length 2 specifying
#     the range of fragment lengths considered.  All fragments with lengths
#     outside this range are treated as if they were missing.}
#   \item{onMissing}{Specifies how to normalize units for which the
#     fragment lengths are unknown.}
#   \item{shift}{An optional amount the data points should be shifted
#      (translated).}
#   \item{targetFunctions}{Deprecated.}
# }
#
# \section{Fields and Methods}{
#  @allmethods "public"
# }
#
# \section{Requirements}{
#   This class requires a SNP information annotation file for the
#   chip type to be normalized.
# }
#
# \details{
#   For SNPs, the normalization function is estimated based on the total
#   chip effects, i.e. the sum of the allele signals.  The normalizing
#   is done by rescale the chip effects on the intensity scale such that
#   the mean of the total chip effects are the same across samples for
#   any given fragment length.  For allele-specific estimates, both alleles
#   are always rescaled by the same amount.  Thus, when normalizing
#   allele-specific chip effects, the total chip effects is change, but not
#   the relative allele signal, e.g. the allele B frequency.
# }
#
# @author "HB"
#*/###########################################################################
setConstructorS3("FragmentLengthNormalization", function(dataSet=NULL, ..., target=targetFunctions, subsetToFit="-XY", lengthRange=NULL, onMissing=c("median", "ignore"), shift=0, targetFunctions=NULL) {
  extraTags <- NULL

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'dataSet':
  if (!is.null(dataSet)) {
    dataSet <- Arguments$getInstanceOf(dataSet, "SnpChipEffectSet")

#    if (dataSet$combineAlleles != TRUE) {
#      throw("Currently only total copy-number chip effects can be normalized, i.e. 'combineAlleles' must be TRUE")
#    }

#    dataSet <- Arguments$getInstanceOf(dataSet, "CnChipEffectSet")

#    if (dataSet$mergeStrands != TRUE) {
#      throw("Currently only non-strands specific copy-number chip effects can be normalized, i.e. 'mergeStrands' must be TRUE")
#    }
  }

  # Argument 'target':
  if (!is.null(target)) {
    if (is.character(target)) {
      if (target == "zero") {
      } else {
        throw("Unknown value of argument 'target': ", target)
      }
    } else if (is.list(target)) {
      # Validate each element
      for (kk in seq_along(target)) {
        if (!is.function(target[[kk]])) {
          throw("One element in 'target' is not a function: ",
                                          class(target[[kk]])[1])
        }
      }
    } else {
      throw("Unknown value of argument 'target': ", class(target)[1])
    }
  }

  # Argument 'subsetToFit':
  if (is.null(subsetToFit)) {
  } else if (is.character(subsetToFit)) {
    if (subsetToFit %in% c("-X", "-Y", "-XY")) {
    } else {
      throw("Unknown value of argument 'subsetToFit': ", subsetToFit)
    }
    extraTags <- c(extraTags, subsetToFit=subsetToFit)
  } else {
    cdf <- getCdf(dataSet)
    subsetToFit <- Arguments$getIndices(subsetToFit, max=nbrOfUnits(cdf))
    subsetToFit <- unique(subsetToFit)
    subsetToFit <- sort(subsetToFit)
  }

  # Argument 'shift':
  shift <- Arguments$getDouble(shift, disallow=c("NA", "NaN", "Inf"))

  # Argument 'lengthRange':
  if (!is.null(lengthRange)) {
    lengthRange <- Arguments$getDoubles(lengthRange)
    stopifnot(lengthRange[1] <= lengthRange[2])
  }

  # Argument 'onMissing':
  onMissing <- match.arg(onMissing)


  extend(ChipEffectTransform(dataSet, ...), "FragmentLengthNormalization",
    "cached:.targetFunctions" = NULL,
    "cached:.target" = target,
    .subsetToFit = subsetToFit,
    .lengthRange = lengthRange,
    .onMissing = onMissing,
    .extraTags = extraTags,
    shift = shift
  )
})


setMethodS3("getAsteriskTags", "FragmentLengthNormalization", function(this, collapse=NULL, ...) {
  tags <- NextMethod("getAsteriskTags", collapse=NULL)

  # Extra tags?
  tags <- c(tags, this$.extraTags)

  # Add class-specific tags
  shift <- as.integer(round(this$shift))
  if (shift != 0) {
    tags <- c(tags, sprintf("%+d", shift))
  }

  # Collapse?
  tags <- paste(tags, collapse=collapse)

  tags
}, protected=TRUE)



setMethodS3("getParameters", "FragmentLengthNormalization", function(this, expand=TRUE, ...) {
  # Get parameters from super class
  params <- NextMethod("getParameters", expand=expand)

  # Get parameters of this class
  params <- c(params, list(
    subsetToFit = this$.subsetToFit,
    lengthRange = this$.lengthRange,
    onMissing = this$.onMissing,
    .target = this$.target,
    shift = this$shift
  ))


  # Expand?
  if (expand) {
    subsetToFit <- getSubsetToFit(this)
  }

  params
}, protected=TRUE)


setMethodS3("getCdf", "FragmentLengthNormalization", function(this, ...) {
  inputDataSet <- getInputDataSet(this)
  getCdf(inputDataSet)
})


setMethodS3("getOutputDataSet00", "FragmentLengthNormalization", function(this, ..., verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }


  verbose && enter(verbose, "Getting input data set")
  ces <- getInputDataSet(this)
  verbose && cat(verbose, "Class: ", class(ces)[1])
  verbose && exit(verbose)

  verbose && enter(verbose, "Getting output data set for ", class(this)[1])

  args <- list("getOutputDataSet")

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Inherit certain arguments from the input data set
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (inherits(ces, "CnChipEffectSet"))
    args$combineAlleles <- ces$combineAlleles
  if (inherits(ces, "SnpChipEffectSet"))
    args$mergeStrands <- ces$mergeStrands

  verbose && cat(verbose, "Calling NextMethod() with arguments:")
  verbose && str(verbose, args)

  args$verbose <- less(verbose, 10)
  res <- do.call(NextMethod, args)

  # Carry over parameters too.  AD HOC for now. /HB 2007-01-07
  if (inherits(res, "SnpChipEffectSet")) {
    verbose && enter(verbose, "Carrying down parameters for ", class(res)[1])

    res$mergeStrands <- ces$mergeStrands
    if (inherits(res, "CnChipEffectSet")) {
      res$combineAlleles <- ces$combineAlleles
    }
    verbose && exit(verbose)
  }

  # Let the set update itself
  update2(res, verbose=less(verbose, 1))

  verbose && exit(verbose)

  res
}, protected=TRUE)


setMethodS3("getFilteredFragmentLengths", "FragmentLengthNormalization", function(this, ..., verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }


  verbose && enter(verbose, "Reading and filtering fragment lengths")

  verbose && enter(verbose, "Reading fragment lengths")
  # Get SNP information
  cdf <- getCdf(this)
  si <- getSnpInformation(cdf)
  verbose && print(verbose, si)

  fl <- getFragmentLengths(si, ...)
  verbose && cat(verbose, "Summary of non-filtered fragment lengths:")
  verbose && str(verbose, fl)
  verbose && summary(verbose, fl)
  verbose && exit(verbose)

  verbose && enter(verbose, "Filtering fragment lengths")
  # Get the range fragment lengths to be considered
  params <- getParameters(this, expand=FALSE)
  range <- params$lengthRange

  if (!is.null(range)) {
    naValue <- NA_real_
    for (ee in seq_len(ncol(fl))) {
      flEE <- fl[,ee]
      ok <- (!is.na(flEE))

      tooSmall <- (ok & (flEE < range[1]))
      n <- sum(tooSmall)
      if (n > 0) {
        verbose && printf(verbose, "Detected %d fragments on enzyme %d that are too short (< %.0g)\n", n, ee, range[1])
      }

      tooLarge <- (ok & (flEE > range[2]))
      n <- sum(tooLarge)
      if (n > 0) {
        verbose && printf(verbose, "Detected %d fragments on enzyme %d that are too long (> %.0g)\n", n, ee, range[2])
      }

      tooExtreme <- (tooSmall | tooLarge)
      n <- sum(tooExtreme)

      verbose && printf(verbose, "Detected %d fragments on enzyme %d with lengths outside of filtered range [%.0g,%.0g]\n", n, ee, range[1], range[2])

      fl[tooExtreme,ee] <- naValue
    } # for (ee ...)

    verbose && cat(verbose, "Summary of filtered fragment lengths:")
    verbose && summary(verbose, fl)

    verbose && exit(verbose)
  } # if (!is.null(range))

  verbose && exit(verbose)

  fl
}, protected=TRUE)


setMethodS3("getSubsetToFit", "FragmentLengthNormalization", function(this, force=FALSE, ..., verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }


  # Cached?
  units <- this$.units
  if (!is.null(units) && !force)
    return(units)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Identifying all potential units, i.e. SNPs and CN probes
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Identifying units that are SNP and CN probes")

  # Get SNP information
  cdf <- getCdf(this)
  si <- getSnpInformation(cdf)
  verbose && print(verbose, si)

  # Identify all SNP and CN units (==potential units to be fitted)
  verbose && enter(verbose, "Identifying SNPs and CN probes")
## OLD:
## units <- indexOf(cdf, "^(SNP|CN)")
  types <- getUnitTypes(cdf, verbose=less(verbose,1))
  verbose && print(verbose, table(types))
  units <- which(types == 2 | types == 5)
  # Not needed anymore
  types <- NULL
  verbose && cat(verbose, "units:")
  verbose && str(verbose, units)
  verbose && exit(verbose)

  onMissing <- this$.onMissing
  if (onMissing == "ignore") {
    # Keep only those for which we have PCR fragmenth-length information
    # for at least one enzyme
    verbose && enter(verbose, "Identifying finite fragment lengths")
    fl <- getFilteredFragmentLengths(this, units=units, verbose=less(verbose,3))
    keep <- rep(FALSE, nrow(fl))
    for (ee in seq_len(ncol(fl))) {
      keep <- keep | is.finite(fl[,ee])
    }
    units <- units[keep]
    verbose && printf(verbose, "Number of SNP/CN units without fragment-length details: %d out of %d (%.1f%%)\n", sum(!keep), length(keep), 100*sum(!keep)/length(keep))
    verbose && exit(verbose)
    # Not needed anymore
    fl <- NULL
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Subset with a prespecified set of units?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  subsetToFit <- this$.subsetToFit
  if (is.character(subsetToFit)) {
    if (subsetToFit %in% c("-X", "-Y", "-XY")) {
      verbose && enter(verbose, "Identify subset of units from genome information")
      verbose && cat(verbose, "subsetToFit: ", subsetToFit)

      # Look up in cache
      subset <- this$.subsetToFitExpanded
      if (is.null(subset)) {
        # Get the genome information (throws an exception if missing)
        gi <- getGenomeInformation(cdf)
        verbose && print(verbose, gi)

        # Identify units to be excluded
        if (subsetToFit == "-X") {
          subset <- getUnitsOnChromosomes(gi, 23, .checkArgs=FALSE)
        } else if (subsetToFit == "-Y") {
          subset <- getUnitsOnChromosomes(gi, 24, .checkArgs=FALSE)
        } else if (subsetToFit == "-XY") {
          subset <- getUnitsOnChromosomes(gi, 23:24, .checkArgs=FALSE)
        }

        verbose && cat(verbose, "Units to exclude: ")
        verbose && str(verbose, subset)

        # The units to keep
        subset <- setdiff(1:nbrOfUnits(cdf), subset)

        verbose && cat(verbose, "Units to include: ")
        verbose && str(verbose, subset)

        # Store
        this$.subsetToFitExpanded <- subset
      }

      subsetToFit <- subset
      # Not needed anymore
      subset <- NULL

      verbose && exit(verbose)
    }
  }

  if (!is.null(subsetToFit)) {
    # A fraction subset?
    if (length(subsetToFit) == 1 && 0 < subsetToFit && subsetToFit < 1) {
      keep <- seq(from=1, to=length(units), length=subsetToFit*length(units))
    } else {
      keep <- which(units %in% subsetToFit)
    }

    verbose && enter(verbose, "Reading fragment lengths")
    fl <- getFilteredFragmentLengths(this, units=units, verbose=less(verbose,3))
    verbose && exit(verbose)

    # Make sure to keep data points at the tails too
    extremeUnits <- c()
    for (ee in seq_len(ncol(fl))) {
      extremeUnits <- c(extremeUnits, which.min(fl[,ee]), which.max(fl[,ee]))
    }
    # Not needed anymore
    fl <- NULL

    keep <- c(keep, extremeUnits)
    keep <- unique(keep)

    # Now filter
    units <- units[keep]
    # Not needed anymore
    keep <- NULL
  }

  # Sort units
  units <- sort(units)

  # Assert correctness
  units <- Arguments$getIndices(units, range=c(1, nbrOfUnits(cdf)))

  # Cache
  this$.units <- units

  verbose && exit(verbose)

  units
}, private=TRUE)




setMethodS3("getTargetFunctions", "FragmentLengthNormalization", function(this, ..., force=FALSE, verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }


  target <- this$.target

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Handling argument 'force'
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # AD HOC: If forced, make sure to use the original target argument
  if (force) {
    if (!is.null(target)) {
      targetType <- attr(target, "targetType")
      if (!is.null(targetType))
        target <- targetType
      # Not needed anymore
      targetType <- NULL
    }
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Get SNP information
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  cdf <- getCdf(this)
  si <- getSnpInformation(cdf)
  verbose && print(verbose, si)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Predefined target functions?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (is.character(target)) {
    verbose && enter(verbose, "Setting up predefined target functions")
    targetType <- target
    verbose && cat(verbose, "Target type: ", targetType)

    # Infer the number of enzymes
    fl <- getFilteredFragmentLengths(this, units=1:5)
    nbrOfEnzymes <- ncol(fl)
    # Not needed anymore
    fl <- NULL

    if (identical(targetType, "zero")) {
      target <- rep(list(function(...) log2(2200)), nbrOfEnzymes)
    } else {
      throw("Unknown target function: ", targetType)
    }

    # Store the original target type in an attribute
    attr(target, "targetType") <- targetType

    this$.target <- target
    force <- FALSE
    verbose && exit(verbose)
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Target functions based on the average array?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (force || is.null(target)) {
    # Get target set
    ces <- getInputDataSet(this)
    verbose && enter(verbose, "Getting average signal across arrays")
    ceR <- getAverageFile(ces, force=force, verbose=less(verbose))
    # Not needed anymore
    ces <- NULL; # Not needed anymore
    verbose && exit(verbose)

    # Garbage collect
    gc <- gc()
    verbose && print(verbose, gc)

    # Get units to fit
    units <- getSubsetToFit(this)
    verbose && cat(verbose, "Units:")
    verbose && str(verbose, units)

    # Get target signals for SNPs
    yR <- extractTheta(ceR, units=units, verbose=less(verbose, 5))
    verbose && cat(verbose, "(Allele-specific) thetas:")
    verbose && str(verbose, yR)

    # If more than one theta per unit, sum them up to get the total signal
    if (ncol(yR) > 1) {
      # Row sums with na.rm=TRUE => NAs are treated as zeros.
      yR[is.na(yR)] <- 0
      for (cc in 2:ncol(yR)) {
        yR[,1] <- yR[,1] + yR[,cc]
      }
    }
    yR <- yR[,1,drop=TRUE]
    verbose && cat(verbose, "Total thetas:")
    verbose && str(verbose, yR)

##     data <- getDataFlat(ceR, units=units, fields="theta", verbose=less(verbose))
    # Not needed anymore
    ceR <- NULL; # Not needed anymore
##     units <- data[,"unit"]
##     yR <- data[,"theta"]
##     # Not needed anymore
##     data <- NULL; # Not needed anymore

    verbose && cat(verbose, "Summary of total signals (on the intensity scale):")
    verbose && summary(verbose, yR)

    # Shift?
    shift <- this$shift
    if (shift != 0) {
      verbose && printf(verbose, "Applying shift: %+f\n", shift)
      yR <- yR + shift
      verbose && cat(verbose, "Summary of shifted signals (on the intensity scale):")
      verbose && summary(verbose, yR)
    }

    yR <- log2(yR)
    verbose && cat(verbose, "Signals:")
    verbose && str(verbose, yR)
    verbose && cat(verbose, "Summary of signals (on the log2 scale):")
    verbose && summary(verbose, yR)

    # Get PCR fragment lengths for these
    fl <- getFilteredFragmentLengths(this, units=units, verbose=less(verbose, 3))
    # Not needed anymore
    units <- NULL; # Not needed anymore

    verbose && cat(verbose, "Fragment lengths:")
    verbose && str(verbose, fl)
    verbose && cat(verbose, "Summary of fragment lengths:")
    verbose && summary(verbose, fl)

    # Fit lowess function
    verbose && enter(verbose, "Fitting target prediction function to each enzyme exclusively")
    okYR <- is.finite(yR)
    verbose && cat(verbose, "Distribution of log2 signals that are finite:")
    verbose && summary(verbose, okYR)

    hasFL <- is.finite(fl)
    verbose && cat(verbose, "Distribution of units with known fragment lengths:")
    verbose && summary(verbose, hasFL)

    nbrOfEnzymes <- ncol(fl)
    allEnzymes <- seq_len(nbrOfEnzymes)

    fits <- list()
    for (ee in allEnzymes) {
      verbose && enter(verbose, "Enzyme #", ee, " of ", nbrOfEnzymes)

      # Fit only to units with known length and non-missing data points.
      ok <- (hasFL[,ee] & okYR)

      verbose && cat(verbose, "Distribution of units with known fragment lengths and finite signals:")
      verbose && summary(verbose, ok)

      # Exclude multi-enzyme units
      for (ff in setdiff(allEnzymes, ee)) {
        ok <- ok & !hasFL[,ff]
      }

      verbose && cat(verbose, "Distribution of units with known fragment lengths and finite signals that are exclusively on this enzyme:")
      verbose && summary(verbose, ok)


      # Sanity check
      if (sum(ok) == 0) {
        throw(sprintf("Cannot fit target function to enzyme, because there are no (finite) data points that are unique to this enzyme (%d). Consider using argument 'lengthRange' when setting up the FragmentLengthNormalization.", ee))
      }

      # Fit fragment-length effect to single-enzyme units
      fit <- lowess(fl[ok,ee], yR[ok])
      class(fit) <- "lowess"

      # Not needed anymore
      ok <- NULL

      fits[[ee]] <- fit

      # Not needed anymore
      fit <- NULL

      verbose && exit(verbose)
    } # for (ee in allEnzymes)

    # Remove as many promises as possible
    # Not needed anymore
    target <- nbrOfEnzymes <- allEnzymes <- fl <- yR <- okYR <- hasFL <- NULL

    # Create a target prediction function for each enzyme
    fcns <- vector("list", length(fits))
    for (ee in seq_along(fits)) {
      fcns[[ee]] <- function(x, ...) {
        predict(fits[[ee]], x, ...);  # Dispatched predict.lowess().
      }
    }
    verbose && str(verbose, fcns)
    verbose && exit(verbose)

    # Garbage collect
    gc <- gc()
    verbose && print(verbose, gc)

    target <- fcns
    # Not needed anymore
    fcns <- NULL
    this$.target <- target
    force <- FALSE

    verbose && exit(verbose)
  } # if (force || ...)


  target
}, private=TRUE)



###########################################################################/**
# @RdocMethod process
#
# @title "Normalizes the data set"
#
# \description{
#  @get "title".
# }
#
# @synopsis
#
# \arguments{
#   \item{...}{Additional arguments passed to
#     @see "aroma.light::normalizeFragmentLength" (only for advanced users).}
#   \item{force}{If @TRUE, data already normalized is re-normalized,
#       otherwise not.}
#   \item{verbose}{See @see "R.utils::Verbose".}
# }
#
# \value{
#  Returns a @double @vector.
# }
#
# \seealso{
#   @seeclass
# }
#*/###########################################################################
setMethodS3("process", "FragmentLengthNormalization", function(this, ..., force=FALSE, verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }

  verbose && enter(verbose, "Normalizing set for PCR fragment-length effects")

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Already done?
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (!force && isDone(this)) {
    verbose && cat(verbose, "Already normalized")
    verbose && enter(verbose, "Getting output data set")
    outputSet <- getOutputDataSet(this, verbose=less(verbose, 10))
    verbose && exit(verbose)
    verbose && exit(verbose)
    return(invisible(outputSet))
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Setup
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Get input data set
  ces <- getInputDataSet(this)

  verbose && enter(verbose, "Identifying SNP and CN units")
  # Get SNP & CN units
  cdf <- getCdf(ces)
## OLD:
##  subsetToUpdate <- indexOf(cdf, "^(SNP|CN)")
  types <- getUnitTypes(cdf, verbose=less(verbose,1))
  verbose && print(verbose, table(types))
  subsetToUpdate <- which(types == 2 | types == 5)
  # Not needed anymore
  types <- NULL
  verbose && cat(verbose, "subsetToUpdate:")
  verbose && str(verbose, subsetToUpdate)
  verbose && exit(verbose)


  verbose && enter(verbose, "Retrieving SNP information annotations")
  si <- getSnpInformation(cdf)
  verbose && print(verbose, si)
  verbose && exit(verbose)

  verbose && enter(verbose, "Identifying the subset used to fit normalization function(s)")
  # Get subset to fit
  subsetToFit <- getSubsetToFit(this, verbose=less(verbose))
  verbose && str(verbose, subsetToFit)
  verbose && exit(verbose)

  shift <- this$shift
  verbose && cat(verbose, "Shift: ", shift)

  onMissing <- this$.onMissing
  verbose && cat(verbose, "onMissing: ", onMissing)

  # Get (and create) the output path
  path <- getPath(this)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Normalize each array
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  fl <- NULL
  targetFcns <- NULL
#  map <- NULL
  cellMatrixMap <- NULL
  nbrOfArrays <- length(ces)

  res <- listenv()

  for (kk in seq_len(nbrOfArrays)) {
    ce <- ces[[kk]]
    verbose && enter(verbose, sprintf("Array #%d of %d ('%s')",
                                            kk, nbrOfArrays, getName(ce)))

    filename <- getFilename(ce)
    pathname <- filePath(path, filename)
    if (isFile(pathname)) {
      verbose && cat(verbose, "Already normalized. Skipping.")

      ## Assert validity of file
      ceN <- fromFile(ce, pathname)

      # Carry over parameters too.  AD HOC for now. /HB 2007-01-07
      if (inherits(ce, "SnpChipEffectFile")) {
        ceN$mergeStrands <- ce$mergeStrands
        if (inherits(ce, "CnChipEffectFile")) {
          ceN$combineAlleles <- ce$combineAlleles
        }
      }

      # CDF inheritance
      setCdf(ceN, cdf)

      res[[kk]] <- pathname

      verbose && exit(verbose)
      next
    }

    # Get unit-to-cell? (for optimized reading)
#    if (is.null(map)) {
#      # Only loaded if really needed.
#      verbose && enter(verbose, "Retrieving unit-to-cell map for all arrays")
#      map <- getUnitGroupCellMap(ce, units=subsetToUpdate, verbose=less(verbose))
#      verbose && str(verbose, map)
#      verbose && exit(verbose)
#    }

    if (is.null(fl)) {
      # For the subset to be fitted, get PCR fragment lengths (for all enzymes)
      fl <- getFilteredFragmentLengths(this, units=subsetToUpdate, verbose=less(verbose, 3))
      verbose && summary(verbose, fl)

      # Get the index in the data vector of subset to be fitted.
      # Note: match() only returns first match, which is why we do
      # it this way.
      subset <- match(subsetToUpdate, subsetToFit)
      subset <- subset[!is.na(subset)]
      subset <- match(subsetToFit[subset], subsetToUpdate)
      verbose && str(verbose, subset)
    }

    if (is.null(targetFcns)) {
      # Only loaded if really needed.
      # Retrieve/calculate the target function
      targetFcns <- getTargetFunctions(this, verbose=less(verbose))
    }

    if (is.null(cellMatrixMap)) {
      verbose && enter(verbose, "Getting cell matrix map")
      cellMatrixMap <- getUnitGroupCellMatrixMap(ce, units=subsetToUpdate, verbose=less(verbose, 10))
      verbose && str(verbose, cellMatrixMap)
      verbose && exit(verbose)
    }


    res[[kk]] %<-% {
      # Get target log2 signals for all SNPs to be updated
      verbose && enter(verbose, "Getting theta estimates")
      theta <- extractTheta(ce, units=cellMatrixMap, drop=FALSE, verbose=less(verbose, 5))
      verbose && str(verbose, theta)
      verbose && summary(verbose, theta)
      verbose && exit(verbose)

      verbose && enter(verbose, "Calculating total signals")
      # Get the total locus signals?
      if (ncol(theta) > 1) {
        # Row sums with na.rm=TRUE => NAs are treated as zeros.
        y <- theta
        y[is.na(y)] <- 0
        for (cc in 2:ncol(y)) {
          y[,1] <- y[,1] + y[,cc]
        }
        y <- y[,1,drop=TRUE]
      } else {
        y <- theta[,1,drop=TRUE]
      }
      verbose && cat(verbose, "Total thetas:")
      verbose && str(verbose, y)
      verbose && exit(verbose)

  #    data <- getDataFlat(ce, units=map, fields="theta", verbose=less(verbose))
  #    verbose && str(verbose, data)
  #    y0 <- data[,"theta",drop=TRUE]
  #    stopifnot(identical(y,y0))
  #    verbose && str(verbose, y)
  #    verbose && exit(verbose)


      # Extract the values to fit the normalization function
      verbose && enter(verbose, "Normalizing log2 signals")

      # Shift?
      if (shift != 0)
        y <- y + shift

      # Fit on the log2 scale
      y <- log2(y)

      verbose && cat(verbose, "Log2 signals:")
      verbose && str(verbose, y)
      yN <- .normalizeFragmentLength(y, fragmentLengths=fl,
                      targetFcns=targetFcns, subsetToFit=subset,
                      onMissing=onMissing, ...)
      verbose && cat(verbose, "Normalized log2 signals:")
      verbose && summary(verbose, yN)

      # Normalization scale factor for each unit (on the log2 scale)
      rho <- y-yN
      # Not needed anymore
      y <- yN <- NULL
      # On the intensity scale
      rho <- 2^rho
      verbose && cat(verbose, "Normalization scale factors:")
      verbose && summary(verbose, rho)

      # Sanity check
      stopifnot(length(rho) == nrow(theta))

      # Normalize the theta:s (on the intensity scale)
      ok <- which(is.finite(rho))
      verbose && str(verbose, ok)
      theta[ok,] <- theta[ok,]/rho[ok]
      # Not needed anymore
      ok <- rho <- NULL

      verbose && cat(verbose, "Normalized thetas:")
      verbose && str(verbose, theta)
      verbose && summary(verbose, theta)

      verbose && exit(verbose)

      # Create CEL file to store results, if missing
      verbose && enter(verbose, "Creating CEL file for results, if missing")

      # Write to a temporary file (allow rename of existing one if forced)
      isFile <- isFile(pathname)
      pathnameT <- pushTemporaryFile(pathname, isFile=isFile, verbose=verbose)

      ceN <- createFrom(ce, filename=pathnameT, path=NULL, verbose=less(verbose))
      verbose && exit(verbose)

      # Carry over parameters too.  AD HOC for now. /HB 2007-01-07
      if (inherits(ce, "SnpChipEffectFile")) {
        ceN$mergeStrands <- ce$mergeStrands
        if (inherits(ce, "CnChipEffectFile")) {
          ceN$combineAlleles <- ce$combineAlleles
        }
      }

      # CDF inheritance
      setCdf(ceN, cdf)

      verbose && enter(verbose, "Storing normalized signals")
  #    data[,"theta"] <- yN
  #    # Not needed anymore
  #    yN <- NULL
  #    updateDataFlat(ceN, data=data, verbose=less(verbose))
  #    # Not needed anymore
  #    data <- NULL
      ok <- which(is.finite(cellMatrixMap))
      cells <- cellMatrixMap[ok]
      data <- theta[ok]
      # Not needed anymore
      ok <- theta <- NULL

      verbose2 <- -as.integer(verbose) - 5
      pathnameN <- getPathname(ceN)
      .updateCel(pathnameN, indices=cells, intensities=data, verbose=verbose2)
      # Not needed anymore
      cells <- data <- ceN <- NULL
      verbose && exit(verbose)

      # Rename temporary file
      popTemporaryFile(pathnameT, verbose=verbose)

      ## Create checksum file
      dfZ <- getChecksumFile(pathname)

      # Garbage collect
      gc <- gc()
      verbose && print(verbose, gc)

      pathname
    } ## %<-%

    verbose && exit(verbose)
  } # for (kk in ...)

  fl <- NULL  ## Not needed anymore

  ## Resolve futures
  res <- as.list(res)
  res <- NULL

  # Garbage collect
#  clearCache(this)
  gc <- gc()
  verbose && print(verbose, gc)

  # Create the output set
  outputSet <- getOutputDataSet(this, verbose=less(verbose,5))

  verbose && exit(verbose)

  outputSet
})
HenrikBengtsson/aroma.affymetrix documentation built on Feb. 20, 2024, 9:07 p.m.