R/normalizeCurveFit.R

#########################################################################/**
# @RdocGeneric normalizeCurveFit
# @alias normalizeLoess
# @alias normalizeLowess
# @alias normalizeSpline
# @alias normalizeRobustSpline
# @alias normalizeCurveFit.matrix
# @alias normalizeLoess.matrix
# @alias normalizeLowess.matrix
# @alias normalizeSpline.matrix
# @alias normalizeRobustSpline.matrix
#
# \encoding{latin1}
#
# @title "Weighted curve-fit normalization between a pair of channels"
#
# \description{
#   @get "title".
#
#   This method will estimate a smooth function of the dependency
#   between the log-ratios and the log-intensity of the two channels and
#   then correct the log-ratios (only) in order to remove the dependency.
#   This is method is also known as \emph{intensity-dependent} or
#   \emph{lowess normalization}.
#
#   The curve-fit methods are by nature limited to paired-channel data.
#   There exist at least one method trying to overcome this limitation,
#   namely the cyclic-lowess [1], which applies the paired
#   curve-fit method iteratively over all pairs of channels/arrays.
#   Cyclic-lowess is not implemented here.
#
#   We recommend that affine normalization [2] is used instead of curve-fit
#   normalization.
# }
#
# \usage{
# @usage normalizeCurveFit,matrix
# @usage normalizeLoess,matrix
# @usage normalizeLowess,matrix
# @usage normalizeSpline,matrix
# @usage normalizeRobustSpline,matrix
# }
#
# \arguments{
#  \item{X}{An Nx2 @matrix where the columns represent the two channels
#    to be normalized.}
#  \item{weights}{If @NULL, non-weighted normalization is done.
#    If data-point weights are used, this should be a @vector of length
#    N of data point weights used when estimating the normalization
#    function.
#  }
#  \item{typeOfWeights}{A @character string specifying the type of
#    weights given in argument \code{weights}.
#  }
#  \item{method}{@character string specifying which method to use when
#    fitting the intensity-dependent function.
#    Supported methods:
#     \code{"loess"} (better than lowess),
#     \code{"lowess"} (classic; supports only zero-one weights),
#     \code{"spline"} (more robust than lowess at lower and upper
#                      intensities; supports only zero-one weights),
#     \code{"robustSpline"} (better than spline).
#  }
#  \item{bandwidth}{A @double value specifying the bandwidth of the
#   estimator used.
#  }
#  \item{satSignal}{Signals equal to or above this threshold will not
#    be used in the fitting.
#  }
#  \item{...}{Not used.}
# }
#
# \value{
#   A Nx2 @matrix of the normalized two channels.
#   The fitted model is returned as attribute \code{modelFit}.
# }
#
# \details{
#  A smooth function \eqn{c(A)} is fitted through data in \eqn{(A,M)},
#  where \eqn{M=log_2(y_2/y_1)} and \eqn{A=1/2*log_2(y_2*y_1)}. Data is
#  normalized by \eqn{M <- M - c(A)}.
#
#  Loess is by far the slowest method of the four, then lowess, and then
#  robust spline, which iteratively calls the spline method.
# }
#
# \section{Negative, non-positive, and saturated values}{
#  Non-positive values are set to not-a-number (@NaN).
#  Data points that are saturated in one or more channels are not used
#  to estimate the normalization function, but they are normalized.
# }
#
# \section{Missing values}{
#  The estimation of the normalization function will only be made
#  based on complete non-saturated observations, i.e. observations that
#  contains no @NA values nor saturated values as defined by \code{satSignal}.
# }
#
# \section{Weighted normalization}{
#  Each data point, that is, each row in \code{X}, which is a
#  vector of length 2, can be assigned a weight in [0,1] specifying how much
#  it should \emph{affect the fitting of the normalization function}.
#  Weights are given by argument \code{weights}, which should be a @numeric
#  @vector of length N. Regardless of weights, all data points are
#  \emph{normalized} based on the fitted normalization function.
#
#  Note that the lowess and the spline method only support zero-one
#  \{0,1\} weights.
#  For such methods, all weights that are less than a half are set to zero.
# }
#
# \section{Details on loess}{
#  For @see "stats::loess", the arguments \code{family="symmetric"},
#  \code{degree=1}, \code{span=3/4},
#  \code{control=loess.control(trace.hat="approximate"},
#  \code{iterations=5}, \code{surface="direct")} are used.
# }
#
# @author "HB"
#
# \references{
#   [1] M. \enc{Åstrand}{Astrand},
#       Contrast Normalization of Oligonucleotide Arrays,
#       Journal Computational Biology, 2003, 10, 95-102. \cr
#   [2] @include "../incl/BengtssonHossjer_2006.bib.Rdoc" \cr
# }
#
# \examples{
#  @include "../incl/normalizeCurveFit.matrix.Rex"
# }
#
# \seealso{
#   @see "normalizeAffine".
# }
#*/#########################################################################
setMethodS3("normalizeCurveFit", "matrix", function(X, weights=NULL, typeOfWeights=c("datapoint"), method=c("loess", "lowess", "spline", "robustSpline"), bandwidth=NULL, satSignal=2^16-1, ...) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # 1. Verify the arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument: 'X'
  if (ncol(X) != 2)
    stop("Curve-fit normalization requires two channels only: ", ncol(X));
  if (nrow(X) < 3)
    stop("Curve-fit normalization requires at least three observations: ", nrow(X));

  # Argument: 'satSignal'
  if (satSignal < 0)
    stop("Argument 'satSignal' is negative: ", satSignal);

  # Argument: 'method'
  method <- match.arg(method);
  zeroOneWeightsOnly <- (method %in% c("lowess", "spline"));

  # Argument: 'typeOfWeights'
  typeOfWeights <- match.arg(typeOfWeights);

  # Argument: 'weights'
  datapointWeights <- NULL;
  if (!is.null(weights)) {
    # If 'weights' is an object of a class with as.double(), cast it.
    weights <- as.double(weights);

    if (anyMissing(weights))
      stop("Argument 'weights' must not contain NA values.");

    if (any(weights < 0 | weights > 1)) {
      stop("Argument 'weights' out of range [0,1]: ", paste(weights[weights < 0.0 | weights > 1.0], collapse=", "));
    }

    if (zeroOneWeightsOnly && any(weights > 0 & weights < 1)) {
      weights <- round(weights);
      warning("Weights were rounded to {0,1} since '", method, "' normalization supports only zero-one weights.");
    }

    weights <- as.vector(weights);
    if (length(weights) == 1) {
      weights <- rep(weights, length.out=nrow(X));
    } else if (length(weights) != nrow(X)) {
      stop("Argument 'weights' does not have the same length as the number of data points (rows) in the matrix: ", length(weights), " != ", nrow(X));
    }
    datapointWeights <- weights;
  } # if (!is.null(weights))


  # Argument: 'bandwidth'
  if (is.null(bandwidth)) {
    bandwidths <- c("loess"=0.75, "lowess"=0.3, "robustSpline"=0.75,
                    "spline"=0.75);
    bandwidth <- bandwidths[method];
  } else if (!is.numeric(bandwidth) || bandwidth <= 0 || bandwidth > 1) {
    stop("Argument 'bandwidth' must be in [0,1): ", bandwidth);
  } else if (length(bandwidth) != 1) {
    stop("Argument 'bandwidth' must be a scalar: ", paste(bandwidth, collapse=", "));
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # 2. Prepare data
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Convert non-positive signals to NaN. If not done here, the transform
  # (R,G) -> (A,M) -> (R,G) will no it.
  X[X <= 0] <- NaN;

  # Use only positive non-saturated observations to estimate the
  # normalization function
  isValid <- (is.finite(X) & (X <= satSignal));
  isValid <- (isValid[,1] & isValid[,2]);
  Y <- X[isValid,];

  if (!is.null(datapointWeights))
    datapointWeights <- datapointWeights[isValid];

  M <- log(Y[,1]/Y[,2], base=2);
  A <- log(Y[,1]*Y[,2], base=2)/2;
  # Not needed anymore
  Y <- NULL;

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # 3. Estimate the model
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (method == "lowess") {
    incl <- if (!is.null(datapointWeights)) (datapointWeights > 0) else TRUE;
    fit <- lowess(x=A[incl], y=M[incl], f=bandwidth, ...);
    fit$predictM <- function(newA) approx(fit, xout=newA, ties=mean)$y;
  } else if (method == "loess") {
    fit <- loess(formula=M ~ A, weights=datapointWeights,
                 family="symmetric", degree=1, span=bandwidth,
                 control=loess.control(trace.hat="approximate",
                 iterations=5, surface="direct"), ...);

    fit$predictM <- function(newA) predict(fit, newdata=newA);
  } else if (method == "spline") {
    incl <- if (!is.null(datapointWeights)) (datapointWeights > 0) else TRUE;
    fit <- smooth.spline(x=A[incl], y=M[incl], spar=bandwidth, ...);
    fit$predictM <- function(newA) predict(fit, x=newA)$y;
  } else if (method == "robustSpline") {
    fit <- robustSmoothSpline(x=A, y=M, w=datapointWeights, spar=bandwidth, ...);
    fit$predictM <- function(newA) predict(fit, x=newA)$y;
  }

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # 4. Normalize
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Normalize all data
  M <- log(X[,1]/X[,2], base=2);
  A <- log(X[,1]*X[,2], base=2)/2;

  ok <- is.finite(A);
  M[ok] <- M[ok] - fit$predictM(A[ok]);
  # Not needed anymore
  ok <- NULL;
  X[,1] <- as.matrix(sqrt(2^(2*A+M)));
  X[,2] <- as.matrix(sqrt(2^(2*A-M)));
  # Not needed anymore
  A <- M <- NULL;

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # 5. Return the normalized data
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  attr(X, "modelFit") <- fit;
  X;
}) # normalizeCurveFit()


setMethodS3("normalizeLowess", "matrix", function(X, ...) {
  normalizeCurveFit(X, method="lowess", ...);
})

setMethodS3("normalizeLoess", "matrix", function(X, ...) {
  normalizeCurveFit(X, method="loess", ...);
})

setMethodS3("normalizeSpline", "matrix", function(X, ...) {
  normalizeCurveFit(X, method="spline", ...);
})

setMethodS3("normalizeRobustSpline", "matrix", function(X, ...) {
  normalizeCurveFit(X, method="robustSpline", ...);
})


############################################################################
# HISTORY:
# 2013-09-26
# o Now utilizing anyMissing().
# 2005-06-03
# o Added argument 'typeOfWeights' to make it similar to other normalization
#   methods, although only "datapoint" weights are allowed.
# o Removed argument '.fitOnly'.
# o renamed all "robust.spline" to "robustSpline".
# 2005-03-23
# o Updated normalizeCurveFit() so that approx() does not give warnings
#   about 'Collapsing to unique x values' when doing lowess normalization.
# 2005-02-02
# o Zero-one weights are now round off by round(w).
# o BUG FIX: Forgot to adjust weights vector in normalizeCurveFit() when
#   removing non-finite values from data.
# 2005-02-01
# o Added argument '.fitOnly'.
# 2005-01-24
# o Create an Rdoc example with MvsA and MvsM comparisons.
# 2005-01-23
# o Added aliases normalizeLowess() and normalizeLoess().
# o Created from normalizeCurveFit() in MAData().
############################################################################
HenrikBengtsson/aroma.light documentation built on July 3, 2023, 1:57 a.m.