R/incrementalRMA.R

Defines functions cleanplatformname medianPolishIncremental quantileNormalizeIncremental incrementalRMA

Documented in cleanplatformname incrementalRMA medianPolishIncremental quantileNormalizeIncremental

#' Incremental RMA
#' @description Perform incremental RMA algorithm on CEL files.
#'
#' @details
#' RMA is a robust multiarray average expression (see \code{\link[affy]{rma}}).
#' It is one of the de facto approaches for processing Affymetrix GeneChip
#' data.
#'
#' One disadvantage of RMA is that parameters are calculated on the set of
#' arrays during the normalization and probeset summarization process. This
#' works out for studies in which all samples are available at once. In
#' the case of adding new samples into a dataset, the entire set must be
#' reprocessed with RMA. Making it even more challenging, this reprocessing
#' will result in slightly different parameters. These different parameters
#' lead in turn to slightly different gene expression measurements.
#'
#' This package gains inspiration from the \code{frma} package, as well
#' as earlier efforts including refRMA and incrementalRMA. The goal of
#' the package is to use a set of data to pre-calculate necessary parameters
#' for RMA. Then, when a new sample is made available these stored parameters
#' can be applied to the new sample using the RMA approach. The result should
#' provide gene expression estimates similar to those used when the entire
#' dataset is used together.
#'
#' @note
#' A test of the package is the following: store the parameters from
#' normalizing a dataset D. Then apply incrementalRMA to each of the
#' raw samples in D. The resulting gene expression should be identical
#' in both cases.
#'
#' @param abatch An \code{\link[affy]{AffyBatch-class}} of cel files to
#' apply RMA to.
#' @param params An incremental parameter list as generated by
#'     \code{\link{parameterizeRMA}}.
#' @param calculate_error incrementalRMA errors can be calculated vs. what canonical RMA
#'     would produce.  See \code{\link{calculateIncrementalRMAError}} for a discussion
#'     of the concept and interpretation.
#'
#' @return A \code{\link[Biobase]{ExpressionSet-class}} object representing the
#' RMA normalized AffyBatch object.
#' @export
#'
#' @examples
#' \dontrun{
#' incrementalRMA(AffyBatch, params=list(probeEffects=(),normalizationVector=()))
#' }
incrementalRMA<-function(abatch, params=NULL, calculate_error = FALSE) {
  stopifnot("Input object must be an AffyBatch" = class(abatch) == "AffyBatch")
  stopifnot("Parameters do not contain all necessary values." =
              is.null(params) |
              all(c("probeEffects","normalizationVector") %in% names(params)))
  stopifnot("Error calculation cannot be done without referenceCELFiles in params."=
              ! calculate_error | utils::hasName(params, "referenceCELFiles"))
  stopifnot("Parameter vector does not match AffyBatch names." =
              is.null(params) |
              identical(affy::probeNames(abatch),
                                          names(params$normalizationVector))
  )

  # Set up needed variables for function
  se.exprs<-NULL

  ## RMA Step 1: Background correction - does not require parameterization.
  message("Background Correcting ...")
  abatch_bg <- affy::bg.correct.rma(abatch)

  ## RMA Step 2: Quantile normalization - requires parameterization
  message("Normalizing ...")
  pms <- quantileNormalizeIncremental(abatch_bg, params)

  ## RMA Step 3: Summarization with median polish - requires parameterization
  message("Summarizing ...")
  exprs <- medianPolishIncremental(pms,
                                   probe_names = affy::probeNames(abatch_bg),
                                   params)
  ### Add column names to the results, so this doesn't get lost.
  colnames(exprs) <- affy::sampleNames(abatch_bg)

  ## Extra step: If CEL files used for original calculations are included
  ## in params, we can calculate error between incremental RMA and canonical RMA.
  if ( calculate_error )
    se.exprs <- calculateIncrementalRMAError(exprs, abatch, params)

  ## Create the ExpressionSet object, copying the AffyBatch information
  ## where possible.
  if (is.null(se.exprs))
    assayData <- Biobase::assayDataNew(exprs=exprs)
  else
    assayData <- Biobase::assayDataNew(exprs = exprs,  se.exprs = se.exprs)

  e <- Biobase::ExpressionSet(assayData = assayData,
                            annotation = cleanplatformname(abatch),
                            phenoData = Biobase::phenoData(abatch),
                            experimentData = Biobase::experimentData(abatch),
                            protocolData = Biobase::protocolData(abatch))

  ## Finally, the expression set is returned.
  e

}




#' Incremental quantile normalization
#'
#' @description Use existing parameters to quantile normalize an AffyBatch.
#'
#' @details
#' An AffyBatch is normalized using a quantile normalization approach.
#' In the case of an incremental algorith, the pre-calculated quantiles
#' can be used to normalize new data against. This has the benefit of
#' not requiring calculation of quantiles across a set, instead just
#' applying a stored version.
#'
#' @note The incremental quantile normalization information is stored
#' within the params object as the list \code{normalizationVector}.
#'
#' If the params object is NULL then a default quantile normalization
#' is performed.
#'
#' @param abatch An AffyBatch to quantile normalize.
#' @param params An incremental parameter list as generated by
#'     \code{\link{parameterizeRMA}}
#'
#' @return A matrix of pm (perfect match) values that have been normalized.
#' @export
#'
#' @examples
#' \dontrun{
#' quantileNormalizeIncremental(AffyBatch, params=list("normalizationVector"=()))
#' }
quantileNormalizeIncremental<-function(abatch, params=NULL) {

  if ( utils::hasName(params,"normalizationVector"))
    preprocessCore::normalize.quantiles.use.target(affy::pm(abatch),
                                                    params$normalizationVector)
  else
    preprocessCore::normalize.quantiles(affy::pm(abatch))

}


#' Median polish fit
#'
#' @description Given an existing median polish model, apply to new data and
#' return resulting median polish.
#'
#' @details
#'
#' The RMA approach involves using median polish to summarize a set of
#' probes for a set of samples. Briefly, median polish removes row
#' medians then column medians repeatedly until a stopping criteria
#' is achieved. See \code{\link[stats]{medpolish}} for details on the
#' median polish.
#'
#' The incremental nature of this median polish approach means that
#' final state probe-wise effects have already been calculated (and
#' stored). Thus for a sample, the precalculated probe-wise effects
#' are subtracted from the signal and a probeset-wide median is
#' calculated. This is the used as the probeset expression estimate.
#'
#' @note The incremental median polish information is stored
#' within the params object as the list \code{probeEffects}.
#'
#' If the params object is NULL then a default median polish
#' is performed.
#'
#' @param pms Perfect-match matrix representing the gene expression.
#' @param probe_names Array of probenames such as returned from
#'     \code{\link[affy]{probeNames-methods}}
#' @param params An incremental parameter list as generated by
#'     \code{\link{parameterizeRMA}}
#'
#' @return Expression values corresponding to probesets defined by the probe_names parameter.
#' @export
#'
#' @examples
#'
#' \dontrun{
#' medianPolishIncremental(pm(AffyBatch), probeNames(AffyBatch), NULL)
#' }
#'
medianPolishIncremental <- function(pms, probe_names, params=NULL) {

  stopifnot("PMs must be a matrix." = is.matrix(pms),
            "Probe names must match pm matrix." = length(probe_names) == nrow(pms)
  )

  # Default if params not provided
  probeEffects<-params[["probeEffects"]] %or% rep(0,nrow(pms))

  # The algorithm is subtract the row (probe) effects from
  # signal, then take the median of the sample (per probeset).
  exprs <- preprocessCore::subColSummarizeMedianpolish(
    (log2(pms) - probeEffects), probe_names)

  exprs
}


#' Clean platform name
#'
#' @description
#' Return a cleaned up platform name from based on an
#' AffyBatch cdf name.
#'
#' @details
#' An AffyBatch records the CDF suitable for processing the
#' GeneChip data. Embedded in this cdf is the name of the
#' platform itself. This routine cleans up the cdf and
#' trims off the "cdf" on the platform name.
#'
#' The \code{\link[affy]{AffyBatch-class} cdfName} function extracts a cdf
#' and the \code{\link[affy]{cleancdfname}} function cleans
#' up problems with the cdf name. However, this function goes
#' beyond this by stripping off the cdf text. This code
#' was taken from the \code{\link[frma]{frma}} package.
#'
#' @param abatch AffyBatch object to extract platform name from.
#'
#' @return A string representing a cleaned up platform name.
#'
#' @examples
#' \dontrun{
#' cleanplatformname(AffyBatch)
#' }
#'
cleanplatformname<-function(abatch) {

  ## determine data platform (from frma)
  cdfname <- affy::cleancdfname(affy::cdfName(abatch))
  tmp <- gsub("cdf","",cdfname)
  platform <- gsub("..entrezg", "", tmp)

  platform
}
steveneschrich/IncrementalRMA documentation built on Dec. 23, 2021, 5:32 a.m.