R/readCdf.R
In affxparser: Affymetrix File Parsing SDK

Documented in readCdf

#########################################################################/**
# @RdocFunction readCdf
#
# @title "Parsing a CDF file using Affymetrix Fusion SDK"
#
# \description{
#  @get "title".
#  This function parses a CDF file using the Affymetrix Fusion SDK.
#  \emph{This function will most likely be replaced by the more
#        general \code{\link{readCdfUnits}()} function.}
# }
#
# \usage{
#  readCdf(filename, units=NULL,
#          readXY=TRUE, readBases=TRUE,
#          readIndexpos=TRUE, readAtoms=TRUE,
#          readUnitType=TRUE, readUnitDirection=TRUE,
#          readUnitNumber=TRUE, readUnitAtomNumbers=TRUE,
#          readGroupAtomNumbers=TRUE, readGroupDirection=TRUE,
#          readIndices=FALSE, readIsPm=FALSE,
#          stratifyBy=c("nothing", "pmmm", "pm", "mm"),
#          verbose=0)
# }
#
# \arguments{
#  \item{filename}{The filename of the CDF file.}
#  \item{units}{An @integer @vector of unit indices
#    specifying which units to be read.  If @NULL, all units are read.}
#  \item{readXY}{If @TRUE, cell row and column (x,y) coordinates are
#     retrieved, otherwise not.}
#  \item{readBases}{If @TRUE, cell P and T bases are retrieved, otherwise not.}
#  \item{readIndexpos}{If @TRUE, cell indexpos are retrieved, otherwise not.}
#  \item{readExpos}{If @TRUE, cell "expos" values are retrieved, otherwise not.}
#  \item{readUnitType}{If @TRUE, unit types are retrieved, otherwise not.}
#  \item{readUnitDirection}{If @TRUE, unit directions are retrieved, otherwise not.}
#  \item{readUnitNumber}{If @TRUE, unit numbers are retrieved, otherwise not.}
#  \item{readUnitAtomNumbers}{If @TRUE, unit atom numbers are retrieved, otherwise not.}
#  \item{readGroupAtomNumbers}{If @TRUE, group atom numbers are retrieved, otherwise not.}
#  \item{readGroupDirection}{If @TRUE, group directions are retrieved, otherwise not.}
#  \item{readIndices}{If @TRUE, cell indices \emph{calculated} from
#    the row and column (x,y) coordinates are retrieved, otherwise not.
#     Note that these indices are \emph{one-based}.}
#  \item{readIsPm}{If @TRUE, cell flags indicating whether the cell
#    is a perfect-match (PM) probe or not are retrieved, otherwise not.}
#  \item{stratifyBy}{A @character string specifying which and how
#    elements in group fields are returned.
#    If \code{"nothing"}, elements are returned as is, i.e. as @vectors.
#    If \code{"pm"}/\code{"mm"}, only elements corresponding to
#    perfect-match (PM) / mismatch (MM) probes are returned (as @vectors).
#    If \code{"pmmm"}, elements are returned as a matrix where the
#    first row holds elements corresponding to PM probes and the second
#    corresponding to MM probes.  Note that in this case, it is assumed
#    that there are equal number of PMs and MMs; if not, an error is
#    generated.
#    Moreover, the PMs and MMs may not even be paired, i.e. there is no
#    guarantee that the two elements in a column corresponds to a
#    PM-MM pair.}
#  \item{verbose}{An @integer specifying the verbose level. If 0, the
#    file is parsed quietly.  The higher numbers, the more details.}
# }
#
# \value{
#   A list with one component for each unit. Every component is again a
#   list with three components
#   \item{groups}{This is again a list with one component for each group
#     (also called block). The information on each group is a list with 5
#     components, \code{x}, \code{y}, \code{pbase}, \code{tbase},
#   \code{expos}.}
#   \item{type}{type of the unit.}
#   \item{direction}{direction of the unit.}
# }
#
# \note{
#   This version of the function does not return information on the QC
#   probes. This will be added in a (near) future release. In addition we
#   expect the header to be part of the returned object.
#
#   So expect changes to the structure of the value of the function in
#   next release. Please contact the developers for details.
# }
#
# \section{Cell indices are one-based}{
#   Note that in \pkg{affxparser} all \emph{cell indices} are by
#   convention \emph{one-based}, which is more convenient to work
#   with in \R.  For more details on one-based indices, see
#   @see "2. Cell coordinates and cell indices".
# }
#
# \author{
#  James Bullard and
#  Kasper Daniel Hansen.
# }
#
# \seealso{
#   It is recommended to use @see "readCdfUnits" instead of this method.
#   @see "readCdfHeader" for getting the header of a CDF file.
# }
#
# \references{
#   [1] Affymetrix Inc, Affymetrix GCOS 1.x compatible file formats,
#       June 14, 2005.
#       \url{http://www.affymetrix.com/support/developer/}
# }
#
# @keyword "file"
# @keyword "IO"
# @keyword "internal"
#*/#########################################################################
readCdf <- function(filename, units=NULL, readXY=TRUE, readBases=TRUE,
                    readIndexpos=TRUE, readAtoms=TRUE,
                    readUnitType=TRUE, readUnitDirection=TRUE,
                    readUnitNumber=TRUE, readUnitAtomNumbers=TRUE,
                    readGroupAtomNumbers=TRUE, readGroupDirection=TRUE,
                    readIndices=FALSE, readIsPm=FALSE,
                    stratifyBy=c("nothing", "pmmm", "pm", "mm"),
                    verbose=0) {
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Validate arguments
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    filename <- file.path(dirname(filename), basename(filename))
    if (!file.exists(filename))
        stop("File not found: ", filename)
    ## Argument 'units':
    if (! (is.null(units) || is.numeric(units))) {
        stop("Argument 'units' must be numeric or NULL: ", class(units)[1])
    }
    ## other arguments
    stratifyBy <- match.arg(stratifyBy)
    returnUnitType <- as.logical(readUnitType)
    returnUnitDirection <- as.logical(readUnitDirection)
    returnUnitAtomNumbers <- as.logical(readUnitAtomNumbers)
    returnUnitNumber <- as.logical(readUnitNumber)
    returnXY <- as.logical(readXY)
    returnIndices <- as.logical(readIndices)
    returnBases <- as.logical(readBases)
    returnAtoms <- as.logical(readAtoms)
    returnIndexpos <- as.logical(readIndexpos)
    returnIsPm <- as.logical(readIsPm)
    returnBlockDirection <- as.logical(readGroupDirection)
    returnBlockAtomNumbers <- as.logical(readGroupAtomNumbers)


    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Read the CDF structure
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # UNSUPPORTED CASE?
    if (!is.null(units) && length(units) == 0L) {
      stop("readCdf(..., units=integer(0)) is not supported.")
    }

    cdf <- .Call("R_affx_get_cdf_file",
                 filename, as.integer(units), as.integer(verbose),
                 returnUnitType, returnUnitDirection,
                 returnUnitAtomNumbers, returnUnitNumber,
                 returnXY, returnIndices,
                 returnBases, returnAtoms, returnIndexpos, returnIsPm,
                 returnBlockDirection, returnBlockAtomNumbers,
                 PACKAGE="affxparser")

    # Sanity check
    if (is.null(cdf)) {
      stop("Failed to read CDF file: ", filename);
    }

    if (stratifyBy == "nothing")
        return(cdf);


    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Stratify by PM/MM
    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    isPm <- readCdfIsPm(filename, units=units);

    ## Using .subset2() instead of "[["() to avoid dispatching overhead etc.
    if (stratifyBy == "pmmm") {
        dimnames <- list(c("pm", "mm"), NULL);

        for (uu in seq_along(cdf)) {
            groups <- .subset2(.subset2(cdf, uu), "groups");
            ngroups <- length(groups);
            if (ngroups == 0)
                next;

            for (gg in 1:ngroups) {
                group <- .subset2(groups, gg);
                groupNoStratifyNames <- intersect(c("groupdirection",
                                                  "natoms", "ncellsperatom"),
                                                names(group))
                if(length(groupNoStratifyNames) > 0) {
                    groupNoStratifyElements <- group[groupNoStratifyNames]
                    group <- group[setdiff(names(group), c("groupdirection",
                                                  "natoms", "ncellsperatom"))]
                }
                pm <- .subset2(.subset2(isPm, uu), gg);
                idx <- 1:length(pm);
                mm <- idx[!pm];  # Note: which(!pm) is about 60% slower! /HB
                pm <- idx[pm];
                npm <- length(pm);
                if (npm != length(mm)) {
                    ## This is not expected to happen, but just in case.
                    stop("Number of PM and MM probes differ in probeset #", uu,
                         ": ", length(pm), " != ", length(mm));
                }
                pmmm <- matrix(c(pm, mm), nrow=2L, ncol=npm, byrow=TRUE);

                ## Re-order cell elements according to PM/MM.
                ngroup <- length(group);
                if (ngroup > 0) {
                    dim <- c(2, npm);
                    for (kk in 1:ngroup) {
                        ## value <- group[[kk]][pmmm];
                        value <- .subset(.subset2(group, kk), pmmm);
                        dim(value) <- dim;
                        dimnames(value) <- dimnames;
                        group[[kk]] <- value;
                    }
                }

                if(length(groupNoStratifyNames) > 0) {
                    group <- c(group, groupNoStratifyElements)
                }
                groups[[gg]] <- group;
            } # for (gg ...)
            cdf[[uu]]$groups <- groups;
        } # for (uu ...)
    } else if (stratifyBy == "pm") {
        for (uu in seq_along(cdf)) {
            groups <- cdf[[uu]]$groups;
            ngroups <- length(groups);
            if (ngroups == 0)
                next;

            for (gg in 1:ngroups) {
                group <- groups[[gg]];
                groupNoStratifyNames <- intersect(c("groupdirection",
                                                  "natoms", "ncellsperatom"),
                                                names(group))
                if(length(groupNoStratifyNames) > 0) {
                    groupNoStratifyElements <- group[groupNoStratifyNames]
                    group <- group[setdiff(names(group), c("groupdirection",
                                                  "natoms", "ncellsperatom"))]
                }
                ngroup <- length(group);
                if (ngroup == 0)
                    next;

                pm <- .subset2(.subset2(isPm, uu), gg);
                pm <- (1:length(pm))[pm]; # Note: which(!pm) is about 60% slower!
                for (kk in 1:ngroup) {
                    group[[kk]] <- .subset(.subset2(group, kk), pm);
                }
                if(length(groupNoStratifyNames) > 0) {
                    group <- c(group, groupNoStratifyElements)
                }
                groups[[gg]] <- group;
            } # for (gg ...)
            cdf[[uu]]$groups <- groups;
        } # for (uu ...)
    } else if (stratifyBy == "mm") {
        for (uu in seq_along(cdf)) {
            groups <- cdf[[uu]]$groups;
            ngroups <- length(groups);
            if (ngroups == 0)
                next;

            for (gg in 1:ngroups) {
                group <- groups[[gg]];
                groupNoStratifyNames <- intersect(c("groupdirection",
                                                  "natoms", "ncellsperatom"),
                                                names(group))
                if(length(groupNoStratifyNames) > 0) {
                    groupNoStratifyElements <- group[groupNoStratifyNames]
                    group <- group[setdiff(names(group), c("groupdirection",
                                                  "natoms", "ncellsperatom"))]
                }
                ngroup <- length(group);
                if (ngroup == 0)
                    next;

                pm <- .subset2(.subset2(isPm, uu), gg);
                mm <- (1:length(pm))[!pm]; # Note: which(!pm) is about 60% slower!
                for (kk in 1:ngroup) {
                    group[[kk]] <- .subset(.subset2(group, kk), mm);
                }
                if(length(groupNoStratifyNames) > 0) {
                    group <- c(group, groupNoStratifyElements)
                }
                groups[[gg]] <- group;
            } # for (gg ...)
            cdf[[uu]]$groups <- groups;
        } # for (uu ...)
    }

    cdf;
} # readCdf()


############################################################################
# HISTORY:
# 2011-11-18
# o ROBUSTNESS: Added sanity check that the native code did not return NULL.
# 2011-02-15
# o DOCUMENTATION: Converted the Rd to an Rdoc comment in this file.
# 2010-12-12
# o ROBUSTNESS: Replaces .Internal(matrix(...)) with matrix().
#   In the upcoming R 2.13.0 matrix() has less overhead.
############################################################################