affxparser: Affymetrix File Parsing SDK

Documented in readCdfDataFrame

#########################################################################/**
# @RdocFunction readCdfDataFrame
#
# @title "Reads units (probesets) from an Affymetrix CDF file"
#
# @synopsis
#
# \description{
#  @get "title". Gets all or a subset of units (probesets).
# }
#
# \arguments{
#  \item{filename}{The filename of the CDF file.}
#  \item{units}{An @integer @vector of unit indices
#    specifying which units to be read.  If @NULL, all are read.}
#  \item{groups}{An @integer @vector of group indices
#    specifying which groups to be read.  If @NULL, all are read.}
#  \item{cells}{An @integer @vector of cell indices
#    specifying which cells to be read.  If @NULL, all are read.}
#  \item{fields}{A @character @vector specifying what fields to read.
#    If @NULL, all unit, group and cell fields are returned.}
#  \item{drop}{If @TRUE and only one field is read, then a @vector
#    (rather than a single-column @data.frame) is returned.}
#  \item{verbose}{An @integer specifying the verbose level. If 0, the
#    file is parsed quietly.  The higher numbers, the more details.}
# }
#
# \value{
#   An NxK @data.frame or a @vector of length N.
# }
#
# @author "HB"
#
# @examples "../incl/readCdfDataFrame.Rex"
#
# \seealso{
#   For retrieving the CDF as a @list structure, see
#   @see "affxparser::readCdfUnits".
# }
#
# \references{
#   [1] Affymetrix Inc, Affymetrix GCOS 1.x compatible file formats,
#       June 14, 2005.
#       \url{http://www.affymetrix.com/support/developer/}
# }
#
# @keyword "file"
# @keyword "IO"
# @keyword "internal"
#*/#########################################################################
readCdfDataFrame <- function(filename, units=NULL, groups=NULL, cells=NULL, fields=NULL, drop=TRUE, verbose=0) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'filename':
  filename <- file.path(dirname(filename), basename(filename));
  if (!file.exists(filename))
    stop("File not found: ", filename);

  # Argument 'units':
  if (is.null(units)) {
  } else if (is.numeric(units)) {
    units <- as.integer(units);
    if (any(units < 1))
      stop("Argument 'units' contains non-positive indices.");
  } else {
    stop("Argument 'units' must be numeric or NULL: ", class(units)[1]);
  }

  # Argument 'groups':
  if (is.null(groups)) {
  } else if (is.numeric(groups)) {
    groups <- as.integer(groups);
    if (any(groups < 1))
      stop("Argument 'groups' contains non-positive indices.");
  } else {
    stop("Argument 'groups' must be numeric or NULL: ", class(groups)[1]);
  }

  # Argument 'fields':
##   knownUnitFields <- c("unit", "unitName", "unitDirection", "nbrOfUnitAtoms", "unitSize", "unitNumber", "unitType", "nbrOfGroups", "mutationType");
##   knownGroupFields <- c("group", "groupName", "nbrOfGroupAtoms", "groupSize", "firstAtom", "lastAtom", "groupDirection");
##   knownCellFields <- c("cell", "x", "y", "probeSequence", "feat", "qual", "expos", "pos", "cbase", "pbase", "tbase", "atom", "index");

  if (is.null(fields)) {
    knownUnitFields <- c("unit", "unitName", "unitType",
                         "unitDirection", "unitNbrOfAtoms");
    knownGroupFields <- c("group", "groupName", "groupDirection",
                          "groupNbrOfAtoms");
    knownCellFields <- c("cell", "x", "y", "pbase", "tbase",
                         "indexPos", "atom", "expos");
    fields <- c(knownUnitFields, knownGroupFields, knownCellFields);
  }

  # Argument 'verbose':
  if (length(verbose) != 1)
    stop("Argument 'verbose' must be a single integer.");
  verbose <- as.integer(verbose);
  if (!is.finite(verbose))
    stop("Argument 'verbose' must be an integer: ", verbose);


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Prepare the arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  readFields <- c(fields, "cell");  # Need to read one cell field!
  readFields <- unique(readFields);

  # Unit fields
  readUnitType <- ("unitType" %in% readFields);
  readUnitDirection <- ("unitDirection" %in% readFields);
  readUnitNumber <- ("unitNumber" %in% readFields);
  readUnitAtomNumbers <- ("unitNbrOfAtoms" %in% readFields);

  # Group fields
  readGroupDirection <- ("groupDirection" %in% readFields);
  readGroupAtomNumbers <- ("groupNbrOfAtoms" %in% readFields);

  # Cell fields
  readXY <- any(c("x", "y") %in% readFields);
  readIndices <- ("cell" %in% readFields);
  readBases <- any(c("tbase", "pbase") %in% readFields);
  readIndexpos <- ("indexPos" %in% readFields);
  readExpos <- ("expos" %in% readFields);
  readAtoms <- ("atom" %in% readFields);


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Query the CDF
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # UNSUPPORTED CASE?
  if (!is.null(units) && length(units) == 0L) {
    stop("readCdfDataFrame(..., units=integer(0)) is not supported.")
  }

  cdf <- readCdf(filename, units=units, readXY=readXY, readBases=readBases,
    readIndexpos=readIndexpos, readAtoms=readAtoms,
    readUnitType=readUnitType, readUnitDirection=readUnitDirection,
    readUnitNumber=readUnitNumber, readUnitAtomNumbers=readUnitAtomNumbers,
    readGroupAtomNumbers=readGroupAtomNumbers,
    readGroupDirection=readGroupDirection, readIndices=readIndices,
    verbose=verbose-1);

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Flatten CDF list structure unit by unit
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (is.null(units))
    units <- seq_along(cdf);  ## FIX ME
  groupIdxs <- groups;

  unitNames <- names(cdf);
  for (uu in seq_along(cdf)) {
    unit <- .subset2(cdf, uu);
    unitName <- .subset(unitNames, uu);

    if (verbose >= 1) {
      if (uu %% 500 == 1) {
        unitsLeft <- length(cdf) - uu + 1;
        cat(unitsLeft, ", ", sep="");
      }
    }

    groups <- .subset2(unit, "groups");
    unit[["groups"]] <- NULL;

    # Translate unit names (has to be done here because not unique)
    names <- names(unit);
    names <- sub("ncells", "unitNbrOfCells", names);
    names <- sub("natoms", "unitNbrOfAtoms", names);
    names <- sub("unitnumber", "unitNumber", names);
    names(unit) <- names;

    unitData <- list(unit=.subset(units, uu), unitName=unitName);
    unitData <- c(unitData, unit);

    # Extract groups of interest?
    if (is.null(groupIdxs)) {
      ggs <- seq_along(groups);
    } else {
      keep <- which(seq_along(groups) %in% groupIdxs);
      groups <- .subset(groups, keep);
      ggs <- groupIdxs;
    }

    # Flatten (group, cell) data
    groupNames <- names(groups);
    for (gg in seq_along(ggs)) {
      group <- .subset2(groups, gg);
      groupName <- .subset(groupNames, gg);

      groupData <- list(group=.subset(ggs, gg), groupName=groupName);

      # Extract group fields
      keys <- c("groupdirection", "natoms", "ncellsperatom", "ncells");
      idxs <- which(names(group) %in% keys);
      if (length(idxs) > 0) {
        groupData <- c(groupData, .subset(group, idxs));
        group <- .subset(group, -idxs);
      }

      # Extract cell fields
      cellData <- as.data.frame(group, stringsAsFactors=FALSE);

      # Extract cells of interest?
      if (!is.null(cells)) {
        keep <- (seq_len(nrow(cellData)) %in% cells);
        cellData <- cellData[keep,,drop=FALSE];
      }

      # Expand group fields
      nbrOfCells <- nrow(cellData);
      for (key in names(groupData)) {
        groupData[[key]] <- rep(.subset2(groupData, key), times=nbrOfCells);
      }
      groupData <- as.data.frame(groupData, stringsAsFactors=FALSE);

      group <- cbind(groupData, cellData);

      groups[[gg]] <- group;
    }

    # Stack (rbind) groups
    stackedGroups <- NULL;
    for (gg in seq_along(groups)) {
      stackedGroups <- rbind(stackedGroups, .subset2(groups, gg));
    }

    nbrOfCells <- nrow(stackedGroups);
    for (key in names(unitData)) {
      unitData[[key]] <- rep(.subset2(unitData, key), times=nbrOfCells);
    }
    unitData <- as.data.frame(unitData, stringsAsFactors=FALSE);

    unit <- cbind(unitData, stackedGroups);

    cdf[[uu]] <- unit;
  } # for (uu ...)

  if (verbose >= 1) {
    cat("0.\n");
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Flatten the remaining list structure
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Allocate "data frame" of right size
  unitSizes <- sapply(cdf, FUN=nrow);
  nbrOfCells <- sum(unitSizes);
  dataTypes <- sapply(.subset2(cdf, 1), FUN=storage.mode);
  df <- vector("list", length(dataTypes));
  names(df) <- names(dataTypes);
  for (key in names(df)) {
    df[[key]] <- vector(.subset2(dataTypes, key), length=nbrOfCells);
  }

  # Copy values from the CDF list structure
  offset <- 0;
  for (uu in seq_along(cdf)) {
    data <- .subset2(cdf, uu);
    nrow <- nrow(data);
    idxs <- offset + 1:nrow;
    for (key in names(df)) {
      df[[key]][idxs] <- .subset2(data, key);
    }
    offset <- offset + nrow;
    cdf[[uu]] <- NA;
  }

  names <- names(df);

  # Translate unit names
  names <- sub("unittype", "unitType", names);
  names <- sub("unitdirection", "unitDirection", names);
  names <- sub("ncellsperatom", "unitNbrOfCellsPerAtom", names);
  # Translate group names
  names <- sub("groupdirection", "groupDirection", names);
  names <- sub("natoms", "groupNbrOfAtoms", names);
  names <- sub("ncellsperatom", "groupNbrOfCellsPerAtom", names);
  names <- sub("ncells", "groupNbrOfCells", names);
  # Translate cell names
  names <- sub("indices", "cell", names);
  names <- sub("indexpos", "indexPos", names);
  names(df) <- names;

  # Extract fields of interest
  unknown <- setdiff(fields, names(df))
  if (length(unknown) > 0) {
    warning("Some of the fields were not read: ",
                                   paste(unknown, collapse=", "));
  }
  fields <- intersect(fields, names(df));
  df <- .subset(df, fields);


  # Make it a valid data frame
  if (drop && length(df) == 1) {
    df <- .subset2(df, 1);
  } else {
    attr(df, "row.names") <- .set_row_names(nbrOfCells);
    attr(df, "class") <- "data.frame";
  }

  df;
} # readCdfDataFrame()


############################################################################
# HISTORY:
# 2008-04-03 [HB]
# o Now the renaming of fields is mostly done at the end and not in every
#   iteration.  That speeds up the process 5-10%.
# o Replaced all [[() and [() with .subset2() and .subset(), respectively.
#   This should make it a little bit faster.
# o For my record, some benchmarking on readCdfDataFrame():
#   Mapping10K_Xba142.cdf: 408,400x16, 23.3Mb, 610 seconds.
# 2008-03-24 [HB]
# o Created a readCdfDataFrame() that uses readCdf() to read the data and
#   then restructures it.  This will be used as a reference for the final
#   implementation.
# o Created a stub for readCdfDataFrame().
############################################################################