R/bpmapCluster2Cdf.R

###########################################################################/**
# @RdocDefault bpmapCluster2Cdf
#
# @title "Creates a CDF from tiling-array BPMAP file"
#
# \description{
#   @get "title".\cr
#
#   \emph{
#    NOTE: This method applies only to Affymetrix tiling arrays!
#    Furthermore, it is likely to be more useful for promoter tiling arrays
#    and less so for whole-genome tiling arrays.
#   }
# }
#
# @synopsis
#
# \arguments{
#  \item{pathname}{The pathname to the BPMAP file.}
#  \item{chipType, tags}{The chip type and optional tags of the CDF to
#    be written.}
#  \item{rows, cols}{Two positive @integers specifying the probe dimension
#    of the chip.  It is important to get this correct.  They can be
#    inferred from the CEL header of a CEL file for this chip,
#    cf. @see "affxparser::readCelHeader".}
#  \item{maxProbeDistance}{A positive @integer specifying the maximum
#    genomic distance (in basepairs) allowed between two probes in order
#    to "cluster" those two probes into the same CDF units.  Whenever the
#    distance is greater, the two probes end up in two different CDF units.}
#  \item{minNbrOfProbes}{A positive @integer specifying the minimum number
#    of probes required in a CDF unit.  If fewer, those probes are dropped.}
#  \item{groupName}{A @character string specifying which BPMAP sequences
#     to keep.  Sequence with this group name is kept, all others are
#     excluded.}
#  \item{field}{A @character string.}
#  \item{stringRemove}{An (optional) regular expression.}
#  \item{...}{Optional arguments passed to @see "affxparser::readBpmap".}
#  \item{flavor}{Specifying which version of BPMAP-to-CDF generator
#     to use. The default is always to use the most recent one, which
#     is also the recommended one.  Previous versions are kept only for
#     backward compatibility (and may be dropped at anytime).}
#  \item{path}{The directory where the CDF file will be written.
#     If \code{"*"} (default), it will be written to
#     \code{annotationData/chipTypes/<chipType>/}.}
#  \item{verbose}{See @see "R.utils::Verbose".}
# }
#
# \value{
#  Returns (invisibly) a the pathname of the created CDF file.
#  The created CDF is written to the current directory.
# }
#
# \details{
#   This method applies only to Affymetrix tiling arrays.  It is likely
#   to be useful for promoter tiling arrays and less so for whole-genome
#   tiling arrays.
#   Flavor \code{"v2"} replaced \code{"v1"} as aroma.affymetrix v2.5.4
#   (June 21, 2012). For details, see \code{news(package="aroma.affymetrix")}.
# }
#
# \author{
#   Henrik Bengtsson adopted from Mark Robinson standalone/online version
#   as of July 11, 2011.
# }
#
# @keyword "internal"
#*/###########################################################################
setMethodS3("bpmapCluster2Cdf", "default", function(pathname, chipType, tags=NULL, rows, cols, maxProbeDistance=3000L, minNbrOfProbes=30L, groupName=gsub("_.*", "", chipType), field="fullname", stringRemove=sprintf("%s:.*;", groupName), ..., flavor=c("v2", "v1"), path="*", verbose=-10) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Local functions
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  getRangeX <- function(bpmapList) {
    pos <- lapply(bpmapList, FUN=.subset, c("mmx", "pmx"))
    pos <- unlist(pos, use.names=FALSE)
    range(pos, na.rm=TRUE)
  } # getRangeX()

  getRangeY <- function(bpmapList) {
    pos <- lapply(bpmapList, FUN=.subset, c("mmy", "pmy"))
    pos <- unlist(pos, use.names=FALSE)
    range(pos, na.rm=TRUE)
  } # getRangeY()

  getGroupNames <- function(bpmapList) {
    names <- sapply(bpmapList, FUN=function(seq) {
      seqInfo <- seq$seqInfo
      seqInfo$groupname
    })
    # Sanity check
    stopifnot(length(names) == length(bpmapList))
    names
  } # getGroupNames()

  getNumberOfProbes <- function(bpmapList) {
    sapply(bpmapList, FUN=function(u) u$seqInfo$numberOfHits)
  } # getNumberOfProbes()

  getStartPositions <- function(bpmapList) {
    lapply(bpmapList, FUN=function(u) u$startpos)
  } # getStartPositions()

  bpmapUnit2df <- function(u) {
    mmx <- u[["mmx"]]
    mmy <- u[["mmy"]]
    mmx <- if (all(mmx == 0L) || is.null(mmx)) 0L else mmx
    mmy <- if (all(mmy == 0L) || is.null(mmy)) 0L else mmy

    seqInfo <- u$seqInfo
    res <- data.frame(seqname=seqInfo[[field]], groupname=seqInfo$groupname, u[c("pmx", "pmy")], mmx=mmx, mmy=mmy, u[c("probeseq", "strand", "startpos", "matchscore")], stringsAsFactors=FALSE)

    # Order by start positions
    o <- order(u[["startpos"]])
    res <- res[o,,drop=FALSE]

    res
  } # bpmapUnit2df()


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validating arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'filename':
  pathname <- Arguments$getReadablePathname(pathname)

  # Argument 'chipType':
  chipType <- Arguments$getCharacter(chipType)

  # Argument 'tags':
  if (!is.null(tags)) {
    tags <- Arguments$getCharacters(tags)
  }

  # Argument 'minNbrOfProbes':
  minNbrOfProbes <- Arguments$getInteger(minNbrOfProbes, range=c(1,Inf))

  # Argument 'maxProbeDistance':
  maxProbeDistance <- Arguments$getInteger(maxProbeDistance, range=c(1,Inf))

  # Argument 'rows':
  rows <- Arguments$getInteger(rows, range=c(1,Inf))

  # Argument 'cols':
  cols <- Arguments$getInteger(cols, range=c(1,Inf))

  # Argument 'groupName':
  groupName <- Arguments$getCharacter(groupName)

  # Argument 'field':
  field <- Arguments$getCharacter(field)

  # Argument 'stringRemove':
  if (!is.null(stringRemove)) {
    stringRemove <- Arguments$getCharacter(stringRemove)
  }

  # Argument 'flavor':
  flavor <- match.arg(flavor)

  # Argument 'path':
  if (is.null(path)) {
    path <- "."
  } else {
    path <- Arguments$getCharacter(path)
  }

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)



  verbose && enter(verbose, "Generating CDF from BPMAP")
  chipTypeF <- paste(c(chipType, tags), collapse=",")
  verbose && cat(verbose, "Full chip type: ", chipTypeF)
  chipType <- gsub(",.*", "", chipTypeF)

  if (path == "*") {
    path <- file.path("annotationData", "chipTypes", chipType)
  }
  path <- Arguments$getWritablePath(path)
  verbose && cat(verbose, "Output path: ", path)


  cdfFilename <- sprintf("%s.cdf", chipTypeF)
  cdfFilename <- Arguments$getFilename(cdfFilename)
  verbose && cat(verbose, "CDF pathname: ", cdfFilename)
  cdfPathname <- Arguments$getWritablePathname(cdfFilename, path=path, mustNotExist=TRUE)

  ppsPathname <- sprintf("%s.pps", chipTypeF)
  ppsPathname <- Arguments$getFilename(ppsPathname)
  verbose && cat(verbose, "PPS pathname: ", ppsPathname)
  ppsPathname <- Arguments$getWritablePathname(ppsPathname, path=path, mustNotExist=TRUE)

  verbose && enter(verbose, "Reading BPMAP file")
  verbose && cat(verbose, "Pathname: ", pathname)
  hdr <- .readBpmapHeader(pathname)
  verbose && cat(verbose, "File version: ", hdr$version)
  verbose && cat(verbose, "Number of sequences: ", hdr$numSequences)
  bpmapList <- .readBpmap(pathname, readMatchScore=TRUE, ...)
  nbrOfSeqs <- length(bpmapList)
  # Sanity check
  stopifnot(nbrOfSeqs <= hdr$numSequences)
  # Not needed anymore
  hdr <- NULL; # Not needed anymore
  verbose && exit(verbose)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate the chip dimensions further.
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  maxY <- getRangeY(bpmapList)[2] + 1L
  if (maxY > rows) {
    throw("Argument 'rows' is too small. There exist probes with a Y position that is greater: ", maxY, " > ", rows)
  }

  maxX <- getRangeX(bpmapList)[2] + 1L
  if (maxX > cols) {
    throw("Argument 'cols' is too small. There exist probes with an X position that is greater: ", maxX, " > ", cols)
  }


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Filtering sequences
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Identifying sequences to be used in CDF")

  verbose && enter(verbose, "Extracting sequences with group names of interest")
  verbose && cat(verbose, "Group names to keep: ", groupName)
  nbrOfSeqs0 <- nbrOfSeqs
  verbose && cat(verbose, "Number of sequences before: ", nbrOfSeqs)
  verbose && cat(verbose, "Number of probes before: ", sum(getNumberOfProbes(bpmapList)))

  groupNames <- getGroupNames(bpmapList)
  verbose && cat(verbose, "Group names: ", hpaste(groupNames))
  t <- table(groupNames)
  verbose && cat(verbose, "Number of unique group names: ", length(t))
  verbose && print(verbose, t)
  # Sanity check
  stopifnot(length(groupNames) == nbrOfSeqs)

  keep <- (groupNames == groupName)
  bpmapList <- bpmapList[keep]
  nbrOfSeqs <- length(bpmapList)
  verbose && cat(verbose, "Number of sequences dropped: ", nbrOfSeqs0-nbrOfSeqs)
  verbose && cat(verbose, "Number of sequences after: ", nbrOfSeqs)
  verbose && cat(verbose, "Number of probes after: ", sum(getNumberOfProbes(bpmapList)))

  groupNames <- getGroupNames(bpmapList)
  verbose && cat(verbose, "Group names: ", hpaste(groupNames))
  t <- table(groupNames)
  verbose && cat(verbose, "Number of unique group names: ", length(t))
  verbose && print(verbose, t)

  verbose && exit(verbose)


  verbose && enter(verbose, "Excluding sequences that appears to be non-genomic control sequences (=all zero start positions)")
  verbose && cat(verbose, "Flavor: ", flavor)

  nbrOfSeqs0 <- nbrOfSeqs
  verbose && cat(verbose, "Number of sequences before: ", nbrOfSeqs)
  verbose && cat(verbose, "Number of probes before: ", sum(getNumberOfProbes(bpmapList)))

  spList <- getStartPositions(bpmapList)
  verbose && cat(verbose, "Start positions:")
  verbose && str(verbose, spList)

  if (flavor == "v2") {
    keep <- sapply(spList, FUN=function(pos) !all(pos == 0L))
  } else if (flavor == "v1") {
    keep <- sapply(spList, FUN=function(pos) all(pos > 0L))
  }
  bpmapList <- bpmapList[keep]
  nbrOfSeqs <- length(bpmapList)
  verbose && cat(verbose, "Number of sequences dropped: ", nbrOfSeqs0-nbrOfSeqs)
  verbose && cat(verbose, "Number of sequences after: ", nbrOfSeqs)
  verbose && cat(verbose, "Number of probes after: ", sum(getNumberOfProbes(bpmapList)))

  groupNames <- getGroupNames(bpmapList)
  verbose && cat(verbose, "Group names: ", hpaste(groupNames))
  t <- table(groupNames)
  verbose && cat(verbose, "Number of unique group names: ", length(t))
  verbose && print(verbose, t)

  verbose && exit(verbose)

  verbose && exit(verbose)




  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Coercing to CDF friendly structure
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Coercing data needed for CDF structure")
  verbose && cat(verbose, "Number of sequences: ", nbrOfSeqs)
  bpmapdfList <- lapply(bpmapList, FUN=bpmapUnit2df)
  # Sanity check
  stopifnot(length(bpmapdfList) == nbrOfSeqs)
  verbose && exit(verbose)
  # Not needed anymore
  bpmapList <- NULL; # Not needed anymore



  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Creating CDF tree structure
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Creating CDF tree structure for ", length(bpmapdfList), " BPMAP sequences")

  verbose && cat(verbose, "Maximum probe distance: ", maxProbeDistance)
  verbose && cat(verbose, "Minimum number of probes: ", minNbrOfProbes)
  if (!is.null(stringRemove)) {
    verbose && cat(verbose, "Regular expression used to infer unitname prefix: ", stringRemove)
  }

  # Allocate
  cdfList <- list()
  unitNames <- character(0L)

  startPositionList <- list()
  unitPrefixes <- character(0L)

  # All CDF unit will have a single group
  unitGroups <- vector("list", length=1L)

  uu <- 1L
  for (ii in seq_along(bpmapdfList)) {
    name <- names(bpmapdfList)[ii]
    verbose && enter(verbose, sprintf("Sequence #%d ('%s') of %d", ii, name, length(bpmapdfList)))

    # Access ones
    bpmapdf <- bpmapdfList[[ii]]

    chrII <- bpmapdf$seqname[1]
    if (!is.null(stringRemove)) {
      chrII <- gsub(stringRemove, "", chrII)
    }
    verbose && cat(verbose, "Unit name prefix: ", chrII)

    nbrOfProbesII <- nrow(bpmapdf)
    verbose && cat(verbose, "Number of probes in sequence: ", nbrOfProbesII)
    nbrOfProbesII0 <- nbrOfProbesII

    sp <- bpmapdf$startpos
    # Sanity check
    stopifnot(all(is.finite(sp)))

    # Splitting when distances between neighboring probes are too large
    d <- diff(sp)
    keep <- (d > maxProbeDistance)
    idxsII <- which(keep)
    nbrOfSplitsII <- length(idxsII)
    if (nbrOfSplitsII > 0L) {
      verbose && printf(verbose, "Splitting into %d subsequence because there were %d too large (>%d) gaps between neighboring probes.\n", nbrOfSplitsII+1L, nbrOfSplitsII, maxProbeDistance)
    }

    # (start,end):s of subsequences
    starts <- c(1L, idxsII+1L)
    ends <- c(idxsII, nbrOfProbesII)
    counts <- (ends-starts)+1L
##    verbose && print(verbose, data.frame(start=starts, end=ends, nbrOfProbes=counts))
    # Sanity check
    stopifnot(all(is.finite(starts)))
    stopifnot(all(is.finite(ends)))
    stopifnot(all(counts > 0L))

    # Dropping probesets with too few probes
    if (flavor == "v2") {
      keep <- (counts >= minNbrOfProbes)
    } else if (flavor == "v1") {
      keep <- (counts >= minNbrOfProbes + 2L)
    }
    rowsII <- which(keep)
    nbrOfUnitsII <- length(rowsII)
    nbrOfDroppedSeqs <- length(starts) - nbrOfUnitsII
    if (nbrOfDroppedSeqs > 0L) {
      verbose && printf(verbose, "Dropped %d subsequences with too few (<%d) probes.\n", nbrOfDroppedSeqs, minNbrOfProbes)
    }

    verbose && cat(verbose, "Number of subsequences (=CDF units) extracted: ", nbrOfUnitsII)
    nbrOfProbesII <- sum(counts[rowsII])
    verbose && printf(verbose, "Number of probes extracted: %d (%.2f%%) of %d\n", nbrOfProbesII, 100*nbrOfProbesII/nbrOfProbesII0, nbrOfProbesII0)

    # Access once
    pmx <- bpmapdf$pmx
    pmy <- bpmapdf$pmy
    mmx <- bpmapdf$mmx
    mmy <- bpmapdf$mmy
    isPmOnly <- all(mmx == 0L)

    # Only for verbose output
    unitNamesII <- character(length=nbrOfUnitsII)

    for (jj in seq_len(nbrOfUnitsII)) {
      rowsJJ <- rowsII[jj]
      startJJ <- starts[rowsJJ]
      endJJ <- ends[rowsJJ]

      idxsJJ <- startJJ:endJJ
      nbrOfProbesJJ <- length(idxsJJ)
      # Sanity check
      stopifnot(nbrOfProbesJJ >= minNbrOfProbes)

      spJJ <- sp[idxsJJ]

      atom <- 0L:(nbrOfProbesJJ-1L)
      indexpos <- 0L:(nbrOfProbesJJ-1L)

      if (isPmOnly) {
        # PM only
        unitGroups[[1L]] <- list(x=pmx[idxsJJ], y=pmy[idxsJJ], pbase=rep("A", times=nbrOfProbesJJ), tbase=rep("T", times=nbrOfProbesJJ), atom=atom, indexpos=indexpos, groupdirection="sense", natoms=nbrOfProbesJJ, ncellsperatom=1L)
      } else {
        # PM+MM
        unitGroups[[1L]] <- list(x=c(pmx[idxsJJ],mmx[idxsJJ]), y=c(pmy[idxsJJ],mmy[idxsJJ]), pbase=rep("A", times=2*nbrOfProbesJJ), tbase=rep(c("T","A"), each=nbrOfProbesJJ), atom=rep(atom, times=2), indexpos=rep(indexpos, times=2), groupdirection="sense", natoms=nbrOfProbesJJ, ncellsperatom=2L)
      }

      groupNames <- sprintf("%sFROM%sTO%s", chrII, spJJ[1L], spJJ[nbrOfProbesJJ])
      names(unitGroups) <- groupNames

      nbrOfAtomsJJ <- sum(sapply(unitGroups, FUN=function(u) u$natoms))
      nbrOfCellsJJ <- sum(sapply(unitGroups, FUN=function(u) u$natoms*u$ncellsperatom))

      if (flavor == "v2") {
        unitIdx <- uu
      } else if (flavor == "v1") {
        unitIdx <- ii
      }

      cdfList[[uu]] <- list(unittype=1L, unitdirection=1L, groups=unitGroups, natoms=nbrOfAtomsJJ, ncells=nbrOfCellsJJ, ncellsperatom=nbrOfCellsJJ/nbrOfAtomsJJ, unitnumber=unitIdx)
      startPositionList[[uu]] <- spJJ

      unitName <- groupNames
      unitNames[uu] <- unitName
      unitPrefixes[uu] <- chrII

      unitNamesII[jj] <- unitName

      # Next CDF unit
      uu <- uu + 1L
    } # for (jj ...)
    verbose && printf(verbose, "CDF units included: [%d] %s\n", length(unitNamesII), hpaste(unitNamesII))

    verbose && exit(verbose)
  } # for (ii ...)
  verbose && exit(verbose)
  nbrOfUnits <- length(cdfList)

  # Sanity check
  stopifnot(nbrOfUnits == length(unitNames))
  stopifnot(nbrOfUnits == length(startPositionList))

  names(cdfList) <- unitNames
  names(startPositionList) <- unitPrefixes


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Writing
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Writing PPS file")
  verbose && cat(verbose, "Output pathname: ", ppsPathname)
  saveObject(startPositionList, file=ppsPathname)
  # Not needed anymore
  startPositionList <- NULL; # Not needed anymore
  verbose && exit(verbose)

  verbose && enter(verbose, "Writing CDF file")
  verbose && cat(verbose, "Output pathname: ", cdfPathname)

  cdfHeader <- list(probesets=nbrOfUnits, qcprobesets=0L, reference="", chiptype=chipType, filename=cdfPathname, nqcunits=0L, nunits=nbrOfUnits, rows=rows, cols=cols, refseq="", nrows=rows, ncols=cols)
  verbose && cat(verbose, "CDF header:")
  verbose && str(verbose, cdfHeader)

  # Write to a temporary file
  pathnameT <- pushTemporaryFile(cdfPathname, verbose=verbose)

  .writeCdf(pathnameT, cdfheader=cdfHeader, cdf=cdfList, cdfqc=NULL, overwrite=TRUE, verbose=verbose)

  # Rename temporary file
  popTemporaryFile(pathnameT, verbose=verbose)

  ## Create checksum file
  dfZ <- getChecksumFile(cdfPathname)

  verbose && exit(verbose)
  verbose && exit(verbose)

  invisible(pathname)
}) # bpmapCluster2Cdf()

Try the aroma.affymetrix package in your browser

Any scripts or data that you put into this service are public.

aroma.affymetrix documentation built on May 29, 2024, 4:32 a.m.