R/cmpH5Utils.R

## Copyright (c) 2010, Pacific Biosciences of California, Inc.

## All rights reserved.
 
## Redistribution and use in source and binary forms, with or without
## modification, are permitted (subject to the limitations in the
## disclaimer below) provided that the following conditions are met:
 
##     * Redistributions of source code must retain the above copyright
##        notice, this list of conditions and the following disclaimer.
 
##     * Redistributions in binary form must reproduce the above
##        copyright notice, this list of conditions and the following
##        disclaimer in the documentation and/or other materials provided
##        with the distribution.
  
##     * Neither the name of Pacific Biosciences nor the names of its
##        contributors may be used to endorse or promote products derived
##        from this software without specific prior written permission.
 
## NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
## GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY PACIFIC
## BIOSCIENCES AND ITS CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
## WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
## MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
## DISCLAIMED. IN NO EVENT SHALL PACIFIC BIOSCIENCES OR ITS CONTRIBUTORS
## BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
## CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
## SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
## BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
## WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
## OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
## IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#############################################################################
##
## Accessors
##
#############################################################################
getReadLength <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  alnIndex(cmpH5)[idx, "rEnd"] - alnIndex(cmpH5)[idx, "rStart"] + 1
}

getZScore <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  if (h5DatasetExists(cmpH5, "AlnInfo/ZScore"))
    getH5Dataset(cmpH5, "AlnInfo/ZScore")[idx]
  else
    stop("cmpH5 does not contain ZScore dataset.")
}

getFullRefNames <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  cmpH5$fullRefName[idx]
}

getRefNames <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  cmpH5$refName[idx]
}

setMethod("getHoleNumbers", "PacBioCmpH5", function(h5Obj, idx = 1:nrow(h5Obj)) {
  h5Obj$holeNumber[idx]
})

getAccuracy <- function(cmpH5, idx = seq.int(1, nrow(cmpH5)), denominator = getReadLength) {
  1 - rowSums(alnIndex(cmpH5)[idx, c("nMisMatches", "nInsertions", "nDeletions")])/
    denominator(cmpH5, idx)
}

getTemplateStrand <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  cmpH5$alignedStrand[idx]
}

getTemplateStart <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  cmpH5$tStart[idx]
}

getTemplateEnd <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  cmpH5$tEnd[idx]
}

getReadStart <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  cmpH5$rStart[idx]
}

getReadEnd <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  cmpH5$rEnd[idx]
}

getMachineName <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  sapply(strsplit(alnIndex(cmpH5)[idx, "movieName"], "_"), "[", 3)
}

getTemplateSpan <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  cmpH5$tEnd[idx] - cmpH5$tStart[idx] + 1
}

getAlignedLength <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  cmpH5$offsetEnd[idx] - cmpH5$offsetStart[idx] + 1
}

getAdvanceTime <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  .Call("PBR_advance_time", getCumulativeAdvanceTime(cmpH5, idx),
        getAlignmentsRaw(cmpH5, idx), PACKAGE = "pbh5")
}

getPolymerizationRate <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  totalTime <- sapply(getCumulativeAdvanceTime(cmpH5, idx = idx), function(a) {
    a[length(a)] - a[1]
  })
  getTemplateSpan(cmpH5, idx)/totalTime
}

getLocalPolymerizationRate <- function(cmpH5, idx = seq.int(1, nrow(cmpH5)),
                                       binFunction = function(aln) {
                                         cut(seq.int(1, nrow(aln)), 10)
                                       }) {
 alns <- getAlignments(cmpH5, idx = idx)
  stimes <- getCumulativeAdvanceTime(cmpH5, idx = idx)
  mapply(function(aln, stime) {
    tapply(seq.int(1, nrow(aln)), binFunction(aln), function(i) {
      sum(aln[i,2] != '-')/(max(stime[i], na.rm = TRUE) - min(stime[i], na.rm = T))
    })
  }, alns, stimes, SIMPLIFY = FALSE)
}

getNumPasses <- function(cmpH5, idx = 1:nrow(cmpH5)) {
  stopifnot(h5DatasetExists(cmpH5, "AlnInfo/NumPasses"))
  getH5Dataset(cmpH5, "AlnInfo/NumPasses")[idx]
}

#############################################################################
##
## PulseH5 + CmpH5 access
##
#############################################################################
doWithPlsAndCmp <- function(cmpH5, plsH5s, fx, ..., SIMPLIFY = TRUE) {
  names(plsH5s) <- sapply(plsH5s, function(plsH5) {
    getMovieName(plsH5)
  })
  stopifnot(all(getMovieName(cmpH5) %in% names(plsH5s)))

  plsFactor <- factor(getMovieName(cmpH5), names(plsH5s))
  idx <- seq.int(1, nrow(cmpH5))
  idxByMovie <- split(idx, plsFactor)
  
  lst <- mapply(function(idxs, plsH5) {
    if (length(idxs) <= 0)
      return(NULL)
    fx(cmpH5, plsH5, idxs)
  }, idxByMovie, plsH5s, SIMPLIFY = FALSE)

 
  ## this is necessary to produce a list in the same order as the
  ## original AlignmentIndex.
  if (SIMPLIFY) {
    lst <- Filter(Negate(is.null), lst)
    if (! is.null(dim(lst[[1]]))) {
      lst <- do.call(rbind, lst)
      rownames(lst) <- do.call(c, idxByMovie)
      lst <- lst[as.character(idx),,drop=FALSE]
    } else {
      lst <- do.call(c, lst)
      names(lst) <- do.call(c, idxByMovie)
      lst <- lst[as.character(idx)]
    }
  }
  return(lst)
}

setMethod("getBpzvarw", "PacBioCmpH5", function(h5Obj, plsH5s) {
  doWithPlsAndCmp(h5Obj, plsH5s, fx = function(cmpH5, plsH5, idxs) {
    bpzvarw <- getBpzvarw(plsH5)
    bpzvarw <- llply(bpzvarw, function(d) d[match(cmpH5$holeNumber[idxs], getHoleNumbers(plsH5)),,drop = FALSE])
    bpzvarw
  }, SIMPLIFY=F)
})

setMethod("getBpzvar", "PacBioCmpH5", function(h5Obj, plsH5s) {
  doWithPlsAndCmp(h5Obj, plsH5s, fx = function(cmpH5, plsH5, idxs) {
    bpzvar <- getBpzvar(plsH5)
    bpzvar[match(cmpH5$holeNumber[idxs], getHoleNumbers(plsH5)),,drop = FALSE]
  })
})

setMethod("getPkzvarw", "PacBioCmpH5", function(h5Obj, plsH5s) {
  doWithPlsAndCmp(h5Obj, plsH5s, fx = function(cmpH5, plsH5, idxs) {
    pkzvarw <- getPkzvarw(plsH5)
    pkzvarw <- llply(pkzvarw, function(d) d[match(cmpH5$holeNumber[idxs], getHoleNumbers(plsH5)),,drop = FALSE])
    pkzvarw
  }, SIMPLIFY=F)
})

setMethod("getPkzvar", "PacBioCmpH5", function(h5Obj, plsH5s) {
  doWithPlsAndCmp(h5Obj, plsH5s, fx = function(cmpH5, plsH5, idxs) {
    pkzvar <- getPkzvar(plsH5)
    pkzvar[match(cmpH5$holeNumber[idxs], getHoleNumbers(plsH5)),,drop = FALSE]
  })
})

setMethod("getSNR", "PacBioCmpH5", function(h5Obj, plsH5s) {
  doWithPlsAndCmp(h5Obj, plsH5s, fx = function(cmpH5, plsH5, idxs) {
    snr <- getSNR(plsH5)
    snr[match(cmpH5$holeNumber[idxs], getHoleNumbers(plsH5)),,drop = FALSE]
  })
})

setMethod("getReadScore", "PacBioCmpH5", function(h5Obj, plsH5s) {
  doWithPlsAndCmp(h5Obj, plsH5s, function(cmp, pls, idx) {
    mtch <- match(cmp$holeNumber[idx], getBaseEvents(pls)[,"holeNumber"])
    getReadScore(pls)[mtch]
  })
})

getMoleculeIndex <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  p <- paste(cmpH5$movieName[idx], cmpH5$holeNumber[idx], sep = ".")
  factor(p, levels = p[!duplicated(p)])
}

doByMolecule <- function(cmpH5, f, g = I, idx = 1:nrow(cmpH5),
                         alongIdx = TRUE) {
  stopifnot(length(idx) >= 1)
  
  midx <- getMoleculeIndex(cmpH5, idx)
  bymol <- split(1:length(idx), midx)
  fres <- f(cmpH5, idx)
  
  if (! alongIdx) {
    res <- lapply(bymol, function(v) {
      g(fres[v])
    })
  } else {
    res <- vector(class(fres[1]), length(idx))
    for (i in 1:length(bymol)) {
      res[bymol[[i]]] <- g(fres[bymol[[i]]])
    }
  }
  return(simplify2array(res))
}

getUnrolledTemplateSpan <- function(cmpH5, idx = 1:nrow(cmpH5), unique = TRUE) {
  doByMolecule(cmpH5, f = getTemplateSpan, g = sum, idx = idx,
               alongIdx = ! unique)
}

getMoleculeReadStart <- function(cmpH5, idx = 1:nrow(cmpH5), ...) {
  doByMolecule(cmpH5, f = getReadStart, g = min, idx = idx, ...)
}

getSubreadIndex <- function(cmpH5, idx = 1:nrow(cmpH5), ...) {
  doByMolecule(cmpH5, f = getReadStart, g = rank, idx = idx, ...)
}

getMoleculeIndexList <- function(cmpH5, idx = seq.int(1, nrow(cmpH5))) {
  split(1:nrow(cmpH5), getMoleculeIndex(cmpH5))
}
  
getByMolecule <- function(cmpH5, idx, fx = getIPD) {
  tapply(idx, getMoleculeIndex(cmpH5, idx), function(z) {
    vals <- fx(cmpH5, z[order(getReadStart(cmpH5, z))])
    if (is.list(vals)) do.call(c, vals) else vals
  })
}

getPkmidPhotons <- function(cmpH5, plsH5s, SIMPLIFY = TRUE) {
  doWithPlsAndCmp(cmpH5, plsH5s, function(cmp, pls, jdx) {
    acq <- getAcqParams(pls)
    fct <- acq$CameraGain/acq$AduGain
    lapply(getPkmid(cmp, idx = jdx), function(z) z/fct)
  }, SIMPLIFY = SIMPLIFY)
}

#############################################################################
##
## kMer Code.
##
#############################################################################
.makeKmerLabelCache <- function(maxSize = 12, alphabet = c("A", "C", "G", "T")) {
  kMerLabelCache <- new.env(hash = T, parent = emptyenv())
  lapply(1:12, function(i) assign(as.character(i), NULL, kMerLabelCache))
  function(k) {
    key <- as.character(k)
    if (is.null(get(key, kMerLabelCache))) {
      M <- as.matrix(expand.grid(lapply(1:k, function(i) alphabet),
                                 stringsAsFactors = FALSE)[,k:1])
      colnames(M) <- sprintf("P%02d", 1:k)
      assign(key, list(M = M, S = apply(M, 1, base:::paste, collapse = "")),
             kMerLabelCache)
    }
    get(key, kMerLabelCache)
  }
}
.kMerLabelCache <- .makeKmerLabelCache()

.kMerTable <- function(lst, k = 2) {
  res <- .Call("PBR_k_mer_tabulator", lst, "ACGT", as.integer(k), PACKAGE = "pbh5")
  res <- matrix(res, nrow = length(lst), byrow = TRUE)
  colnames(res) <- .kMerLabelCache(k)$S
  return(res)
}

.kMerLabeler <- function(lst, k = 2, collapse = FALSE) {
  stopifnot(k > 1 || k > 12)
  res <- .Call("PBR_k_mer_labeler", lst, "ACGT", as.integer(k), PACKAGE = "pbh5")
  nms <- .kMerLabelCache(k)
  if (collapse) {
    lapply(res, function(a) nms$S[a+1])
  } else {
    lapply(res, function(a) nms$M[a+1,])
  }
}

getKmerContext <- function(cmpH5, idx = 1:nrow(cmpH5), up = 2, down = 2) {
  k <- up + down + 1
  alns <- getAlignments(cmpH5, idx)
  msks <- lapply(alns, function(a) a[, 2] != "-")
  alns <- mapply(msks, alns, FUN = function(m, d) d[m, 2], SIMPLIFY = FALSE)
  labels <- .kMerLabeler(alns, k = k, collapse = TRUE)
  mapply(labels, msks, FUN = function(a, b) {
    o <- rep(NA, length(b))
    o[b] <- a
    o
  }, SIMPLIFY = FALSE)
}

associateWithContext <- function(cmpH5, idx = 1:nrow(cmpH5), f = getIPD, up = 2, down = 2,
                                 useReference = TRUE, collapse = FALSE) {    
  k <- up + down + 1
  col <- if (useReference) 2 else 1
  hasDim <- !is.null(dim(f(cmpH5, idx[1])[[1]]))
  
  alns <- getAlignments(cmpH5, idx)
  msks <- lapply(alns, function(a) a[, col] != "-")
  alns <- mapply(msks, alns, FUN = function(m, d) d[m, col], SIMPLIFY = FALSE)
  elts <- mapply(msks, f(cmpH5, idx), FUN = function(m, d) if (hasDim) d[m,] else d[m],
                 SIMPLIFY = FALSE)
  labels <- .kMerLabeler(alns, k = k, collapse = collapse)
  r <- mapply(elts, labels, alns, FUN = function(elt, lab, aa) {
    r1 <- (up+1):((if (hasDim) nrow(elt) else length(elt)) - down)
    r2 <- 1:length(r1)
    list(elt = if (hasDim) elt[r1,] else elt[r1],
         label = if (collapse) lab[r2] else lab[r2,])
  }, SIMPLIFY = FALSE)
    
  data.frame(elt = do.call(if (hasDim) rbind else c, lapply(r, "[[", 1)),
             context = do.call(if (collapse) c else rbind, lapply(r, "[[", 2)),
             stringsAsFactors = FALSE)
}

summarizeByContext <- function(cmpH5, idx = 1:nrow(cmpH5), up = 2, down = 2,
                               statF = getIPD, summaryF = function(a) median(a, na.rm = T),
                               ...) {
  ctx <- associateWithContext(cmpH5, up = up, down = down, idx = idx, f = statF,
                              collapse = TRUE, ...)
  levels <- .kMerLabelCache(up + down + 1)$S
  fct <- factor(ctx[,"context"], levels = levels)
  dta <- data.frame(count = as.numeric(table(fct)),
                    value = tapply(ctx[,"elt"], fct, summaryF), stringsAsFactors = FALSE)
  rownames(dta) <- levels(fct)
  return(dta)
}

makeContextDataTable <- function(cmpH5, idx = 1:nrow(cmpH5), up = 2, down = 2,
                                 fxs = list(ipd = getIPD, pw = getPulseWidth,
                                   tpos = getTemplatePosition), ...) {
  f <- function(cmpH5, idxs) {
    lst <- lapply(fxs, function(f) {
      f(cmpH5, idxs)
    })
    lapply(1:length(idxs), function(i) {
      d <- do.call(base:::cbind, lapply(1:length(fxs), function(j) {
        lst[[j]][[i]]
      }))
      colnames(d) <- names(fxs)
      return(d)
    })
  }
  associateWithContext(cmpH5, idx, f, up = up, down = down, ...)
}

#############################################################################
##
## Positional Access.
##
#############################################################################
getReadsInRange <- function(cmpH5, refSeq, refStart = 1, refEnd = NA, idx = 1:nrow(cmpH5)) {
  refBlock <- .getRefBlock(cmpH5, refSeq)
  ranges <- alnIndex(cmpH5)[seq.int(refBlock[1], refBlock[2]), c("tStart", "tEnd",
                                                                 "nBackRead", "nOverlap")]
  if (is.na(refEnd)) refEnd <- getRefLength(cmpH5, refSeq)
  idxs <- .Call("PBR_indices_within_range", ranges[,1], ranges[,2], ranges[,3], ranges[,4],
                as.integer(refStart), as.integer(refEnd), PACKAGE = "pbh5")
  ## XXX: This shouldn't happen, but it has.
  idxs[length(idxs)] <- ifelse(idxs[length(idxs)] >= refBlock[2], refBlock[2] - 1,
                               idxs[length(idxs)])
  idxs <- idxs + 1 + ifelse(0 == refBlock[1], 0, refBlock[1] - 1)
  return(intersect(idxs, idx))
}

getCoverageInRange <- function (cmpH5, refSeq, refStart = 1, refEnd = NA, idx = NULL) {
  refBlock <- .getRefBlock(cmpH5, refSeq)
  refEnd <- if (is.na(refEnd)) getRefLength(cmpH5, refSeq) else refEnd
  if (is.null(idx)) {
    idx <- seq.int(1, nrow(cmpH5))
  }
  w <- sort(intersect(idx, seq.int(refBlock[1], refBlock[2])))
  ranges <- alnIndex(cmpH5)[w, c("tStart", "tEnd", "nBackRead", "nOverlap")]
  
  if (nrow(ranges) == 0) {
    ## XXX : this occurs because there are no reads for this reference
    ## sequence among the indices that were specified, either all or
    ## user-specified.
    return(integer(refEnd - refStart + 1))
  } else {
    .Call("PBR_coverage_within_range", ranges[, 1], ranges[,2], ranges[, 3], ranges[, 4],
          as.integer(refStart), as.integer(refEnd), PACKAGE = "pbh5")
  }
}

#############################################################################
##
## Template Position.
##
#############################################################################
templatePositionFromAln <- function(alnLst, strand, start, end) {
  .Call("PBR_get_template_position", alnLst, strand, start, end, PACKAGE = "pbh5")
}

.reverse <- function(x, strand) {
  if (strand == 1) {
    if (! is.null(dim(x))) {
      x[nrow(x):1, ]
    } else {
      rev(x)
    }
  } else {
    x
  }
}

.reverseComplement <- function(x, strand) {
  if (strand == 0) {
    x
  } else {
    rev(c("T", "G", "C", "A", "-")[match(x, c("A", "C", "G", "T", "-"))])
  }
}

getTemplatePosition <- function(cmpH5, idx = 1:nrow(cmpH5), withAlignments = FALSE,
                                asDataFrame = FALSE) {
  alns <- getAlignmentsRaw(cmpH5, idx)
  strands <- cmpH5$alignedStrand[idx]
  tpos <- templatePositionFromAln(alns, strands, cmpH5$tStart[idx], cmpH5$tEnd[idx])

  if (! withAlignments) {
    return(tpos)
  }
  mapply(function(a, strand, id, p) {
    ref  <- .reverse(.bMap[a+1, 2], strand)
    read <- .reverse(.bMap[a+1, 1], strand)
    z <- list(position = .reverse(p, strand), read = read, ref = ref,
              idx = rep(id, length(ref)), strand = rep(strand, length(ref)))
    if (asDataFrame) as.data.frame(z, stringsAsFactors = FALSE) else z
  }, alns, strands, idx, tpos, SIMPLIFY = FALSE)
}

getByTemplatePosition <- function(cmpH5, idx = 1:nrow(cmpH5), f = getIPD) {
  if (is.list(f)) {
    return(.getByTemplatePositionList(cmpH5, idx, f))
  }
  tpos <- getTemplatePosition(cmpH5, idx, withAlignments = TRUE)
  dta <- f(cmpH5, idx)
  strands <- getTemplateStrand(cmpH5, idx)
  x <- mapply(strands, dta, FUN = function(s, d) .reverse(d, s), SIMPLIFY = FALSE)
  x <- do.call(if (! is.null(dim(x[[1]]))) rbind else c, x)
  e0 <- tpos[[1]]
  c0 <- names(e0)
  d <- lapply(1:length(c0), function(i) {
    do.call(c, lapply(tpos, function(e) e[[i]]))
  })
  names(d) <- c0
  d$elt <- x
  as.data.frame(d, row.names = NULL, stringsAsFactors = FALSE)
}

.getByTemplatePositionList <- function (cmpH5, idx = 1:nrow(cmpH5),
                                        fxs = list(IPD = getIPD,
                                          PulseWidth = getPulseWidth,
                                          refName = getFullRefNames)) {
  tpos <- getTemplatePosition(cmpH5, idx, withAlignments = TRUE)
  strands <- getTemplateStrand(cmpH5, idx)
  dta <- lapply(fxs, function(f) {
    do.call(c, mapply(strands, tpos, f(cmpH5, idx), FUN = function(s, tp, d) {
      if (length(d) == 1) {
        ## to allow for the possibility of scalar vals per read.
        rep(d, length(tp[[1]]))
      } else {
        if (s == 0) d else d[length(d):1]
      }
    }, SIMPLIFY = FALSE))
  })
  e0 <- tpos[[1]]
  c0 <- names(e0)
  tpos <- lapply(1:length(c0), function(i) {
    do.call(c, lapply(tpos, function(e) e[[i]]))
  })
  names(tpos) <- c0
  as.data.frame.list(c(tpos, dta), row.names = NULL, stringsAsFactors = FALSE)
}

getAlignmentBlock <- function(cmpH5, ref, refStart, refEnd, idx = 1:nrow(cmpH5)) {
  reads <- getReadsInRange(cmpH5, ref, refStart, refEnd, idx = idx)
  tpos <- getTemplatePosition(cmpH5, reads, TRUE, TRUE)
  refSpan <- seq.int(refStart, refEnd)
  do.call(rbind, lapply(tpos, function(a) {
    w <- a$position %in% refSpan
    if (a$strand[w][1] == 1)
      read <- c("T", "G", "C", "A", "-")[match(a$read[w], c("A", "C", "G", "T", "-"))]
    else
      read <- a$read[w]
    tapply(read, factor(a$position[w], refSpan), paste, collapse = "")
  }))
}

#############################################################################
##
## Consensus Code.
##
#############################################################################
getCallsForReads <- function(rawAlns, refStart, refEnd, strands, starts, ends) {
  stopifnot(length(rawAlns) == length(starts)  && length(rawAlns) == length(strands) &&
            length(rawAlns) == length(ends))
  .Call("PBR_compute_consensus", rawAlns, as.integer(refStart), as.integer(refEnd),
        as.integer(strands), as.integer(starts), as.integer(ends), PACKAGE = "pbh5")
}

getConsensusForReads <- function(rawAlns, refStart, refEnd, strands, starts,
                                 ends, fast = TRUE) {
  if (fast) {
    stopifnot(length(rawAlns) == length(starts)  && length(rawAlns) == length(strands) &&
              length(rawAlns) == length(ends))
    .Call("PBR_compute_consensus2", rawAlns, as.integer(refStart), as.integer(refEnd),
          as.integer(strands), as.integer(starts), as.integer(ends), PACKAGE = "pbh5")
  } else {
    computeConsensus(getCallsForReads(rawAlns, refStart, refEnd, strands, starts, ends))
  }
}

getConsensusForIdxs <- function(cmpH5, idx = 1:nrow(cmpH5)) {
  refNames <- getFullRefNames(cmpH5, idx)
  stopifnot(length(unique(refNames)) == 1)
  starts <- getTemplateStart(cmpH5, idx)
  ends <- getTemplateEnd(cmpH5, idx)
  strands <- getTemplateStrand(cmpH5, idx)
  getConsensusForReads(getAlignmentsRaw(cmpH5, idx), min(starts), max(ends), strands,
                       starts, ends)
}

getConsensusInRange <- function(cmpH5, refSeq, refStart = 1,
                                refEnd = getRefLength(cmpH5, refSeq),
                                idx = 1:nrow(cmpH5),
                                blockSize = NA) {
  if (is.na(blockSize)) {
    idxs <- getReadsInRange(cmpH5, refSeq, refStart, refEnd, idx)
    getConsensusForReads(getAlignmentsRaw(cmpH5, idxs), refStart, refEnd,
                         getTemplateStrand(cmpH5, idxs), getTemplateStart(cmpH5, idxs),
                         getTemplateEnd(cmpH5, idxs))
  } else {
    intervals <- unique(c(seq.int(from=refStart, to=refEnd, by = blockSize),
                          refEnd+1))
    do.call(c, lapply(2:length(intervals), function(j) {
      idxs <- getReadsInRange(cmpH5, refSeq, refStart = intervals[j-1],
                              refEnd = intervals[j]-1, idx)
      getConsensusForReads(getAlignmentsRaw(cmpH5, idxs), refStart=intervals[j-1],
                           refEnd = intervals[j]-1,
                           getTemplateStrand(cmpH5, idxs),
                           getTemplateStart(cmpH5, idxs),
                           getTemplateEnd(cmpH5, idxs))
    }))
  }
}

getCallsInRange <- function(cmpH5, refSeq, refStart = 1, refEnd = getRefLength(cmpH5, refSeq),
                            idx = 1:nrow(cmpH5)) {
  idxs <- getReadsInRange(cmpH5, refSeq, refStart, refEnd, idx)
  getCallsForReads(getAlignmentsRaw(cmpH5, idxs), refStart, refEnd,
                   getTemplateStrand(cmpH5, idxs), getTemplateStart(cmpH5, idxs),
                   getTemplateEnd(cmpH5, idxs))
}

computeConsensus <- function(calls) {
  f <- base:::table # avoid dispatch and lookup.
  sapply(calls, function(a) {
    if (length(a) <= 0) return(NA)
    t <- f(a)
    m <- max(t) == t
    if (sum(m) == 1) {
      names(t)[m]
    } else { 
      sample(names(t)[m], size = 1)
    }
  })
}

.computeConsensusRle <- function(calls, reference) {
  runs <- rle(reference)
  rlabels <- rep(1:length(runs$length), runs$length)
  res <- tapply(calls, rlabels, function(a) {
    nbases <- round(sum(sapply(a, function(b) {
      if (is.null(b))
        0
      else
        mean(ifelse(b == '-', 0, ifelse(nchar(b) > 2, 2, 1)))
    })))
    
    calls <- do.call(c, a)
    calls <- calls[calls != '-']
    wbase <- names(sort(table(calls), decreasing = T))[1]
    bases <- rep(wbase, nbases)
    
    if (nbases < length(a)) {
      c(rep('-', length(a) - nbases), bases)
    } else if (nbases > length(a)) {
      if (length(a) == 1)
        paste(bases, collapse = '')
      else 
        c(bases[1:(length(a)-1)], paste(bases[length(a):length(bases)],
                                        collapse = ""))
    } else {
      bases
    }
  })
  as.character(do.call(c, res))
}

#############################################################################
##
## Access and Manipulation
##
#############################################################################
getAlignmentsWithFeatures <- function(cmpH5, idx = 1:nrow(cmpH5), fxs = getIPD,
                                      collapse = FALSE) {
  if (is.list(fxs)) {
    .getAlignmentsWithFeaturesList(cmpH5, idx, fxs, collapse)
  } else {
    alns <- getAlignments(cmpH5, idx)
    dta <- do.call(c, fxs(cmpH5, idx))
    idx <- rep(idx, sapply(alns, nrow))
    data.frame(do.call(rbind, alns), idx, dta, row.names = NULL, stringsAsFactors = FALSE)
  }
}

.getAlignmentsWithFeaturesAsDataFrame <- function(cmpH5, idx, features) {
   alns <- getAlignments(cmpH5, idx)
   ds <- lapply(features, function(fx) {
     do.call(c, fx(cmpH5, idx)) 
   })
   idx <- rep(idx, sapply(alns, nrow))
   alns <- do.call(rbind, alns)
   data.frame(alns, idx, ds, row.names = NULL, stringsAsFactors = FALSE)
}

.getAlignmentsWithFeaturesList <- function(cmpH5, idx = seq.int(1, nrow(cmpH5)),
                                           fxs = list('ipd' = getIPD), collapse = FALSE) {
  if (collapse) {
    dta <- .getAlignmentsWithFeaturesAsDataFrame(cmpH5, idx, fxs)
  } else {
    alns <- getAlignments(cmpH5, idx)
    ds <- lapply(fxs, function(fx) {
      fx(cmpH5, idx)
    })
    ds <- lapply(1:length(idx), function(i) {
      do.call(cbind, lapply(ds, function(zds) zds[[i]]))
    })
    names(ds) <- names(alns)
    
    dta <- mapply(function(d, a) {
      data.frame(d, a, stringsAsFactors = FALSE)
    }, ds, alns, SIMPLIFY = FALSE)
  }
  return(dta)
}



#############################################################################
##
## Barcode
##
#############################################################################

## XXX: Both Barcode and Edna (and for that matter anything under
## AlnInfo might need to be 'restricted' -- the right way to do that
## is to use the .getRestrictedDataset.
.getRestrictedDataset <- function(cmpH5, name, idx = 1:nrow(cmpH5)) {
  nm <- paste("AlnInfo", name, sep = "/")
  stopifnot(h5DatasetExists(cmpH5, nm))
  d <- getH5Dataset(cmpH5, nm)
  if (length(cmpH5@mask) > 0) {
    v <- d[which(cmpH5@mask),]
  } else {
    v <- d
  }
  list(data = v[idx,], h5 = d) # return both because its useful.
}

getBarcodeMatrix <- function(cmpH5, idx = 1:nrow(cmpH5)) {
  bcDS <- .getRestrictedDataset(cmpH5, "Barcode")

  id <- getH5Dataset(cmpH5, "BarcodeInfo/ID")[]
  nm <- getH5Dataset(cmpH5, "BarcodeInfo/Name")[]
  cn <- getH5Attribute(bcDS$h5, "ColumnNames")[]
  ds <- as.data.frame(bcDS$data)
  colnames(ds) <- cn

  if (ncol(ds) == 3) {
    ## old
    ds$name <- nm[match(ds$index, id)]
  } else {
    ## new
    ds$name1 <- nm[match(ds$index1, id)]
    ds$name2 <- nm[match(ds$index2, id)]
  }
  return(ds[idx,])
}

getBarcodeLabels <- function(cmpH5, idx = 1:nrow(cmpH5)) {
  if (h5DatasetExists(cmpH5, "BarcodeInfo/ID")) {
    mat <- getBarcodeMatrix(cmpH5, idx)
    if ("name1" %in% colnames(mat))
      mat[,"name1"]
    else
      mat[,"name"]
  } else {
    stop("No barcode information in file.")
  }
}

getBarcodeScores <- function(cmpH5, idx = 1:nrow(cmpH5)) {
  if (h5DatasetExists(cmpH5, "BarcodeInfo/ID")) {
    mat <- getBarcodeMatrix(cmpH5, idx)
    if ("name1" %in% colnames(mat))
      mat[,"score1"]
    else
      mat[,"score"]
  } else {
    stop("No barcode information in file.") 
  }
}

#############################################################################
##
## EDNA
##
#############################################################################
getEdna <- function(cmpH5, idx = 1:nrow(cmpH5)) {
  ednaDS <- .getRestrictedDataset(cmpH5, "Edna", idx)
  data <- ednaDS$data
  colnames(data) <- getH5Attribute(ednaDS$h5, "ColumnNames")[]
  data
}
PacificBiosciences/R-pbh5 documentation built on May 7, 2019, 11:54 p.m.