R/conseq.R

# Consensus sequences -----------------------------------------------------


#' Construct a consensus sequence
#'
#' @param x \code{pileup} object.
#' @param name Name for consensus sequence.
#' @param type One of "prob", "ambig" or "simple".
#' @param threshold If \code{type = "ambig"}, threshold to call an ambiguous
#' consensus call.
#' @param suppressAllGaps If \code{type = "prob"} or \code{type = "ambig"},
#' suppress all gaps from consensus calling irrespective of the frequency of the gap.
#' @param suppressInsGaps If \code{type = "prob"}, suppress gaps at insertion position
#' from consensus calling based on \code{columnOccupancy}.
#' @param columnOccupancy Minimum occupancy (1 - fraction of gap) below which
#' bases at insertion position are excluded from from consensus calling.
#' @param ... Additional arguments.
#' @return A \code{\linkS4class{BStringSet}} object with a metadata list
#' containing the slots:
#' \describe{
#'   \item{zscore}{}
#'   \item{freq}{}
#'   \item{ambigs}{}
#'   \item{insertions}{}
#'   \item{deletions}{}
#'   \item{consmat}{}
#' }
#' @export
#' @examples
#' ###
conseq <- function(x,
                   name            = "conseq",
                   type            = c("prob", "ambig", "simple"),
                   threshold       = NULL,
                   suppressAllGaps = FALSE,
                   suppressInsGaps = TRUE,
                   columnOccupancy = 0.4,
                   ...)
  UseMethod("conseq")

#' @export
conseq.pileup <- function(x,
                          name            = "conseq",
                          type            = c("prob", "ambig", "simple"),
                          threshold       = NULL,
                          suppressAllGaps = FALSE,
                          suppressInsGaps = TRUE,
                          columnOccupancy = 0.4,
                          ...) {
  if (is.null(threshold)) {
    threshold <- x$threshold
  }
  x <- consmat(x, freq = FALSE)
  conseq(x, name = name, type = type, threshold = threshold,
         suppressAllGaps = suppressAllGaps, suppressInsGaps = suppressInsGaps,
         columnOccupancy = columnOccupancy, ...)
}

#' @export
conseq.matrix <- function(x,
                          name            = NULL,
                          type            = c("prob", "ambig", "simple"),
                          threshold       = NULL,
                          suppressAllGaps = FALSE,
                          suppressInsGaps = TRUE,
                          columnOccupancy = 0.4,
                          ...) {
  type <- match.arg(type, c("prob", "ambig", "simple"))
  if (type == "ambig" && is.null(threshold)) {
    stop("Must set threshold for ambiguity consensus calling!")
  }
  x <- consmat(x, freq = FALSE)
  conseq <- switch(type,
    prob   = .makeProbConsensus_(x,
                                 suppressAllGaps = suppressAllGaps,
                                 suppressInsGaps = suppressInsGaps,
                                 columnOccupancy = columnOccupancy,
                                 ...),
    ambig  = .makeAmbigConsensus_(x, threshold, suppressAllGaps, ...),
    simple = .makeSimpleConsensus_(x)
  )
  names(conseq) <- name
  conseq
}

## strict consensus based on z-scores
## if <suppressAllGaps> all gap counts are set to zero.
## if <suppressInsGaps> only gap counts at insertion position are affected
## and if the maximum base frequency >= <columnOccupancy> at an insertion position
## the gap count is set to zero.
.makeProbConsensus_ <- function(x,
                                suppressAllGaps = FALSE,
                                suppressInsGaps = TRUE,
                                columnOccupancy = 0.4,
                                asRle = FALSE,
                                ...) {
  xOri <- x
  ## always suppress insertions for consensus call!
  x[, "+"] <- 0
  ## remove rows which have been set to zero
  if (length(i <- which(n(x) == 0L)) > 0) {
    x <- x[-i, ]
  }
  ## suppress all deletions
  if (suppressAllGaps) {
    x[, "-"] <- 0
  }
  ## or suppress deletions specifically at insertion positions
  else if (suppressInsGaps && length(ins_ <- as.character(ins(x))) > 0) {
    x <- .suppressGaps_(x, ins = ins_, columnOccupancy = columnOccupancy)
  }
  ## never allow gaps at the beginning and end of a sequence
  if (!suppressAllGaps) {
    maxbases <- colnames(x)[apply(x, 1, which.max)]
    maxbase  <- which(maxbases != "-")
    maxgap   <- which(maxbases == "-")
    if (length(maxgap) > 0) {
      if (min(maxgap) < min(maxbase)) {
        excludeFromStart <- min(
          which(maxbases == "-")):(min(which(maxbases != "-")) - 1)
        x[excludeFromStart, "-"] <- 0
      }
      if (max(maxgap) > max(maxbase)) {
        excludeFromEnd <- (max(
          which(maxbases != "-")) + 1):max(which(maxbases == "-"))
        x[excludeFromEnd, "-"] <- 0
      }
    }
  }

  rowsd <- mean(.rowSums(x, NROW(x), NCOL(x)))/2 ## WHY?
  z <- sweep(sweep(x, 1, .rowMeans(x, NROW(x), NCOL(x)), `-`), 1, rowsd, `/`)
  bases <- colnames(x)[apply(z, 1, which.max)]

  if (asRle) {
    return(rle(bases))
  }

  dels <- bases == "-"
  seq  <- Biostrings::BStringSet(paste0(bases[!dels], collapse = ""))
  # fix zscore; rm del positions
  z <- z[!dels, ]

  S4Vectors::metadata(seq) <- list(
    zscore     = unname(apply(z, 1, max)),
    freq       = NULL,
    ambigs     = NULL,
    insertions = ins(xOri),
    deletions  = unname(which(dels)),
    consmat    = xOri
  )

  return(seq)
}

## consensus with ambiguities
## <suppressAllGaps> affects behaviour at polymorphic positions:
## if <!suppressAllGaps> a gap ambiguity may be called (small letter)
## if <suppressAllGaps> the alternate base will be called irrespective
## of the frequency of the gap
.makeAmbigConsensus_ <- function(x,
                                 threshold,
                                 suppressAllGaps = FALSE,
                                 asString = FALSE,
                                 ...) {
  ## always suppress insertions for consensus call!
  x[, "+"] <- 0
  ## remove rows which have been set to zero
  if (length(i <- which(n(x) == 0L)) > 0) {
    x <- x[-i, ]
  }
  ## suppress all deletions
  if (suppressAllGaps) {
    x[, "-"] <- 0
  }
  ## Filter all bases with a frequency > threshold
  cmf <- consmat(x, freq = TRUE)
  baselist <- apply(cmf, 1, function(m) {
    rs <- m[i <- m > threshold]
    names(rs) <- names(m)[i]
    list(rs)
  })
  s <- lapply(baselist, function(b) {
    b <- unlist(b)
    if (length(b) == 0) {
      ## this arises if all bases are below threshold
      ## we've seen this with promethION data:
      # Consensus Matrix: 3 x 6
      # nucleotide
      # pos            G          A T         C         -          +
      #   3055 0.1166667 0.28333333 0 0.2500000 0.1888889 0.16111111
      list(base = "N", freq = b)
    }
    else if (length(b) == 1) {
      list(base = names(b), freq = unname(b))
    }
    else if (length(b) > 1) {
      ## sort by name
      NUC <- paste0(names(b[order(names(b))]), collapse = "")
      list(
        base = names(CODE_MAP())[charmatch(NUC, CODE_MAP())] %|na|% "N",
        freq = b
      )
    }
  })
  bases <- vapply(s, `[[`, "base", FUN.VALUE = "")
  if (asString) {
    return(paste0(bases, collapse = ""))
  }
  ambigs <- x[which(!bases %in% DNA_BASES()), ]
  attr(ambigs, "ambiguities") <- unname(bases[which(!bases %in% DNA_BASES())])
  dels <- bases == "-"
  seq  <- Biostrings::BStringSet(paste0(bases[!dels], collapse = ""))

  S4Vectors::metadata(seq) <- list(
    zscore     = NULL,
    freq       = unname(vapply(s, function(x) sum(x[["freq"]]), 0)),
    ambigs     = ambigs,
    insertions = ins(x),
    deletions  = unname(which(dels)),
    consmat    = x
  )

  return(seq)
}

.makeSimpleConsensus_ <- function(x) {
  cmf <- consmat(x, freq = TRUE)
  if (any(na_ <- is.na(rowSums(cmf)))) {
    cmf[na_, ] <- 0
    cmf <- cbind(cmf, N = ifelse(na_, 1, 0))
  }
  paste0(colnames(cmf)[apply(cmf, 1, which.max)], collapse = "")
}

## Helper functions
.suppressGaps_ <- function(x, ins, columnOccupancy = 0.4) {
  x0 <- x[dimnames(x)$pos %in% ins, ]
  ## if frequency of the most frequent base is greater or equal to the
  ## columnOccupancy (1 - gap frequency) set the gap count to zero
  ## (i.e. suppress the gap)
  i <- which(apply(x0[, c("A", "C", "G", "T")], 1, max) /
               x0[, "-"] >= columnOccupancy)
  if (length(j <- dimnames(x0)$pos[i]) > 0) {
    x[j, "-"] <- 0
  }
  x
}

.writeConseq <- function(x, name, type, threshold = NULL, suppressAllGaps = TRUE,
                         replaceIndel = "", conspath, ...) {
  conseq0 <- conseq(consmat(x, freq = FALSE), name = name, type = type,
                   threshold = threshold, suppressAllGaps = suppressAllGaps, ...)
  conseq1 <- Biostrings::DNAStringSet(stripIndel(conseq0, replace = replaceIndel))
  Biostrings::writeXStringSet(conseq1, conspath, append = FALSE, compress = FALSE)
  return(invisible(conseq1))
}
gschofl/DR2S documentation built on May 17, 2019, 8:40 a.m.