R/qaVariants.R

Defines functions MedianDistFromNearestEndFilter MaskFilter IndelsNotSupportedFilter NeighborCountFilter DistanceToNearestFilter ReadPositionTTestFilter t.test_welch InternalReadPosBinFilter StrandFETFilter ReadPosCountFilter NonRefFilter NonNRefFilter VariantQAFilters qaVariants

Documented in qaVariants VariantQAFilters

### =========================================================================
### Variant QA (as opposed to the calling filters)
### -------------------------------------------------------------------------

qaVariants <- function(x, qa.filters = VariantQAFilters(...), ...)
{
  softFilter(x, qa.filters)
}

VariantQAFilters <- function(fisher.strand.p.value = 1e-4, min.mdfne = 10L)
{
  FilterRules(c(mdfne = MedianDistFromNearestEndFilter(min.mdfne),
                fisherStrand = StrandFETFilter(fisher.strand.p.value)))
}

## With new gmapR, this is only necessary for filtering the ref N's.
## In theory, we could keep positions with ref N and at least two alts.
## But this is OK for now.
NonNRefFilter <- function() {
  function(x) {
    as.character(ref(x)) != 'N'
  }
}

## Drops the ref rows (should not be necessary)
NonRefFilter <- function() {
  function(x) {
    !is.na(alt(x))
  }
}

ReadPosCountFilter <- function(read.pos.count = 1L) {
  function(x) {
    (if (!is.null(x$ncycles)) x$ncycles else x$n.read.pos) >= read.pos.count
  }
}

StrandFETFilter <- function(p.value = 1e-4) {
  function(x) {
    p <- with(mcols(x),
              fisher_p_vectorized(count.plus.ref,
                                  count.minus.ref,
                                  (count.plus.ref + count.plus),
                                  (count.minus.ref + count.minus)))
    p > p.value
  }
}

InternalReadPosBinFilter <- function(min.count = 1L) {
  function(x) {
    read_pos_columns <- grep("readPosCount", colnames(mcols(x)), value = TRUE)
    alt_columns <- grep("ref", read_pos_columns, invert = TRUE, value = TRUE)
    internal_columns <- tail(head(alt_columns, -1), -1)
    if (length(internal_columns) > 0L)
      rowSums(as.matrix(mcols(x)[internal_columns])) >= min.count
    else rep.int(TRUE, length(x))
  }
}

t.test_welch <- function(m1, m2, s1, s2, n1, n2) {
  s <- s1 / n1 + s2 / n2
  t <- (m1 - m2) / sqrt(s)
  v <- s^2 / ((s1 / n1)^2 / (n1 - 1L) + (s2 / n2)^2 / (n2 - 1L))
  pt(-abs(t), v) * 2L
}

ReadPositionTTestFilter <- function(p.value.cutoff = 1e-4) {
  function(x) {
    p <- with(mcols(x), t.test_welch(read.pos.mean, read.pos.mean.ref,
                                     read.pos.var, read.pos.var.ref,
                                     rawDepth(x), totalDepth(x)))
    ans <- p > p.value.cutoff
    ans[is.na(ans)] <- TRUE
    ans
  }
}

DistanceToNearestFilter <- function(min.dist = 10L) {
  function(x) {
    mcols(distanceToNearest(x))$distance > min.dist
  }
}

NeighborCountFilter <- function(max.count = 2L, window.size = 75L) {
  function(x) {
    countOverlaps(resize(x, window.size, fix = "center"), x) <= max.count
  }
}

IndelsNotSupportedFilter <- function() {
  function(x) {
    nzchar(ref(x)) & nzchar(alt(x))
  }
}

MaskFilter <- function(mask) {
  function(x) {
    !overlapsAny(x, mask, ignore.strand = TRUE)
  }
}

MedianDistFromNearestEndFilter <- function(min.mdfne) {
  function(x) {
    !is.na(x$mdfne) & x$mdfne >= min.mdfne
  }
}
lawremi/VariantTools documentation built on March 4, 2024, 11:54 a.m.