R/shearwaterML.R

#' ShearwaterML
#' 
#' Maximum likelihood version of Shearwater producing p-values instead of Bayes factors.
#'
#' @param counts The array of counts typically generated by loadAllData.
#' @param rho Use this variable to fix the dispersion parameter to a value of interest. Default: NULL, rho will be estimated from the data.
#' @param truncate Samples with variant allele frequencies higher than "truncate" will be excluded from the background error model.
#' @param rho.min If rho=NULL, rho will be estimated from the data in the interval [rho.min,rho.max].
#' @param rho.max If rho=NULL, rho will be estimated from the data in the interval [rho.min,rho.max].
#' @param maxvaf Sites with an average rate of mimatches higher than maxvaf will not be considered (e.g. SNPs or reference sites).
#' @param mindepth Minimum coverage required to test a site.
#' @param maxtruncate Maximum number of samples that can be excluded from the background error model by truncate for a site to be tested.
#' @return A list with two arrays for P- and Q-values.
#' @examples
#' # code to be added
#' @export
#' @author Inigo Martincorena and Moritz Gerstung
#' @references Martincorena I, Roshan A, Gerstung M, et al. (2015). High burden and pervasive positive selection of somatic mutations in normal human skin. _Science_ (Under consideration).

betabinLRT = function (counts, rho = NULL, truncate = 0.05, rho.min = 1e-04, rho.max = 0.8, maxvaf = 0.3, mindepth = 10, maxtruncate = 0.5) {
  
  # deepSNV functions
  estimateRho = deepSNV:::estimateRho
  logbb = deepSNV:::logbb  
  
  pseudo = .Machine$double.eps
  ncol = dim(counts)[3]/2
  x.fw = counts[, , 1:ncol]
  x.bw = counts[, , 1:ncol + ncol]
  n.fw = rep(rowSums(x.fw, dims = 2), dim(x.fw)[3])
  n.bw = rep(rowSums(x.bw, dims = 2), dim(x.bw)[3])
  dim(n.fw) = dim(x.fw)
  dim(n.bw) = dim(x.bw)
  
  x = x.fw + x.bw
  n = n.fw + n.bw
  mu = pmax(x,pseudo)/pmax(n,pseudo)
  
  ix = (mu < truncate)
  
  # Calculation of rho
  if (is.null(rho)) {
    rho = estimateRho(x, mu, ix)
    rho = pmin(pmax(rho, rho.min), rho.max)
    rho[is.na(rho)] = rho.min
  }
  disp = (1 - rho)/rho
  rdisp = rep(disp, each = nrow(counts))
  mu = mu * rdisp
  
  tr.fw = x.fw * ix
  tr.bw = x.bw * ix
  
  X.fw = rep(colSums(tr.fw, dims = 1), each = nrow(counts)) - tr.fw
  N.fw = rep(colSums(n.fw * ix), each = nrow(counts)) - n.fw * ix
  
  X.bw = rep(colSums(tr.bw, dims = 1), each = nrow(counts)) - tr.bw
  N.bw = rep(colSums(n.bw * ix), each = nrow(counts)) - n.bw * ix
  
  prob0.fw = (X.fw + x.fw)/(N.fw + n.fw); prob0.fw[prob0.fw==0] = pseudo
  prob1s.fw = x.fw/(n.fw+pseudo); prob1s.fw[prob1s.fw==0] = pseudo
  prob1c.fw = X.fw/(N.fw+pseudo); prob1c.fw[prob1c.fw==0] = pseudo
  prob1s.fw = pmax(prob1s.fw,prob1c.fw) # Min error rate is that of the population (one-sided test)
  nu0.fw = prob0.fw * rdisp; nu1s.fw = prob1s.fw * rdisp; nu1c.fw = prob1c.fw * rdisp; 
  
  prob0.bw = (X.bw + x.bw)/(N.bw + n.bw); prob0.bw[prob0.bw==0] = pseudo
  prob1s.bw = x.bw/(n.bw+pseudo); prob1s.bw[prob1s.bw==0] = pseudo
  prob1c.bw = X.bw/(N.bw+pseudo); prob1c.bw[prob1c.bw==0] = pseudo
  prob1s.bw = pmax(prob1s.bw,prob1c.bw) # Min error rate is that of the population (one-sided test)
  nu0.bw = prob0.bw * rdisp; nu1s.bw = prob1s.bw * rdisp; nu1c.bw = prob1c.bw * rdisp; 
    
  
  # Beta-Binomial LRT
  prob1s.both = (x.fw+x.bw)/(n.fw+n.bw+pseudo); prob1s.both[prob1s.both==0] = pseudo
  prob1s.both = pmax(prob1s.both,pmax(prob1c.fw,prob1c.bw)) # Min error rate is that of the population (one-sided test)
  nu1s.both = prob1s.both * rdisp
  LL.both = logbb(x.fw, n.fw, nu0.fw, rdisp) + logbb(X.fw, N.fw, nu0.fw, rdisp) + logbb(x.bw, n.bw, nu0.bw, rdisp) + logbb(X.bw, N.bw, nu0.bw, rdisp) - logbb(x.fw, n.fw, nu1s.both, rdisp) - logbb(X.fw, N.fw, nu1c.fw, rdisp) - logbb(x.bw, n.bw, nu1s.both, rdisp) - logbb(X.bw, N.bw, nu1c.bw, rdisp) 
  pvals_both = pchisq(-2*LL.both, df=2, lower.tail=F)/2 # We divide by 2 as we are performing a 1-sided test

  # Masking out undesired tests (excluding sites with too low coverage can boost power by reducing multiple testing)
  pvals_both[(n.fw+n.bw)<mindepth] = NA
  pvals_both[(prob0.bw>maxvaf) & (prob0.fw>maxvaf)] = NA
  truncfilt = rep(apply(ix,c(2,3),mean)<maxtruncate, each=dim(ix)[1])
  dim(truncfilt) = dim(ix)
  pvals_both[truncfilt] = NA
  
  # False discovery rate (it should be applied to the entire collection of sites tested)
  qvals = p.adjust(pvals_both, method="BH")
  dim(qvals) = dim(pvals_both)
  
  return(list(pvals=pvals_both, qvals=qvals))
}



#' Function to create a \code{\link{VCF}} object with variant calls from an array of q-values.
#' 
#' This function thresholds the q-values computed by the shearwater algorithm and creates a \code{\link{VCF}} object as output.
#' @param qvals array of q-values from \code{\link{betabinLRT}}.
#' @param counts array of counts from \code{\link{loadAllData}}.
#' @param regions \code{\link{GRanges}} with the regions corresponding to counts and qvals.
#' @param samples vector of samples names.
#' @param cutoff Cutoff for the q-values below which a variant is considered to be true (default = 0.05)
#' @param mvcf boolean flag, if TRUE compute a large VCF with as many genotype columns as samples. Default TRUE. Otherwise use duplicate rows and only one genotype column. The sample is then provided by the info:PD field. Can be inefficient for large sample sizes.
#' @param err Optional matrix of error rates, otherwise recomputed from counts.
#' @param mu Optional matrix of relative frequencies, otherwise recomputed from counts.
#' @return A \code{\link{VCF}} object
#' 
#' @author mg14
#' @note Experimental code, subject to changes
#' @export
## TODO: check for cases with zero result
qvals2Vcf <- function(qvals, counts, regions, samples = 1:nrow(counts), err = NULL, mu = NULL, cutoff = 0.05, mvcf=TRUE){
  coordinates <- regions2Coordinates(regions)
  w = which(qvals < cutoff, arr.ind=TRUE)
  w = w[order(w[,2], w[,3], w[,1]),,drop=FALSE]
  
  ## If no variants found, select first possible and set to NULL later (avoids a lot of errors in the following)
  if(dim(w)[1]==0) {
    w = matrix(1,ncol = 3)
    isNull = TRUE
  }else{
    isNull = FALSE
  }
  
  totCounts = counts[,,1:5] + counts[,,6:10]
  
  if(is.null(err)){
    err = colSums(totCounts)
    err = err / (rowSums(err) + .Machine$double.eps)
  }
  if(is.null(mu))
    mu = (totCounts)/rep(rowSums(counts, dims=2)+.Machine$double.eps, 5)
  
  
  #coordinates = regions2Coordinates(regions.GR)
  ref = factor(apply(err,1,which.max), levels = 1:5, labels=c("A","T","C","G","-"))
  #	gr = GRanges(paste(w[,1], w[,3]==5, sep="."), IRanges(w[,2], width=1), alt=w[,3], AF = select(w,subcl))
  #	o = order(gr)
  #	rd = reduce(gr)
  #	m = match(gr[o], rd)
  #	alt = lapply(split(c("A","T","C","G","")[w[o,3]], m), paste, collapse="")
  if(!mvcf){
    v = VCF(
      rowRanges=GRanges(coordinates$chr[w[,2]], 
                      IRanges(coordinates$pos[w[,2]] - (w[,3]==5), width=1 + (w[,3]==5)), ## If del make one longer..
      ),
      fixed = DataFrame(
        REF = DNAStringSet(paste(ifelse(w[,3]==5,as.character(ref[w[,2]-1]),""), ref[w[,2]], sep="")),
        ALT = do.call(DNAStringSetList,as.list(paste(ifelse(w[,3]==5,as.character(ref[w[,2]-1]),""), c("A","T","C","G","")[w[,3]], sep=""))),
        #ALT = DNAStringSet(paste(ifelse(w[,3]==5,as.character(ref[w[,2]-1]),""), c("A","T","C","G","")[w[u,3]], sep="")),
        QUAL = round(-10*log10(select(w,qvals))),
        FILTER = "PASS"
      ),
      info = DataFrame(
        #paramRangeID=NA,
        PD = samples[w[,1]], 
        AF = select(w, mu),
        ER = select(w[,-1], err),
        FW = select(w, counts[,,1:5]),
        BW = select(w, counts[,,6:10]),	
        DP = select(w[,-3], rowSums(totCounts, dims=2)),
        QV = select(w, qvals),
        LEN = 1),
      metadata = list(header = scanVcfHeader(system.file("extdata", "shearwater.vcf", package="deepSNV"))),
      collapsed = FALSE
    )}else{
      u = !duplicated(w[,-1, drop=FALSE])
      wu = w[u,,drop=FALSE]
      pp = mapply(function(i,j){ qvals[,i,j]}, wu[,2],wu[,3])
      
      geno = SimpleList(
        GT = t(pp < cutoff)+0,
        GQ = t(round(-10*log10(pp))),
        QV = signif(t(mapply(function(i,j){ qvals[,i,j]}, wu[,2],wu[,3])),3),
        VF = signif(t(mapply(function(i,j){ mu[,i,j]}, wu[,2],wu[,3])),3),
        FW = t(mapply(function(i,j){ counts[,i,j]}, wu[,2],wu[,3])),
        BW = t(mapply(function(i,j){ counts[,i,j+5]}, wu[,2],wu[,3])),
        #				DP = t(mapply(function(i,j){ rowSums(totCounts[,i,])}, wu[,2],wu[,3]))
        FD = t(mapply(function(i,j){ rowSums(counts[,i,1:5])}, wu[,2],wu[,3])),
        BD = t(mapply(function(i,j){ rowSums(counts[,i,6:10])}, wu[,2],wu[,3]))
      )
      
      #rownames(w) = samples[w[,1]]
      v = VCF(
        rowRanges=GRanges(coordinates$chr[wu[,2]], 
                        IRanges(coordinates$pos[wu[,2]] - (wu[,3]==5), width=1 + (wu[,3]==5)), 
        ),
        fixed = DataFrame(
          REF = DNAStringSet(paste(ifelse(wu[,3]==5,as.character(ref[wu[,2]-1]),""), ref[wu[,2]], sep="")), ##TODO: Warning if w[u,3][1]==5 & w[u,2]==1...
          ALT = do.call(DNAStringSetList,as.list(paste(ifelse(wu[,3]==5,as.character(ref[wu[,2]-1]),""), c("A","T","C","G","")[wu[,3]], sep=""))),
          #ALT = DNAStringSet(paste(ifelse(w[u,3]==5,as.character(ref[w[u,2]-1]),""), c("A","T","C","G","")[w[u,3]], sep="")),
          QUAL = 1,
          FILTER = "PASS"
        ),
        info = DataFrame(
          #paramRangeID=NA,
          ER = select(wu[,-1], err),
          AF = rowMeans(geno$GT),
          LEN = 1),
        geno = geno,
        metadata = list(header = scanVcfHeader(system.file("extdata", "shearwaterML.vcf", package="deepSNV"))),
        colData = DataFrame(samples=1:length(samples), row.names=samples),
        collapsed = TRUE
      )
      colnames(v) = samples
    }
  metadata(v)$header@samples <- as.character(samples)
  metadata(v)$header@header$META["date",1] <- paste(Sys.time())
  
  ## If no variants found set to zero..
  if(isNull)
    v <- v[NULL]
  
  return(v)
}
mg14/deepSNV-old documentation built on May 22, 2019, 8:52 p.m.