R/qc.sample.R

Defines functions qc.sample

Documented in qc.sample

##' Compute QC metrics for each sample informing how much it fits the reference samples. It can compute: the average distance between a sample and the 10% closest reference samples; the proportion of single-bin calls; the proportion of calls with copy number estimate close to 2. These metrics might be useful to interpret the calls. For example, an excess of calls in a sample which is globally an outlier in those QC is not so surprising.
##' @title QC samples
##' @param bins.df a data.frame with information about the bins. Columns 'chr', 'start', 'end' are required.
##' @param files.df a data.frame with information about the files, usually created by 'init.filenames'.
##' @param cnv.df a data.frame with PopSV calls, created by 'call.abnormal.cov'.
##' @param ref.samples a vector with the names of the samples used as reference. If NULL (default), all samples in 'files.df' are used.
##' @param n.subset the number of bins to use to compute the metrics. Default is 1000.
##' @param nb.cores number of cores to use. If higher than 1, \code{parallel}
##' package is used to parallelized the counting.
##' @return a data.frame with qc metrics for each sample of the input 'files.df'.
##' @author Jean Monlong
##' @export
qc.sample <- function(bins.df, files.df=NULL, cnv.df=NULL, ref.samples=NULL, n.subset=1000, nb.cores=1){

  if(!all(c("chr","start","end") %in% colnames(bins.df))){
    stop("Missing columns in 'bins.df'. Columns 'chr', 'start', 'end' are required.")
  }
  if(is.null(files.df) & is.null(cnv.df)){
    stop("At least one of 'files.df' or 'cnv.df' parameters must be non-NULL.")
  }
  if(is.null(ref.samples) & !is.null(files.df)){
    ref.samples = files.df$sample
  }

  res = NULL
  if(!is.null(files.df)){
    bc.df = quick.count(files.df, bins.df, col.files="bc.gc.gz", nb.rand.bins=n.subset, nb.cores=nb.cores)
    samples = files.df$sample
    bc.df = med.norm(bc.df, nb.cores=nb.cores, norm.stats.comp=FALSE)$bc.norm
    d.geom = as.matrix(stats::dist(t(bc.df[,samples])))
    colnames(d.geom) = samples
    ## d.o = rowMeans(d.geom[,ref.samples])/mean(d.geom[,ref.samples])
    d.o = apply(d.geom[,ref.samples], 1, function(d.f) mean(d.f[d.f<stats::quantile(d.f, probs=.1, na.rm=TRUE)], na.rm=TRUE)) / mean(d.geom[d.geom<stats::quantile(d.geom, probs=.1, na.rm=TRUE)], na.rm=TRUE)
    res = data.frame(sample=samples, dist.bc.ref=d.o)

    if(any(colnames(bins.df)=="m")){
      lowcov.bins.df = bins.df[order(bins.df$m),]
      lowcov.bins.df = lowcov.bins.df[utils::head(which(!is.na(lowcov.bins.df$sd)),n.subset),]
      bc.df = quick.count(files.df, lowcov.bins.df, col.files="bc.gc.gz", nb.rand.bins=n.subset, nb.cores=nb.cores)
      d.geom = as.matrix(stats::dist(scale(t(scale(bc.df[,samples], scale=FALSE)))))
      colnames(d.geom) = samples
      d.o = rowMeans(d.geom[,ref.samples])/mean(d.geom[,ref.samples])
      res$dist.bc.ref.lowcov = d.o
    }
  }

  if(!is.null(cnv.df)){
    nb.bin.cons = fc = NULL ## Uglily silence R checks
    res.cnv = dplyr::summarize(dplyr::group_by(cnv.df, sample), single.bin.prop=mean(nb.bin.cons==1, na.rm=TRUE), cn2.prop = mean(abs(fc[which(nb.bin.cons>2)])<.25, na.rm=TRUE))
    if(!is.null(res)){
      res = merge(res, res.cnv, all=TRUE)
      rownames(res) = NULL
    } else {
      res = res.cnv
    }
  }

  return(res)
}
jmonlong/PopSV documentation built on Sept. 15, 2019, 9:29 p.m.