PopSV: Population-based detection of structural variants from Read-Depth signal

Documented in draw.controls

##' Find random genomic regions with the same properties as a set of input regions. The properties are defined using annotation tracks. For example, we can simulate random regions that overlap genes as much as the input regions. It can be used to perform enrichment analysis while controling for specific properties.
##' @title Select control genomic regions for enrichment analysis
##' @param cnv.gr the input regions
##' @param feat.grl a list of the GRanges defining the features to fit. E.g. gene annotation, centromere, etc
##' @param nb.class the number of size class to speed up the computation. Default is 100. Inf will ensure exactly the same overlap proportion but might take some time if the size distribution is diverse.
##' @param nb.cores the number of cores to use. Default is 3.
##' @param redo.duplicates should duplicate regions be reselected. Default is TRUE.
##' @param seed.nb.max the maximum number of genomic position to use as seeds. Default is 1e5.
##' @param min.nb.gr the minimum number of control regions. If NULL (default), as many controls as input are simulated.
##' @param chr.prefix the chromosome name prefix. Default is "" (no prefix). Other value could be "chr" if chromosome are defined as 'chr1', 'chr2', etc.
##' @param dist.gr a GRanges defining the feature for which we want to control the distance to. Default is NULL, i.e. no control.
##' @return a GRanges object defining the control regions
##' @author Jean Monlong
##' @export
draw.controls <- function(cnv.gr, feat.grl, nb.class=100, nb.cores=3, redo.duplicates=TRUE, seed.nb.max=1e5, min.nb.gr=NULL, chr.prefix="", dist.gr=NULL){

  randGR.bp <- function(n=10){
    seql.1.22 = seqlengths(BSgenome.Hsapiens.UCSC.hg19::BSgenome.Hsapiens.UCSC.hg19)[paste0("chr",1:22)]
    chrs = apply(stats::rmultinom(n, 1, (seql.1.22/1e3)/sum(seql.1.22/1e3)),2,function(e)which(e==1))
    starts = stats::runif(n, 0, seql.1.22[chrs])
    return(GenomicRanges::GRanges(paste0(chr.prefix, chrs), IRanges::IRanges(starts, width=1)))
  }
  if(is.data.frame(cnv.gr)){
    cnv.gr = GenomicRanges::makeGRangesFromDataFrame(cnv.gr, keep.extra.columns = TRUE)
  }
  if(!is.null(min.nb.gr) && min.nb.gr > length(cnv.gr)){
    cnv.gr = cnv.gr[sample.int(length(cnv.gr), min.nb.gr, replace=TRUE)]
  }
  if(is.list(feat.grl) && is.data.frame(feat.grl[[1]])){
    feat.grl = lapply(feat.grl, GenomicRanges::makeGRangesFromDataFrame)
  }
  d.gr = randGR.bp(max(seed.nb.max, 3*length(cnv.gr)))
  d.l = parallel::mclapply(feat.grl, function(feat.gr){
    dtn = GenomicRanges::distanceToNearest(d.gr, feat.gr)
    ret = rep(stats::median(as.data.frame(dtn)$distance), length(d.gr))
    ret[S4Vectors::queryHits(dtn)] = as.data.frame(dtn)$distance
    ret
  }, mc.cores=nb.cores)
  d.df = as.data.frame(d.l)
  ol.l = parallel::mclapply(feat.grl, function(feat.gr)IRanges::overlapsAny(cnv.gr, feat.gr), mc.cores=nb.cores)
  if(is.null(cnv.gr$sample)) {
    cnv.gr$sample = ""
  }
  if(!is.null(dist.gr)){
    dtn = GenomicRanges::distanceToNearest(cnv.gr, dist.gr, ignore.strand=TRUE)
    cnv.gr$dist.feat = stats::median(as.data.frame(dtn)$distance)
    cnv.gr$dist.feat[S4Vectors::queryHits(dtn)] = as.data.frame(dtn)$distance
    dtn = GenomicRanges::distanceToNearest(d.gr, dist.gr)
    d.gr$dist.feat = stats::median(as.data.frame(dtn)$distance)
    d.gr$dist.feat[S4Vectors::queryHits(dtn)] = as.data.frame(dtn)$distance
  }
  cnv.df = data.frame(as.data.frame(ol.l), sample=cnv.gr$sample)
  ol.prof = unique(cnv.df[,1:length(feat.grl), drop=FALSE])
  null.gr = parallel::mclapply(1:nrow(ol.prof), function(ii){
    gr.ii = cnv.gr[which(apply(t(as.matrix(cnv.df[,1:length(feat.grl), drop=FALSE]))==unlist(ol.prof[ii,]),2,all))]
    widths = width(gr.ii)
    ## w.class = cut(widths, breaks=c(0,unique(quantile(widths, probs=ppoints(nb.class))), Inf))
    ## w.class = factor(w.class, levels=unique(w.class))
    w.uniq = unique(widths)
    if(length(w.uniq) < nb.class){
      w.class = widths
    } else {
      w.class = stats::cutree(stats::hclust(stats::dist(w.uniq), method="ward.D"), nb.class)
      names(w.class) = as.character(w.uniq)
      w.class = w.class[as.character(widths)]
    }
    tt = tapply(1:length(gr.ii), w.class, function(iii){
      w.i = widths[iii]
      demi.w.min = min(w.i, na.rm=TRUE) / 2
      demi.w.max = max(w.i, na.rm=TRUE) / 2
      good.d = colSums((t(as.matrix(d.df))-ifelse(unlist(ol.prof[ii,]),demi.w.min,demi.w.max))*ifelse(unlist(ol.prof[ii,]),1,-1)<0)
      nb.feat.good = ncol(ol.prof)
      while(all(good.d!=nb.feat.good)){
        nb.feat.good = nb.feat.good - 1
      }
      good.d = which(good.d==nb.feat.good)
      if(!is.null(dist.gr) & length(good.d) > length(w.i)){
        good.d = utils::head(good.d[sapply(gr.ii$dist.feat[iii], function(d)which.min(abs(d-d.gr$dist.feat[good.d])))], length(w.i))
      } else {
        good.d = sample(good.d, length(w.i), length(good.d) < length(w.i))
      }
      gr = GenomicRanges::resize(d.gr[good.d], w.i, fix="center")
      gr$sample = gr.ii$sample[iii]
      gr
    })
    res = unlist(do.call(GenomicRanges::GRangesList, tt[which(!unlist(lapply(tt,is.null)))]))
    names(res) = NULL
    res
  }, mc.cores=nb.cores)
  null.gr = unlist(do.call(GenomicRanges::GRangesList, null.gr))
  if(all(unique(null.gr$sample)=="")) {
    null.gr$sample = NULL
  }
  null.gr$dist.feat = NULL
  if(redo.duplicates && any((dup = duplicated(as.data.frame(null.gr))))){
    redo.gr = draw.controls(null.gr[which(dup)], feat.grl, nb.class=nb.class, nb.cores=nb.cores, redo.duplicates=FALSE, min.nb.gr=min.nb.gr, chr.prefix = chr.prefix, dist.gr=dist.gr)
    null.gr = c(null.gr[which(!dup)], redo.gr)
  }
  null.gr
    
}

jmonlong/PopSV documentation built on Sept. 15, 2019, 9:29 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

jmonlong/PopSV
Population-based detection of structural variants from Read-Depth signal

R/draw.controls.R
In jmonlong/PopSV: Population-based detection of structural variants from Read-Depth signal

Defines functions draw.controls

Documented in draw.controls

R Package Documentation

Browse R Packages

We want your feedback!

jmonlong/PopSV Population-based detection of structural variants from Read-Depth signal

R/draw.controls.R In jmonlong/PopSV: Population-based detection of structural variants from Read-Depth signal

Defines functions draw.controls

Documented in draw.controls

R Package Documentation

Browse R Packages

We want your feedback!

jmonlong/PopSV
Population-based detection of structural variants from Read-Depth signal

R/draw.controls.R
In jmonlong/PopSV: Population-based detection of structural variants from Read-Depth signal