R/shattered.regions.r

Defines functions shattered.eval shattered.regions

Documented in shattered.eval shattered.regions

#' Class to store shattered regions and information produced by shattered.regions and shattered.regions.cnv functions
#' @param regions.summary (list): a list of data.frames sumarizing the information of shattered regions found in each sample
#' @param high.density.regions (matrix): a numeric matrix representing high breakpoint density genomic bins in each sample (values 1 = high density break; 0 = normal)
#' @param high.density.regions.hc (matrix): a numeric matrix representing high breakpoint density genomic bins in each sample (values 1 = high density break; 0 = normal). 
#' Only those bins that overlap with high confidence regions defined in regions.summary are set to = 1 
#' @param cnv.brk.dens (matrix): a numeric matrix representing the number of CNV segmentation breakpoints found in at genomic bins in each sample
#' @param svc.brk.dens (matrix): a numeric matrix representing the number of SV breakpoints found at genomic bins in each sample
#' @param cnv.brk.common.dens (matrix): a numeric matrix representing the number of CNV breakpoints colocalizing SV breakpoints found at genomic bins in each sample
#' @param svc.brk.common.dens (matrix): a numeric matrix representing the number of SV breakpoints colocalizing CNV breakpoints found at genomic bins in each sample
#' @param cnvbrk (S4): on object generated by cnv.breaks function 
#' @param svcbrk (S4): on object generated by svc.breaks function 
#' @param common.brk (list): on object generated by match.breaks function 
#' @param cnv (S4) an object of class svcnvio containing data type 'cnv' validated by validate.cnv
#' @param svc (S4) an object of class svcnvio containing data type 'svc' validated by validate.svc
#' @param param (list): list of configuration parameters provided or set as default
#' @return an instance of the class 'chromo.regs' containing breakpoint mapping onto genes
#' @export
chromo.regs <- setClass("chromo.regs",
                        representation(
                            regions.summary  = "list",
                            high.density.regions = "matrix",
                            high.density.regions.hc = "matrix",
                            cnv.brk.dens = "matrix",
                            svc.brk.dens = "matrix",
                            cnv.brk.common.dens = "matrix",
                            svc.brk.common.dens = "matrix",
                            cnvbrk = 'breaks',
                            svcbrk = "breaks",
                            common.brk = "list",
                            cnv = "svcnvio",
                            svc = "svcnvio",
                            param = "list"
                        ))


setMethod("show","chromo.regs",function(object){
    
    conf <- table(names(unlist(lapply(unname(object@regions.summary),function(x) table(x[,"conf"])))))
    
    writeLines(paste("An object of class chromo.regs from svpluscnv containing the following stats:",
                "\nNumber of samples tested=",nrow(object@high.density.regions),
                "\nNumber of samples with shattered regions=",length(object@regions.summary),
                "\nNumber of samples with high-confidence shattered regions=",conf["HC"],
               "\nNumber of samples with low-confidence shattered regions=",conf["lc"]))
})

#' Evaluate shattered regions based on interleaved breaks and breakpoint dispersion parameters
#' @param chromo.regs.obj (chromo.regs) An object of class chromo.regs 
#' @param interleaved.cut (numeric) the percentage of non interleaved structural variant calls 
#' @param dist.iqm.cut (numeric) interquantile average of the distance between breakpoints within a shattered region
#' @param verbose (logical)
#' @return an instance of the class 'chromo.regs' containing breakpoint mapping onto genes
#' @export
#' 


shattered.eval <- function(chromo.regs.obj,
                           interleaved.cut=0.5,
                           dist.iqm.cut=100000,
                           verbose=TRUE){
    
    svcdat <- chromo.regs.obj@svc@data
    cnvbrk  <- chromo.regs.obj@cnvbrk
    svcbrk <- chromo.regs.obj@svcbrk
    
    if(verbose){
        message(paste("Evaluating shattered regions in ",length(names(chromo.regs.obj@regions.summary))," samples",sep="") )
        pb <- txtProgressBar(style=3)
        cc <-0
        tot <- length(chromo.regs.obj@regions.summary)
    }
    
    for(cl in names(chromo.regs.obj@regions.summary)){
        
        if(verbose) cc <- cc+1
        if(verbose) setTxtProgressBar(pb, cc/tot)
        
        regions <-   chromo.regs.obj@regions.summary[[cl]]
        br1 <- cnvbrk@breaks[which(cnvbrk@breaks$sample == cl),2:3]
        br2 <- svcbrk@breaks[which(svcbrk@breaks$sample == cl),2:3]
        colnames(br1) <- colnames(br2) <- c("chrom","pos")
        br1.gr <- with(br1, GRanges(chrom, IRanges(start=pos, end=pos)))
        br2.gr <- with(br2, GRanges(chrom, IRanges(start=pos, end=pos)))
        
        c.br1 <- chromo.regs.obj@common.brk$brk1_match[which(chromo.regs.obj@common.brk$brk1_match$sample == cl),2:3]
        c.br2 <- chromo.regs.obj@common.brk$brk2_match[which(chromo.regs.obj@common.brk$brk2_match$sample == cl),2:3]
        colnames(c.br1) <- colnames(c.br2) <- c("chrom","pos")
        c.br1.gr <- with(c.br1, GRanges(chrom, IRanges(start=pos, end=pos)))
        c.br2.gr <- with(c.br2, GRanges(chrom, IRanges(start=pos, end=pos)))
        
        regions_gr <- with(regions, GRanges(chrom, IRanges(start=start, end=end)))
        hits_1 = GenomicAlignments::findOverlaps(regions_gr,br1.gr)
        hits_2 = GenomicAlignments::findOverlaps(regions_gr,br2.gr)
        
        c.hits_1 = GenomicAlignments::findOverlaps(regions_gr,c.br1.gr)
        c.hits_2 = GenomicAlignments::findOverlaps(regions_gr,c.br2.gr)
        
        svdat_i <- svcdat[which(svcdat$sample == cl),]
        sv_ranges_ori <-   with(svdat_i, GRanges(chrom1, IRanges(start=pos1, end=pos1)))
        sv_ranges_dest <-   with(svdat_i, GRanges(chrom2, IRanges(start=pos2, end=pos2)))
        hits_ori = GenomicAlignments::findOverlaps(regions_gr,sv_ranges_ori)
        hits_dest = GenomicAlignments::findOverlaps(regions_gr,sv_ranges_dest)
        
        # calculate interleaved score, n.breaks, dispersion, and median distance between consecutive breaks
        start <- end <- reg.size <- dist.iqm.cnv <- dist.iqm.sv <- n.brk.cnv <- n.brk.sv <- n.orth.cnv <- n.orth.sv <- interleaved <- rep(0,nrow(regions))
        conf <- rep("HC",nrow(regions))
        for(i in 1:nrow(regions)){
            sites1 <- sort(unique(br1[subjectHits(hits_1)[which(queryHits(hits_1) == i)],]$pos))
            dist.iqm.cnv[i]  <- IQM(sites1[2:length(sites1)] - sites1[1:(length(sites1)-1) ],lowQ = 0.2,upQ = 0.8)
            n.brk.cnv[i] <- length(sites1)  
            
            sites2 <- sort(unique(br2[subjectHits(hits_2)[which(queryHits(hits_2) == i)],]$pos))
            dist.iqm.sv[i]  <- IQM(sites2[2:length(sites2)] - sites2[1:(length(sites2)-1) ],lowQ = 0.2,upQ = 0.8)
            n.brk.sv[i] <- length(sites2)  
            
            c.sites1 <- sort(unique(c.br1[subjectHits(c.hits_1)[which(queryHits(c.hits_1) == i)],]$pos))
            n.orth.cnv[i] <- length(c.sites1)  
            
            c.sites2 <- sort(unique(c.br2[subjectHits(c.hits_2)[which(queryHits(c.hits_2) == i)],]$pos))
            n.orth.sv[i] <- length(c.sites2)  
            
            start[i] <- min(c(sites1,sites2))
            end[i] <- max(c(sites1,sites2))
            
            a<-svdat_i[subjectHits(hits_ori)[which(queryHits(hits_ori) == i)],]
            b<-svdat_i[subjectHits(hits_dest)[which(queryHits(hits_dest) == i)],]
            apos<-a$pos1
            names(apos) <- rownames(a)
            bpos<-b$pos2
            names(bpos) <- rownames(b)
            brkids <- names(sort(c(apos,bpos)))
            reg.size <- regions$end-regions$start
            ct<-0
            for(x in 2:length(brkids)) if(brkids[x] == brkids[x-1]) ct<-ct+1
            interleaved[i] <- ct/length(unique(brkids))
        }
        chrom <-regions[,"chrom"]
        nbins<-regions[,"nbins"]
        regions <- data.table(chrom,start,end,nbins)
        
        # Find links between shattered regions   
        if(nrow(chromo.regs.obj@regions.summary[[cl]]) > 1 ){
            record_mat<- matrix(nrow=0,ncol=2)
            links <- rep("",nrow(regions))
            for(i in 1:nrow(regions)){
                for(j in 1:nrow(regions)){
                    region_a_hits <- subjectHits(hits_ori)[which(queryHits(hits_ori) == i)]
                    region_b_hits <- subjectHits(hits_dest)[which(queryHits(hits_dest) == j)]
                    if(length(intersect(region_a_hits,region_b_hits)) > 0 ){
                        record_mat <- rbind(record_mat,c(i,j),c(j,i))
                    }
                }
            }
            for(i in 1:nrow(regions)){
                links[i] <- paste(setdiff(as.character(sort(unique(c(i, record_mat[which(record_mat[,1] == i),2]) ))),as.character(i)),collapse=",")
                if(length(grep("[0-9]",links[i])) == 0) links[i]<-"-"
            }
        }else{
            links <- "-"
        }
        if(!is.null(interleaved.cut)) conf[which(interleaved >= interleaved.cut)] <- "lc"
        if(!is.null(dist.iqm.cut)) conf[which(apply(cbind(dist.iqm.cnv,dist.iqm.sv),1,mean) < dist.iqm.cut)] <- "lc"
        conf[which(links != "-")] <- "HC"
        chromo.regs.obj@regions.summary[[cl]] <- data.table(regions,links,reg.size,dist.iqm.cnv,dist.iqm.sv,n.brk.cnv,n.brk.sv,n.orth.cnv,n.orth.sv,interleaved,conf)
    }
    if(verbose) close(pb)
    
    
    bins <- data.table(do.call(rbind,strsplit(colnames(chromo.regs.obj@high.density.regions)," ")),colnames(chromo.regs.obj@high.density.regions))
    colnames(bins) <- c("chrom","start","end","binid")
    bins$start <- as.numeric(bins$start)
    bins$end <- as.numeric(bins$end)

    binsGR <- with(bins, GRanges(chrom, IRanges(start=start, end=end)))
    highDensityRegionsHC <- chromo.regs.obj@high.density.regions
    for(cl in names(chromo.regs.obj@regions.summary)){
        lc <- chromo.regs.obj@regions.summary[[cl]][which(chromo.regs.obj@regions.summary[[cl]]$conf == "lc"),]
        if(nrow(lc) > 0){
            lcGR<- with(lc, GRanges(chrom, IRanges(start=start, end=end)))
            hits = GenomicAlignments::findOverlaps(binsGR,lcGR)
            highDensityRegionsHC[cl,bins$bins[unique(queryHits(hits)),]] <- 0
        }
    }
    chromo.regs.obj@high.density.regions.hc <- highDensityRegionsHC
    
    return(chromo.regs.obj)
}



#' Caller for shattered genomic regions based on breakpoint densities
#' @param cnv (S4) an object of class svcnvio containing data type 'cnv' validated by validate.cnv
#' @param svc (S4) an object of class svcnvio containing data type 'svc' validated by validate.svc
#' @param fc.pct (numeric) inherited from cnv.breaks(); copy number change between 2 consecutive segments: i.e (default) cutoff = 0.2 represents a fold change of 0.8 or 1.2
#' @param min.cnv.size (numeric) inherited from cnv.breaks(); The minimun segment size (in base pairs) to include in the analysis 
#' @param min.num.probes (numeric) inherited from cnv.breaks(); The minimun number of probes per segment to include in the analysis 
#' @param low.cov (data.frame) inherited from cnv.breaks(), svc.breaks() and match.breaks; a data.frame (chr, start, end) indicating low coverage regions to exclude from the analysis
#' @param clean.brk (numeric) inherited from cnv.breaks(); n of redundant breakpoints to filter out 
#' @param window.size (numeric) size in megabases of the genmome bin to compute break density 
#' @param slide.size (numeric) size in megabases of the sliding genmome window
#' @param num.cnv.breaks (numeric) number of segmentation breakpoints per segments to be considered high-density break 
#' @param num.cnv.sd (numeric) number of standard deviations above the sample average for num.cnv.breaks
#' @param num.svc.breaks (numeric) number of svc breakpoints per segments to be considered high-density break  
#' @param num.svc.sd (numeric) number of standard deviations above the sample average for num.svc.breaks
#' @param num.common.breaks (numeric) number of common SV and segmentation breakpoints per segments to be considered high-density break
#' @param num.common.sd (numeric) number of standard deviations above the sample average for num.common.breaks
#' @param maxgap (numeric) inherited from match.breaks(); sets the maximum gap between co-localizing orthogonal breakpoints
#' @param chrlist (character) vector containing chromosomes to include in the analysis; if NULL all chromosomes available in the input will be included
#' @param interleaved.cut (numeric) 0-1 value indicating percentage of interleaved (non-contiguous) SV breakpoint pairs
#' @param dist.iqm.cut (numeric) interquantile average of the distance between breakpoints within a shattered region
#' @return an instance of the class 'chromo.regs' containing breakpoint mapping onto genes
#' @keywords chromothripsis, chromoplexy, chromosome shattering
#' @export
#' @examples
#' 
#' ## validate input data.frames
#' cnv <- validate.cnv(segdat_lung_ccle)
#' svc <- validate.svc(svdat_lung_ccle)
#' 
#' shattered.regions(cnv,svc)


shattered.regions <- function(cnv, 
                              svc,
                              fc.pct = 0.2, 
                              min.cnv.size = 0, 
                              min.num.probes=0, 
                              low.cov = NULL,
                              clean.brk=NULL,
                              window.size = 10,
                              slide.size = 2,
                              num.cnv.breaks = 6, 
                              num.cnv.sd = 5,
                              num.svc.breaks = 6, 
                              num.svc.sd = 5,
                              num.common.breaks = 3, 
                              num.common.sd = 3,
                              maxgap=10000, 
                              chrlist=NULL, 
                              interleaved.cut = 0.5, 
                              dist.iqm.cut = 1e+05, 
                              verbose=TRUE){
  
    stopifnot(cnv@type == "cnv")
    cnvdat <- cnv@data
    
    stopifnot(svc@type == "svc")
    svcdat <- svc@data
    
  chr.lim <- chromosome.limit.coords(cnv)
  
  if(verbose) message("Finding CNV breakpoints")
  cnvbrk <- cnv.breaks(cnv = cnv, 
                       fc.pct = fc.pct, 
                       min.cnv.size = min.cnv.size, 
                       low.cov = low.cov, 
                       clean.brk = clean.brk,
                       verbose = verbose)
  
  if(verbose) message("Finding SV breakpoints")
  svcbrk <- svc.breaks(svc = svc, 
                      low.cov = low.cov)
  
  if(verbose) message("Matching CNV and SC breakpoints")
  common.brk <- match.breaks(cnvbrk,svcbrk,
                             maxgap = maxgap, 
                             verbose = verbose,
                             plot=FALSE)
  
  if(is.null(chrlist)) chrlist <- unique(c(cnvdat$chrom,svcdat$chrom1,svcdat$chrom2))
  chrlist <- chr.sort(chrlist)
  
  if(verbose) message("Mapping CNV breakpoints across the genome:")
  cnvbrk.dens <- break.density(cnvbrk, chr.lim = chr.lim, window.size = window.size, slide.size = slide.size, verbose = verbose)

  if(verbose) message("Mapping SV breakpoints across the genome:")
  svcbrk.dens <- break.density(svcbrk,chr.lim = chr.lim, window.size = window.size, slide.size = slide.size,verbose = verbose)
    
    burden1 <- common.brk$restab$matched.brk1
    burden2 <- common.brk$restab$matched.brk2
    names(burden1) <- names(burden2)  <- common.brk$restab$sample
    
  common.brk1 <- breaks(breaks = common.brk$brk1_match,
                      burden=burden1,
                      param=list(datatype="combined"))
  
  common.brk2 <- breaks(breaks = common.brk$brk2_match,
                       burden = burden2,
                       param=list(datatype="combined"))

  if(verbose) message("Mapping CNV validated breakpoints across the genome:")
  cnvbrk.common.dens <- break.density(common.brk1, chr.lim = chr.lim, window.size = window.size, slide.size = slide.size,verbose = verbose)
  
  if(verbose) message("Mapping SV validated breakpoints across the genome:")
  svcbrk.common.dens <- break.density(common.brk2,chr.lim = chr.lim, window.size = window.size, slide.size = slide.size,verbose = verbose)
  
  commonSamples <- intersect(names(cnvbrk@burden),names(svcbrk@burden))
  if(length(commonSamples) == 0) stop("There is no common samples between cnv and svc input datasets.") 
  
  # calculate inter quantile mean and standard deviation per sample
  iqmdata1<- rep(0,length(commonSamples))
  names(iqmdata1) <- commonSamples
  
  sddata1 <- iqmdata2 <- sddata2<- iqmdata3<- sddata3 <- iqmdata1
  iqmdata1[names(apply(cnvbrk.dens,1,IQM,lowQ=.1,upQ=.9))] <- apply(cnvbrk.dens,1,IQM,lowQ=.1,upQ=.9)
  sddata1[names(apply(cnvbrk.dens,1,IQSD,lowQ=.1,upQ=.9))]  <- apply(cnvbrk.dens,1,IQSD,lowQ=.1,upQ=.9)
  iqmdata2[names(apply(svcbrk.dens,1,IQM,lowQ=.1,upQ=.9))] <- apply(svcbrk.dens,1,IQM,lowQ=.1,upQ=.9)
  sddata2[names(apply(svcbrk.dens,1,IQSD,lowQ=.1,upQ=.9))]  <- apply(svcbrk.dens,1,IQSD,lowQ=.1,upQ=.9)
  iqmdata3[names(apply(cnvbrk.common.dens,1,IQM,lowQ=.1,upQ=.9))] <- apply(cnvbrk.common.dens,1,IQM,lowQ=.1,upQ=.9)
  sddata3[names(apply(cnvbrk.common.dens,1,IQSD,lowQ=.1,upQ=.9))]  <- apply(cnvbrk.common.dens,1,IQSD,lowQ=.1,upQ=.9)
  
  
  a <- sapply(commonSamples, function(i) names(which(cnvbrk.dens[i,] > iqmdata1[i]+num.cnv.sd*sddata1[i] )))
  b <- sapply(commonSamples, function(i) names(which(cnvbrk.dens[i,] >= num.cnv.breaks)))
  c <- sapply(commonSamples, function(i) names(which(svcbrk.dens[i,] > iqmdata2[i]+num.svc.sd*sddata2[i] )))
  d <- sapply(commonSamples, function(i) names(which(svcbrk.dens[i,] >= num.svc.breaks)))
  e <- sapply(commonSamples, function(i) names(which(cnvbrk.common.dens[i,] > iqmdata3[i]+num.common.sd*sddata3[i] )))
  f <- sapply(commonSamples, function(i) names(which(cnvbrk.common.dens[i,] >= num.common.breaks)))
  
  
  # condition for chromothripsis: at least n=breaks > 6 (svc SND cnv)  AND n-breaks > u+2*sd (svc AND cnv) 
  res <- sapply(commonSamples,function(i) Reduce(intersect, list(a[[i]],b[[i]],c[[i]],d[[i]],e[[i]],f[[i]])))

  highDensityRegions <- cnvbrk.dens[commonSamples,]
  highDensityRegions[] <- 0
  for(cl in commonSamples) highDensityRegions[cl,res[[cl]]] <- 1
  
  #res <- res[which(unlist(lapply(res,length)) >0)]
  if(verbose){
    message("Locating shattered regions...")
    pb <- txtProgressBar(style=3)
    cc <-0
    tot <- length(res)
  }
  
  restab  <- list()
  for(cl in names(res)){
    if(verbose) cc <- cc+1
    if(verbose) setTxtProgressBar(pb, cc/tot)
    if(length(res[[cl]]) > 0){
      tab <- data.table(do.call(rbind,strsplit(res[[cl]]," ")))
      colnames(tab) <- c("chrom","start","end")
      tab$start <- as.numeric(tab$start )
      tab$end <- as.numeric(tab$end )
      
      tabgr = with(tab, GRanges(chrom, IRanges(start=start, end=end))) 
      hits = as.data.frame(GenomicAlignments::findOverlaps(tabgr,tabgr))
      
      agg <- aggregate(subjectHits ~ queryHits, hits, paste,simplify=FALSE)
      prev<-c(); cnum <- 0
      agglist <- list()
      for(x in agg$subjectHits){
        if(length(intersect(x,prev) > 0)){
          agglist[[cnum]] <- unique(c(x,prev))
          prev <- agglist[[cnum]]
        }else{
          cnum <- cnum+1
          agglist[[cnum]]<- x
          prev <-agglist[[cnum]]
        }
      }
      agglistUniq <- list()
      for(i in 1:length(agglist)){
        chrom <- as.character(unique(tab[as.numeric(agglist[[i]]),"chrom"]))
        start<-min( tab[as.numeric(agglist[[i]]),"start"])
        end<-max( tab[as.numeric(agglist[[i]]),"end"])
        nbins <- length(agglist[[i]])
        conf <- "lc"
        agglistUniq[[i]] <-  data.table(chrom,start,end,nbins,conf)
      }
      tabmerged <- data.table(do.call(rbind,agglistUniq))
      restab[[cl]] <- tabmerged
    }
  }
  if(verbose) close(pb)
  
  results <- chromo.regs(
    regions.summary = restab,
    high.density.regions = highDensityRegions,
    high.density.regions.hc = matrix(),
    cnv.brk.dens = cnvbrk.dens,
    svc.brk.dens = svcbrk.dens,
    cnv.brk.common.dens = cnvbrk.common.dens,
    svc.brk.common.dens = svcbrk.common.dens,
    cnvbrk = cnvbrk,
    svcbrk = svcbrk,
    common.brk = common.brk,
    cnv = cnv,
    svc = svc,
    param = list(fc.pct = fc.pct,
                  min.cnv.size = min.cnv.size, 
                  min.num.probes=min.num.probes, 
                  low.cov = low.cov,
                  clean.brk=clean.brk,
                  window.size = window.size,
                  slide.size = slide.size, 
                  num.cnv.breaks = num.cnv.breaks,
                  num.cnv.sd = num.cnv.sd,
                  num.svc.breaks = num.svc.breaks,
                  num.svc.sd = num.svc.sd, 
                  num.common.breaks = num.common.breaks, 
                  num.common.sd = num.common.sd,
                  maxgap=maxgap,
                  chrlist=chrlist, 
                  interleaved.cut = interleaved.cut,
                  dist.iqm.cut = dist.iqm.cut)
        )
  
  results <- shattered.eval(results, interleaved.cut = interleaved.cut, dist.iqm.cut = dist.iqm.cut, verbose = verbose)
  
  return(results)
}
gonzolgarcia/svcnvplus documentation built on March 4, 2020, 6:02 p.m.