readDepth: Find regions of genomic copy number loss and gain from short reads

Documented in addMapability binParams calcWindParams dividePeaks fdrRate initRdClass ploidyPercentages readParameters

library('methods')
library('foreach')
library('doMC')
library('IRanges')


##-------------------------------------------------
## Register the S4 class "rdObject" and its
## "initialize" method. The initializer fills the
## params, entrypoints, readInfo and binParams
## slots; the chrs slot is declared but not set here.
##
initRdClass <- function(){
  setClass("rdObject",
           slots=c(params="numeric",
                   binParams="data.frame",
                   entrypoints="data.frame",
                   readInfo="data.frame",
                   chrs="list"))

  ## each step uses what the previous ones stored on the
  ## object, so the order of these statements matters
  fillRdObject <- function(.Object){
    .Object@params <- readParameters()
    .Object@entrypoints <- readEntrypoints()
    .Object@entrypoints <- addMapability(.Object@entrypoints)
    verifyFiles(.Object@entrypoints$chr)
    .Object@readInfo <- getReadInfo()
    .Object@binParams <- binParams(.Object@params, .Object@entrypoints, .Object@readInfo)
    return(.Object)
  }

  setMethod("initialize", "rdObject", fillRdObject)
}

##-------------------------------------------------
## calculate the appropriate bin size and the
## gain/loss thresholds
##
## params      - named numeric vector; reads "readLength", "fdr",
##               "overDispersion", "gcWindowSize" and, if present,
##               "medAdjustment"
## entrypoints - data frame with (at least) the columns length and
##               mapPerc; also handed to ploidyPercentages()
## readInfo    - passed unchanged to getNumReads()
##
## returns a one-row data frame with the columns binSize,
## lossThresh, gainThresh, med, hapPerc, dipPerc and tripPerc
##
## progress messages are printed when the global flag `verbose`
## (set by readParameters()) is TRUE
##
binParams <-function(params,entrypoints,readInfo){

  ## count the total number of reads
  numReads = getNumReads(readInfo)

  ## get genome size from entrypoints file, adjust by the
  ## length-weighted mean of the mappability estimates
  genomeSize = sum(entrypoints$length)
  mapPerc=sum(entrypoints$mapPerc*entrypoints$length)/sum(entrypoints$length)
  effectiveGenomeSize = genomeSize * mapPerc

  if(verbose){
    cat(numReads," total reads\n")
    cat("genome size:",genomeSize,"\n")
    cat("genome mapability percentage:",mapPerc,"\n")
    cat("effectiveGenomeSize:",effectiveGenomeSize,"\n")
  }


  ## median value has to be adjusted if we have chromosomes with single ploidy
  ## or expected copy number alterations
  ploidyPerc = ploidyPercentages(effectiveGenomeSize,entrypoints,params)

  if(verbose){
    cat("expect ",
        ploidyPerc$haploidPerc*100,"% haploid,",
        ploidyPerc$diploidPerc*100,"% diploid,",
        ploidyPerc$triploidPerc*100,"% triploid\n")
  }

  ## est. coverage of genome by reads
  coverage = numReads * params["readLength"] / effectiveGenomeSize
  if(verbose){
    cat("approx.",coverage," X coverage of mappable genome \n")
  }

  ## calculate window size based on triploid peak, since it always
  ## produces larger (more conservative) windows
  ploidy = 3

  ## optional user-supplied scaling of the median. This must not depend
  ## on `verbose`: it changes the numeric result, not just the logging.
  medAdj = 1
  if("medAdjustment" %in% names(params)){
    medAdj = params[["medAdjustment"]]
  }

  pTrip <- calcWindParams(numReads=numReads,
                          fdr=params["fdr"],
                          genomeSize=effectiveGenomeSize,
                          oDisp=params["overDispersion"],
                          ploidy=ploidy,
                          minSize=params["gcWindowSize"],
                          ploidyPerc=ploidyPerc,
                          medAdj=medAdj)


  ## bin size is rounded to the nearest multiple of 100
  binSize = pTrip$binSize
  binSize = round(binSize/100)*100
  med=pTrip$med
  if(verbose){
    cat("expected mean: ",numReads/(effectiveGenomeSize/binSize),"\n")
    cat("adjusted mean: ",med,"\n")
  }

  ## calculate separation peak for haploid peak
  pHap = fdrRate(binSize, 1, effectiveGenomeSize, numReads, params["overDispersion"], ploidyPerc, medAdj)

##   ##plot the output for later review
##   pdf("output/cnSeparation.pdf")
##   plotWindows(pTrip$binSize, effectiveGenomeSize, pHap$div, pTrip$div, params["fdr"], numReads, params["overDispersion"], ploidyPerc, med)
##   dev.off()

  return(data.frame(binSize=binSize, lossThresh=pHap$div, gainThresh=pTrip$div, med=med, hapPerc=ploidyPerc$haploidPerc, dipPerc=ploidyPerc$diploidPerc, tripPerc=ploidyPerc$triploidPerc))
}



##-------------------------------------------------
## read the parameters from the file "params" in the
## working directory (first column: name, second
## column: value)
##
## returns a numeric vector named by parameter
##
## side effects:
##  - sets the global flag `verbose` (TRUE unless the file
##    has a "verbose" entry whose value is one of
##    false / f / FALSE / F / 0)
##  - sets options(cores=) when "maxCores" is given
##
## stops if any of readLength, fdr, overDispersion,
## gcWindowSize, percCNGain or percCNLoss is missing or
## not numeric
##
readParameters <- function(){
  p <- read.table("params", as.is=TRUE)

  verboseRow <- which(p[,1] == "verbose")
  if("verbose" %in% p$V1){
    given <- p[verboseRow,2]
    switchedOff <- given == "false" | given == "f" |
                   given == "FALSE" | given == "F" |
                   given == "0"
    ## store the flag as 0/1 so the column converts cleanly below
    if(switchedOff){
      verbose <<- FALSE
      p[verboseRow,2] <- 0
    } else {
      verbose <<- TRUE
      p[verboseRow,2] <- 1
    }
  } else {
    ## no entry in the file: logging is on by default
    verbose <<- TRUE
  }

  params <- as.numeric(p[,2])
  names(params) <- p[,1]

  ## an absent name and an unparsable value both show up as NA
  needed <- c("readLength", "fdr", "overDispersion",
              "gcWindowSize", "percCNGain", "percCNLoss")
  if(any(is.na(params[needed]))){
    stop("a needed parameter is missing. Check your params file.")
  }

  #set multicore options if specified
  coresGiven <- !(is.na(params["maxCores"]))
  if(coresGiven){
    options(cores = as.numeric(params["maxCores"]))
  }

  return(params)
}



##--------------------------------------------------
## split the effective genome into haploid, diploid and
## triploid fractions, based on the fact that we may have
## haploid chromosomes and/or expected CN alterations
##
## effectiveGenomeSize - denominator for the fractions
## ents   - data frame with the columns length, mapPerc, ploidy
## params - must contain "percCNLoss" and "percCNGain"
##
## returns a one-row data frame: haploidPerc, diploidPerc,
## triploidPerc
ploidyPercentages <- function(effectiveGenomeSize,ents,params){
  mappableLength <- ents$length*ents$mapPerc
  cnLoss <- params[["percCNLoss"]]
  cnGain <- params[["percCNGain"]]

  ## share of the effective genome on chromosomes with ploidy n
  shareWithPloidy <- function(n){
    sum(mappableLength[which(ents$ploidy==n)])/effectiveGenomeSize
  }

  ## expected losses move from the diploid to the haploid share,
  ## expected gains move from the diploid to the triploid share
  data.frame(haploidPerc=shareWithPloidy(1) + cnLoss,
             diploidPerc=shareWithPloidy(2) - cnLoss - cnGain,
             triploidPerc=cnGain)
}


##--------------------------------------------------
## adds a numeric mapPerc column (fraction of mappable
## sequence, default 1) to the entrypoints table and
## returns the table
##
## if the totals file annotations/mapability/totalMappablePerc
## exists, its values are used; else the per-chromosome files
## annotations/mapability/<chr>.dat.gz are averaged and the
## totals file is created first. Chromosomes that the totals
## file does not mention keep the default.
##
addMapability <-function(entrypoints){
  mapDir <- "annotations/mapability/"
  totalsFile <- "annotations/mapability/totalMappablePerc"

  #default is 100% mapability
  entrypoints <- cbind(entrypoints, mapPerc=rep(1,length(entrypoints$chr)))

  #file doesn't exist - create it
  if(!(file.exists(totalsFile))){
    ## mean of all values stored in one gzipped annotation file
    meanOfFile <- function(filename){
      vals <- scan(gzfile(filename), what=0, quiet=TRUE)
      return(sum(vals)/length(vals))
    }

    ## flat vector of pairs: chr1, perc1, chr2, perc2, ...
    pairs <- foreach(e=entrypoints$chr, .combine="append") %do%{
      c(e, meanOfFile(paste0(mapDir, e, ".dat.gz")))
    }
    ##now, place map perc in the appropriate part of the table
    for(k in seq(1,length(pairs),2)){
      entrypoints$mapPerc[which(entrypoints$chr==pairs[k])] <- pairs[k+1]
    }
    write.table(entrypoints[,c("chr","mapPerc")], file=totalsFile,
                sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE)
  }

  ## the totals file is read in both cases, so a freshly created
  ## cache and an existing one are applied the same way
  cached <- scan(totalsFile, what="", quiet=TRUE)
  for(k in seq(1,length(cached),2)){
    perc <- as.numeric(cached[k+1])
    if(cached[k] %in% entrypoints$chr){
      entrypoints$mapPerc[which(entrypoints$chr==cached[k])] <- perc
    }
  }

  entrypoints$mapPerc <- as.numeric(entrypoints$mapPerc)
  return(entrypoints)
}



##--------------------------------------------------
## calculate the FDR and the optimal dividing line between
## the diploid peak and the peak of the given ploidy, for
## windows of size windSize
##
## ploidyPerc needs the columns haploidPerc, diploidPerc and
## triploidPerc; medAdj is a multiplier applied to the median
##
## returns a data frame with the columns fdr, div and med
##
fdrRate <- function(windSize,ploidy,genomeSize,numReads,oDisp,ploidyPerc, medAdj){
  numWinds <- genomeSize/windSize

  ## genome size re-weighted by copy number relative to diploid:
  ## haploid sequence counts half, triploid one and a half
  hapPart <- genomeSize*ploidyPerc$haploidPerc/2
  tripPart <- genomeSize*ploidyPerc$triploidPerc*3/2
  dipPart <- genomeSize*ploidyPerc$diploidPerc

  ## expected reads per diploid window, and per window of `ploidy`
  med <- (numReads/((hapPart + tripPart + dipPart) / windSize))*medAdj
  medAlt <- med*(ploidy/2)

  ## dividePeaks() is called with the lower peak first
  divFdr <- if(ploidy < 2){
    dividePeaks(medAlt, med,
                numWinds*ploidyPerc$haploidPerc,
                numWinds*ploidyPerc$diploidPerc,
                oDisp)
  } else {
    dividePeaks(med, medAlt,
                numWinds*ploidyPerc$diploidPerc,
                numWinds*ploidyPerc$triploidPerc,
                oDisp)
  }

  data.frame(fdr=divFdr$fdr, div=divFdr$div, med=med)
}


##--------------------------------------------------
## find the optimal dividing line between the
## two specified peaks. Window counts are modelled
## with a negative binomial distribution (pnbinom)
## whose size depends on the overdispersion oDisp
##
## amed, bmed - means of the lower (a) and upper (b) peak
## aNum, bNum - number of windows belonging to each peak
##
## candidate thresholds are amed+1, amed+2, ... up to bmed-1.
## A window of peak a above the threshold, or of peak b at or
## below it, counts as mislabeled; the threshold with the
## fewest mislabeled windows wins.
##
## returns a data frame with div (the chosen threshold) and
## fdr (mislabeled windows / total windows at that threshold)
##
dividePeaks <- function(amed,bmed,aNum,bNum,oDisp){
  thresholds <- (amed+1):(bmed-1)

  ## NOTE(review): size is evaluated as (med/oDisp)-1. If oDisp is meant
  ## to be the variance/mean ratio, med/(oDisp-1) would be the matching
  ## negative binomial size - confirm before changing, since this moves
  ## every threshold.
  mislabeledWinds <- function(thresh){
    low <- (1-pnbinom(thresh,size=(amed/oDisp-1),mu=amed))*aNum
    high <- (pnbinom(thresh,size=(bmed/oDisp-1),mu=bmed))*bNum
    return(low+high)
  }

  vals <- vapply(thresholds, mislabeledWinds, numeric(1))

  ## take the threshold values themselves rather than rebuilding them
  ## from their index, so div is always a value that was evaluated
  candidates <- thresholds[which(vals == min(vals))]

  ##when several thresholds tie, choose the one closest to the
  ##halfway point between the peaks (first one on a further tie)
  halfway <- (amed+bmed)/2
  div <- candidates[which.min(abs(candidates-halfway))]

  return(data.frame(div=div, fdr=mislabeledWinds(div)/(aNum+bNum)))
}



#-------------------------------------------------
# Calculates the window size that conforms to the
# given FDR rate and the threshold that best
# separates the peaks of ploidy
#
# numReads, genomeSize, oDisp, ploidy, ploidyPerc and medAdj are
# passed on to fdrRate(); fdr is the target rate; minSize is the
# smallest window allowed and the unit the result is rounded to;
# startDiv sets the first window tried (genomeSize/startDiv)
#
# returns a data frame with binSize, div and med
# stops if even the starting window exceeds the target FDR
#
calcWindParams <- function(numReads,fdr,genomeSize,oDisp, ploidy, minSize, ploidyPerc, medAdj, startDiv=100){
  ## answer has to be this close to the FDR rate (on the lower side)
  tolerance <- 0.005
  #starting point for search
  windSize <- genomeSize/startDiv

  p <- fdrRate(windSize,ploidy,genomeSize,numReads,oDisp,ploidyPerc, medAdj)

  if(p$fdr > fdr){
    stop("not enough reads to achieve the desired FDR rate")
  }

  ## first, halve size until we get above FDR threshold
  found <- FALSE
  while(!found && (windSize > minSize)){
    windSize <- round(windSize / 2)
    p <- fdrRate(windSize,ploidy,genomeSize,numReads,oDisp,ploidyPerc, medAdj)

    if(p$fdr > fdr){
      found <- TRUE
    }
  }

  if(windSize > minSize){
    ## zero in on a size that's within the desired parameters
    found <- FALSE
    adj <- windSize/2

    ## windSize is a whole number here, so once adj drops below 0.5
    ## round(windSize +/- adj) cannot move it any more. Without the
    ## adj check the loop would never end when the FDR jumps across
    ## the tolerance band between two neighbouring window sizes.
    while(!found && (windSize > minSize) && (adj >= 0.5)){
      if(p$fdr > fdr){
        windSize <- round(windSize + adj)
        p <- fdrRate(windSize,ploidy,genomeSize,numReads,oDisp,ploidyPerc, medAdj)

      }else if(p$fdr < (fdr - tolerance)){
        windSize <- round(windSize - adj)
        p <- fdrRate(windSize,ploidy,genomeSize,numReads,oDisp,ploidyPerc, medAdj)

      } else{
        found <- TRUE
      }
      adj <- adj/2
    }
  }

  ##if the window size is below the minimum size, have to
  ##recalculate params
  if(windSize < minSize){
    windSize <- minSize
  } else {
    ## round down to a multiple of minSize
    windSize <- floor(windSize/minSize)*minSize
  }

  ## the size may have changed above, so evaluate it once more
  p <- fdrRate(windSize,ploidy,genomeSize,numReads,oDisp,ploidyPerc, medAdj)
  med <- p$med
  div <- p$div

  return(data.frame(binSize=windSize,div=div,med=med))
}

chrisamiller/readDepth documentation built on June 16, 2021, 2:05 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

chrisamiller/readDepth
Find regions of genomic copy number loss and gain from short reads

R/rd.object.R
In chrisamiller/readDepth: Find regions of genomic copy number loss and gain from short reads

Defines functions calcWindParams dividePeaks fdrRate addMapability ploidyPercentages readParameters binParams initRdClass

Documented in addMapability binParams calcWindParams dividePeaks fdrRate initRdClass ploidyPercentages readParameters

R Package Documentation

Browse R Packages

We want your feedback!

chrisamiller/readDepth Find regions of genomic copy number loss and gain from short reads

R/rd.object.R In chrisamiller/readDepth: Find regions of genomic copy number loss and gain from short reads

Defines functions calcWindParams dividePeaks fdrRate addMapability ploidyPercentages readParameters binParams initRdClass

Documented in addMapability binParams calcWindParams dividePeaks fdrRate initRdClass ploidyPercentages readParameters

R Package Documentation

Browse R Packages

We want your feedback!

chrisamiller/readDepth
Find regions of genomic copy number loss and gain from short reads

R/rd.object.R
In chrisamiller/readDepth: Find regions of genomic copy number loss and gain from short reads