R/rd.functions.R

Defines functions getAnnotations writeAlts writeSegs writeThresholds tripDist plotDist plotOverlappingHist plotSegs plotWindows logScore bedAnnotationLength chrName combineBins listify getChrPloidy getChrLength getNumReads verifyFiles getReadInfo readEntrypoints

Documented in bedAnnotationLength chrName combineBins getAnnotations getChrLength getChrPloidy getNumReads getReadInfo listify logScore plotDist plotOverlappingHist plotSegs plotWindows readEntrypoints tripDist verifyFiles writeAlts writeSegs writeThresholds

##-------------------------------------------------
## Common functions used by the readDepth package
##


##-------------------------------------------------
## load libraries

library('methods')
library('foreach')
library('IRanges')
library('doMC')


##-------------------------------------------------
## read in entrypoints file, return a data frame
##
readEntrypoints <- function(){
  p=read.table("annotations/entrypoints",sep="\t",quote="",colClasses=c("character","numeric","numeric"))
  names(p)=c("chr","length","ploidy")  
  return(p)  
}

##------------------------------------------------
##strip leading and trailing whitespace
##
trim <- function (x) gsub("^\\s+|\\s+$", "", x)


##--------------------------------------------------
##  Do a wc on each read file to find out how many reads there 
##  are. This is way faster than counting with R. Eventually,
##  we can write a little C function to do this so we don't  
##  require a shell with wc installed (allow windows port)
##
getReadInfo <- function(){
  ## check to see if they've already been counted
  lenFileName="reads/readInfo"
  len = 0
  
  if(file.exists(lenFileName)){
    flist = read.table(lenFileName)
    names(flist) = c("len","file")
  } else {
    lenList = foreach(e=Sys.glob("reads/*.bed"), .combine="append") %do%{
      strsplit(trim(system(paste('wc -l ', e, sep=""), intern=T)),"\\s+",perl=T)[[1]]
    }
    flist = data.frame(len=as.numeric(lenList[seq(1,length(lenList)-1,2)]), file=lenList[seq(2,length(lenList),2)])
    write.table(flist,file=lenFileName,sep="\t",quote=FALSE, row.names=FALSE, col.names=FALSE)
  }  
  return(flist)  
}


##--------------------------------------------------
##  make sure each entrypoint has a corresponding bed file
##
verifyFiles <- function(chrs){
  for(i in 1:length(chrs)){
    filename = paste("reads/",chrs[i],".bed",sep="")
    if(!(file.exists(filename))){
      stop("file: '",filename,"' does not exist\n")
    }
  }
}


##-------------------------------------------------
## sum up the already-stored # of reads
##
getNumReads <- function(flist){
  len = sum(as.numeric(flist$len))
  return(len)
}

##--------------------------------------------------
## generate a distribution by modelling the overdispersed
## poisson with the negative binomial distribution
## d = var/mean
##
rpois.od<-function (n, lambda,d=1) {
  if (d==1)
    rpois(n, lambda)
  else
    rnbinom(n, size=(lambda/(d-1)), mu=lambda)
}


##----------------------------------------------------
## return the length or ploidy for a given chr identifier
## 
getChrLength <- function(chr,entries){
  return(entries[which(entries$chr==chr),]$length)
}

getChrPloidy <- function(chr,entries){
  return(entries[which(entries$chr==chr),]$ploidy)
}


##----------------------------------------------------
## create a list of the appropriate size and place
## input in the appropriate bin 
##
listify <- function(input,chr){
  lst = vector("list")
  lst[[chr]] = input
  return(lst)
}


##----------------------------------------------------
## function to combine lists returned from parallel
## intersection procedure
##
combineBins <- function(a,b){ 
  for(i in 1:length(a)){
    name=names(a)[i]
    if(is.null(b$name)){
      b[[name]] = a[[i]]
    }
  }
  return(b)
}
 

##---------------------------------------------------
## convert chromosome number to chr name
##
chrName <- function(num){
  if(num == 23){ num <- "X"}
  if(num == 24){ num <- "Y"}
  
  return(paste("chr",num,sep=""))  
}


##---------------------------------------------------
## sum the total lengths of the annotations in a 
## bed file. This is equal to coverage if annotations
## are non-overlapping
##
bedAnnotationLength <- function(e){
  if(file.exists(e)){
    f=gzfile(e)
    a=scan(f,what=0,quiet=TRUE)
    close(f)
    return(sum((a[seq(2,(length(a)),2)]-a[seq(1,(length(a)-1),2)]+1)))
  }
  return(0)
}


##--------------------------------------------------
## convert read depth to log2 value,
## based on median
##
logScore <- function(val,med){
  if(is.na(val)){
    return(NA)

  }else if(val<=0){
    return(floor(log2(1/med)))

  }else{
    return(log2(val/med))
  }    
}


##----------------------------------------------
## given a set of parameters, plot the peaks
## and thresholds
##
plotWindows <- function(windowSize, genomeSize, divGain, divLoss, fdr, numReads, oDisp, ploidyPerc, med){
  numWinds <- genomeSize/windowSize
  
  ##generate distributions
  d2 <- rpois.od(numWinds*ploidyPerc$diploidPerc,med,oDisp)
  d3 <- rpois.od(numWinds*ploidyPerc$triploidPerc,(med*1.5),oDisp)
  d1 <- rpois.od(numWinds*ploidyPerc$haploidPerc,(med*0.5),oDisp)

  hrange=10*sqrt(med*oDisp)
  hist(d2,breaks=seq(-100000,100000,20),xlim=c(med-hrange,med+hrange),main=paste("Window Size: ",windowSize,sep=""))
  mtext(paste("FDR: ",round(fdr,5),sep=""))
  hist(d3,breaks=seq(-100000,100000,20),add=T,col="red")
  hist(d1,breaks=seq(-100000,100000,20),add=T,col="red")
  abline(v=med,col="blue")
  abline(v=med*(0.5),col="blue")
  abline(v=med*(1.5),col="blue")
  abline(v=divGain,col="green")
  abline(v=divLoss,col="green")
}

##----------------------------------------------
## plot the segments for a given chromosome
##
plotSegs <- function(rdo,segs,chr){

  st = 1
  sp = rdo@entrypoints[which(rdo@entrypoints$chr == chr),]$length
  
  winds = rdo@chrs[[chr]]$rd
  #print(winds)
  binSize = rdo@binParams$binSize  
  pos = seq(binSize/2,(((length(winds)-1)*binSize)-binSize/2),binSize)
  pos = append(pos,sp)
  
  par(mar=c(5, 4, 4, 4) + 0.1)
  plot(pos,winds,ylab="number of reads", xlab="position (bp)")

  abline(h=rdo@binParams$med,col="blue")
  abline(h=rdo@binParams$gainThresh,col="green")
  abline(h=rdo@binParams$lossThresh,col="green")

  asegs = segs[which(segs$chrom == chr),]
  for(i in 1:length(asegs[,1])){
    lines(c(asegs[i,2],asegs[i,3]), c(asegs[i,5],asegs[i,5]), col="red",lwd=3)
  }
  
  par(new=T)
  plot(-10000,-10000,ylim=c(0,(max(winds,na.rm=TRUE)/rdo@binParams$med)), xlim=c(1,sp),axes=F,xlab="", ylab="")
  axis(4, ylim=c(0,max(winds,na.rm=TRUE)/rdo@binParams$med), col="red",col.axis="red")
  mtext("Copy Number",side=4,col="red",line=2.5)

}


##--------------------------------------------------
## plot overlapping histograms
##
plotOverlappingHist <- function(a, b, colors=c("white","gray20","gray50"),
                                breaks=NULL, xlim=NULL, ylim=NULL){
  
  ahist=NULL
  bhist=NULL

  if(!(is.null(breaks))){
    ahist=hist(a,breaks=breaks,plot=F)
    bhist=hist(b,breaks=breaks,plot=F)
  } else {
    ahist=hist(a,plot=F)
    bhist=hist(b,plot=F)

    dist = ahist$breaks[2]-ahist$breaks[1]
    breaks = seq(min(ahist$breaks,bhist$breaks),max(ahist$breaks,bhist$breaks),dist)

    ahist=hist(a,breaks=breaks,plot=F)
    bhist=hist(b,breaks=breaks,plot=F)
  }

  if(is.null(xlim)){
    xlim = c(min(ahist$breaks,bhist$breaks),max(ahist$breaks,bhist$breaks))
  }

  if(is.null(ylim)){
    ylim = c(0,max(ahist$counts,bhist$counts))
  }

  overlap = ahist
  for(i in 1:length(overlap$counts)){
    if(ahist$counts[i] > 0 & bhist$counts[i] > 0){
      overlap$counts[i] = min(ahist$counts[i],bhist$counts[i])
    } else {
      overlap$counts[i] = 0
    }
  }

  plot(ahist, xlim=xlim, ylim=ylim, col=colors[1])
  plot(bhist, xlim=xlim, ylim=ylim, col=colors[2], add=T)
  plot(overlap, xlim=xlim, ylim=ylim, col=colors[3], add=T)
}


##--------------------------------------------------
## plot actual and expected histograms
##
plotDist <- function(rdo,xmax=NULL,filename="output/hist.pdf",windSize=NULL){

  bins=c()
  for(i in 1:length(names(rdo@chrs))){
    bins = append(bins,rdo@chrs[[names(rdo@chrs)[i]]]$rd)
  }

  if(is.null(xmax)){
    xmax=rdo@binParams$gainThresh*2 ##max(bins,na.rm=TRUE) ##sort(bins[round(length(bins)*0.999)])
  }
  
  if(is.null(windSize)){
    windSize = round(xmax/100)
  }

  len=length(bins)
  e=rdo@entrypoints
  breaks=seq(0,xmax+windSize,windSize)
  oDisp=rdo@params["overDispersion"]

  med=rdo@binParams$med 

  p1 = rpois.od(len*rdo@binParams$hapPerc,med/2,oDisp)
  p2=rpois.od(len*rdo@binParams$dipPerc,med,oDisp)
  p3 = rpois.od(len*rdo@binParams$tripPerc,(med*(3/2)),oDisp)
  
  p=append(append(p1,p2),p3)

  bins = bins[which(bins<xmax & bins>0)]
  p = p[which(p<xmax & p>0)]

  pdf(file=filename)
  plotOverlappingHist(a=bins,b=p,breaks=breaks,xlim=c(0,xmax))
  abline(v=rdo@binParams$gainThresh,col="red")
  abline(v=rdo@binParams$lossThresh,col="red")
  dev.off()
}


##--------------------------------------------------
## plot three actual and expected histograms 
## usually used with raw read depth, post mapability
## correction, and post gc content correction
##
tripDist <- function(xmax=NULL,filename="output/hist.pdf",windSize=20, rdo2, rdo3, rdo4){
  pdf(file=filename, width=12,height=4)
  par(mfcol=c(1,3))
  
  for(rdo in c(rdo2,rdo3,rdo4)){
    bins=c()
    for(i in 1:length(names(rdo@chrs))){
      bins = append(bins,rdo@chrs[[names(rdo@chrs)[i]]]$rd)
    }

    if(is.null(xmax)){
      xmax=rdo2@binParams$gainThresh*2 ##max(bins,na.rm=TRUE) ##bins[round(length(bins)*0.999)]
    }
    len=length(bins)
    e=rdo@entrypoints
    breaks=seq(0,xmax,windSize)
    oDisp=rdo@params["overDispersion"]
    
    med=rdo@binParams$med
    
    p1 = rpois.od(len*rdo@binParams$hapPerc,med/2,oDisp)
    p2=rpois.od(len*rdo@binParams$dipPerc,med,oDisp)
    p3 = rpois.od(len*rdo@binParams$tripPerc,(med*(3/2)),oDisp)
    
    p=append(append(p1,p2),p3)
    
    bins = bins[which(bins<xmax & bins>0)]
    p = p[which(p<xmax & p>0)]    

    plotOverlappingHist(a=bins,b=p,breaks=breaks)
    abline(v=rdo@binParams$gainThresh,col="red")
    abline(v=rdo@binParams$lossThresh,col="red")
  }

  dev.off()
}


##-----------------------------------------------------------
## some simple output functions
##
writeThresholds <- function(rdo){
  a1=data.frame(gainThresh=(rdo@binParams$gainThresh/rdo@binParams$med)*2)
  a2=data.frame(lossThresh=(rdo@binParams$lossThresh/rdo@binParams$med)*2)
  a3=data.frame(binSize=rdo@binParams$binSize)

  write.table(t(cbind(a1,a2,a3)),file="output/thresholds.dat",sep="\t",quote=F,row.names=T,col.names=F)
}

writeSegs <- function(segs){
  write.table(segs, file="output/segs.dat", sep="\t", quote=F, row.names=F, col.names=F)
}

writeAlts <- function(segs,rdo){
  write.table(getAlts(segs,rdo), file="output/alts.dat", sep="\t", quote=F, row.names=F, col.names=F)
}


##------------------------------------------------------------
## Code for pulling down the annotation files 
## and sticking them in the right place

getAnnotations <- function(readLength, sex, genome="hg18", bs=FALSE){
  if(sex!="male" & sex!="female" & sex!="autosomes"){
    print("'sex' must be either \"male\", \"female\", or \"autosomes\"")
    return(0)
  }

  
  dlAnnFile <- function(url,outfile){
    download.file(url, outfile, quiet=TRUE)
    if(file.exists(outfile)){
      print(paste(url,"downloaded successfully"))
      return(1)
    } else {
      print(paste("Oops! Couldn't fetch ",url))
      print("Either you specified a read length or genome build that we don't have annotation")
      print("files prepared for, or your network connection isn't working.")
      print("Check the documentation at http://github.com/chrisamiller/readDepth/ for more information")
      return(0)
    }
  }

  entryurl=paste("https://xfer.genome.wustl.edu/gxfer1/project/cancer-genomics/readDepth/entrypoints.",genome,".",sex,sep="")
  entryfile=paste("annotations/entrypoints")    
  
  if(bs){ #bisulfite
    mapurl = paste("https://xfer.genome.wustl.edu/gxfer1/project/cancer-genomics/readDepth/mapability.bs.readLength",readLength,".",genome,".tar",sep="")
    mapfile = paste("annotations/mapability.bs.readLength",readLength,".",genome,".tar",sep="")
    gcurl = paste("https://xfer.genome.wustl.edu/gxfer1/project/cancer-genomics/readDepth/gcWinds.bs.readLength",readLength,".",genome,".tar",sep="")
    gcfile = paste("annotations/gcWinds.bs.readLength",readLength,".",genome,".tar",sep="")    
  } else { #normal
    mapurl = paste("https://xfer.genome.wustl.edu/gxfer1/project/cancer-genomics/readDepth/mapability.readLength",readLength,".",genome,".tar",sep="")
    mapfile = paste("annotations/mapability.readLength",readLength,".",genome,".tar",sep="")
    gcurl = paste("https://xfer.genome.wustl.edu/gxfer1/project/cancer-genomics/readDepth/gcWinds.readLength",readLength,".",genome,".tar",sep="")
    gcfile = paste("annotations/gcWinds.readLength",readLength,".",genome,".tar",sep="")    
  }

  #download the files, untar them
  if(dlAnnFile(mapurl, mapfile)){
    system(paste("tar -xf",mapfile,"-C annotations/"))
  }
  
  if(dlAnnFile(gcurl, gcfile)){
    system(paste("tar -xf",gcfile,"-C annotations/"))
  }

  dlAnnFile(entryurl, entryfile)
#    system(paste("tar -xf",entryfile,"-C annotations/"))
#  }
}
chrisamiller/readDepth documentation built on June 16, 2021, 2:05 p.m.