DiffChIPL: ChIP-seq differential analysis (DiffChIPL)

Documented in NCIS

#NCIS (Normalization for ChIP-Seq) estimates normalizing factor between a ChIP sample and a control/input sample
#input parameters:
#chip.data	 ChIP data.
#input.data	 control data.
#data.type	 "MCS", "AlignedRead", "BED", "BAM"
#frag.len	 average fragment length. Default 200 bp.
#min.binsize	 minimum of binsize to search.
#max.binsize	 maximum of binsize to search.
#binsize.shift	 the threshold of binsize after which the normalization factor is computed as the average of two estimates, one on regular bins and the other on bins shifed half binsize.
#min.stop.binsize	 minimum of binsize to use (stop).
#chr.vec	 vector of chromosomes in the data. Only reads in chr.vec are considered for normalization purpose.
#chr.len.vec	 vector of chromosome lengths corresponding to chr.vec
#contact: kliang@stat.wisc.edu
#last modified: 2012.05.29

#' Estimates normalizing factor between a ChIP sample and a control/input sample
#'
#' @param chip.data ChIP data. For BAM file format, it is an string to represent bam file path
#' @param input.data control data. For BAM file format, it is an string to represent bam file path
#' @param data.type "MCS", "AlignedRead", "BED" or "BAM".
#' @param frag.len  average fragment length. Default 200 bp.
#' @param min.binsize integer minimum of binsize to search.
#' @param max.binsize integer maximum of binsize to search.
#' @param binsize.shift integer the threshold of binsize after which the normalization factor is computed
#'                     as the average of two estimates, one on regular bins and the other on bins shifed half binsize.
#' @param min.stop.binsize integer minimum of binsize to use (stop).
#' @param chr.vec vector of chromosomes in the data. Only reads in chr.vec are considered for normalization purpose.
#' @param chr.len.vec vector of chromosome lengths corresponding to chr.vec
#' @param quant quantile to start the searching for total threshold. Default 0.75.
#' @param removeUN_Random_Chr boolean, default is FALSE, whether to remove unknown or random chromosomes
#' @details data.type
#'     {MCS} Minimum ChIP-Seq format. data.frame with fields: chr (factor), pos (integer) and strand
#'     (factor, "+" and "-"). pos is 5' location. This is different from eland default which use 3' location for reverse strand.
#'     {AlignedRead} from Bioconductor ShortRead package (with support of commonly used formats,
#'     including Eland, MAQ, Bowtie, SOAP and BAM).
#'     {BED} with at least first 6 fields (chrom, start, end, name, score and strand),
#'      http://genome.ucsc.edu/FAQ/FAQformat.html#format1.
#'     {BAM} for bam files with unique reads
#'
#' @return list
#'         - {est }{the estimated normalizing factor.}
#'         - {binsize.est }{the estimated binsize.}
#'         - {r.seq.depth}{sequencing depth ratio.}
#'         - {pi0}{the estimated proportion of background reads among ChIP sample.}
#'
#' @references Liang, K and Keles, S (2012). Normalization of ChIP-seq data with control, to appear.
#'
#' @examples
#' data("SEG3")
#' res <- NCIS(chip.data, input.data, data.type="MCS")
#' res
#'
#' NCIS(chip.data, input.data, data.type = c("MCS", "BED", "AlignedRead", "BAM"),
#'    frag.len = 200, min.binsize = 100, max.binsize = 20000, binsize.shift = 100,
#'    min.stop.binsize = 100, chr.vec = NULL, chr.len.vec = NULL, quant=0.75)
#'
NCIS <- function(chip.data, input.data, data.type=c("MCS", "BED", "AlignedRead", "BAM"),
                 frag.len=200, min.binsize=100, max.binsize=20000,
                 binsize.shift=100, min.stop.binsize=100,
                 chr.vec=NULL, chr.len.vec=NULL, quant=0.75,
                 removeUN_Random_Chr = FALSE){
  if(data.type=="MCS"){
    chip <- read.MCS(chip.data)
    input <- read.MCS(input.data)
  }else if(data.type=="AlignedRead"){
    require(ShortRead)
    chip <- read.AlignedRead(chip.data)
    input <- read.AlignedRead(input.data)
  }else if(data.type=="BED"){
    chip <- read.BED(chip.data)
    input <- read.BED(input.data)
  }else if(data.type=="BAM"){
    library(GenomicAlignments)
    chip <- read.BAM(chip.data, removeUN_Random_Chr = removeUN_Random_Chr)
    input <- read.BAM(input.data, removeUN_Random_Chr = removeUN_Random_Chr)
  }else{
    stop("Unknown data format: type can only be 'NCIS', 'BED' or 'AlignedRead'")
  }
  shift.size <- round(frag.len/2)
  NCIS.internal(chip.pos = chip,input.pos= input, shift.size=shift.size,
                min.binsize=min.binsize, max.binsize=max.binsize,
                binsize.shift= binsize.shift, min.stop.binsize=min.stop.binsize,
                chr.vec=chr.vec, chr.len.vec=chr.len.vec, quant=quant)
}

#NCIS takes chip.pos and input.pos
NCIS.internal <- function(chip.pos, input.pos, shift.size=100, min.binsize=100,
                          max.binsize=20000, binsize.shift=100,
                          min.stop.binsize=100,
                          chr.vec=NULL, chr.len.vec=NULL, quant=0.75){

  if(is.null(chr.vec)){
    chip.name <- names(chip.pos)
    input.name <- names(input.pos)
    chr.vec <- intersect(chip.name, input.name)
    if(length(chr.vec)<length(chip.name)){
      cat("Control sample doesn't have chromosome", setdiff(chip.name, chr.vec), "which are in ChIP sample. These chromosomes are ignored.\n")
    }
    if(length(chr.vec)<length(input.name)){
      cat("ChIP sample doesn't have chromosome", setdiff(chip.name, chr.vec), "which are in control sample. These chromosomes are ignored.\n")
    }
  }

  nchip <- sum(sapply(chr.vec, function(x) {sum(sapply(chip.pos[[x]], length))}) )
  ninput <- sum(sapply(chr.vec, function(x) {sum(sapply(input.pos[[x]], length))}) )
  r.seq.depth <- nchip/ninput

  sizevec <- rep(c(1,2,5), times=3)*10^rep(2:4, each=3)
  sizevec <- sizevec[sizevec <= max.binsize]
  sizevec <- sizevec[sizevec >= min.binsize]
  norm.est <- rep(1000, length(sizevec))
  #names(norm.est) <- sizevec

  if(!is.null(chr.len.vec) & length(setdiff(chr.vec, names(chr.len.vec)))==0){
    chr.end.max <- chr.len.vec[chr.vec]
  }else{
    chr.end.max <- sapply(chr.vec, function(chr) max( c(max(chip.pos[[chr]][["+"]]), max(chip.pos[[chr]][["-"]]),
                                                        max(input.pos[[chr]][["+"]]), max(input.pos[[chr]][["-"]])) ))
    #names(chr.end.max) <- chr.vec
  }

  binsize.est <- -1

  for(si in 1:length(sizevec)){
    binsize <- sizevec[si]

    bindata <- bin.data(chip.pos, input.pos, binsize, shift.size=shift.size,
                        chr.vec=chr.vec, chr.end.max=chr.end.max)
    #length(bindata$chip)
    res <- est.norm.quant.search(chipD=bindata$chip, inputD=bindata$input, quant=quant)

    if(binsize < binsize.shift){
      norm.est[si] <- res
    }else{
      #run it twice, 2nd time shift half binsize
      bindata <- bin.data(chip.pos, input.pos, binsize, shift.size=shift.size, shift.half.size=TRUE, chr.vec=chr.vec, chr.end.max=chr.end.max)

      res2 <- est.norm.quant.search(chipD = bindata$chip, inputD=bindata$input, quant=quant)

      norm.est[si] <- (res+res2)/2
    }#end else

    #stopping criteria
    if(si>1 & binsize.est<0){
      if(norm.est[si]>=norm.est[si-1]){
        est <- norm.est[si-1]
        binsize.est <- sizevec[si-1]
      }
      if(si==length(sizevec) & binsize.est<0){ #the end of binsize, no converge yet
        est <- norm.est[si]
        binsize.est <- sizevec[si]
      }
    } #end if(si>1 & binsize.est<0)
    if(binsize.est>0 & binsize>=min.stop.binsize){
      break
    }
  } #end for(si in 1:length(sizevec))

  return(list(est=est, binsize.est=binsize.est,
              r.seq.depth=r.seq.depth, pi0=est/r.seq.depth))
}


#data is dataframe with fields: chr(factor), pos(integer) and strand(factor, "+" and "-")
#pos is 5' location; this is different from eland default which use 3' location for reverse strand.
read.MCS <- function(data){
  if(is.data.frame(data)){
    #check data has required field
    if(!setequal(intersect(c("chr", "pos", "strand"), colnames(data)), c("chr", "pos", "strand"))) stop("MCS format need to be a data.frame with fields: chr, pos and strand")
    res <- split(data[, c("pos", "strand")], data$chr, drop = TRUE)
    res <- lapply(res, function(x) split(x[, "pos"], x$strand, drop = TRUE))
    return(res)
  }else{
    if(!setequal(names(data[[1]]), c("+", "-"))) stop("MCS list format need to be a list of chromosomes, each of which is a list of two strands.")
    return(data)
  }
}

#BED file should have at least first 6 fields (chrom, start, end, name, score and strand), see
#http://genome.ucsc.edu/FAQ/FAQformat.html#format1 for more details
#chip <- read.BED("BED.file")
read.BED <- function(bed.file){
  temp <- read.delim(bed.file,
                     header=FALSE, row.names=NULL)
  if(ncol(temp)<6) stop("BED file should has at least 6 fields!")
  colnames(temp)[1:6] <- c("chr", "pos", "posEnd", "t1", "t2", "strand")
  index <- (1:nrow(temp))[temp$strand=="-"]
  tt <- temp[index[1],]
  if(tt$pos < tt$posEnd){
    temp$pos[index] <- temp$posEnd[index]
  }
  res <- split(temp[, c("pos", "strand")], temp$chr, drop = TRUE)
  res <- lapply(res, function(x) split(x[, "pos"], x$strand, drop = TRUE))
  return(res)
}


#chip <- read.AlignedRead("AlignedRead.object")
read.AlignedRead <-
  function(aln){
    if(is(aln, "AlignedRead")){
      res <- split(data.frame(
                        pos= position(aln)+(strand(aln)=="-")*(width(aln)-1),
                        strand=factor(strand(aln))), 
                        chromosome(aln), drop = TRUE)
      res <- res[names(res)!=""]
      res <- lapply(res, function(x) split(x[, "pos"], x$strand, drop = TRUE))
      return(res)
    }else{
      stop("Need an AlignedRead object.")
    }
  }

#chip <- read.BAM("bam file path")
read.BAM <- function(aln, removeUN_Random_Chr = FALSE){
  if(is.character(aln)){
    gal1 <- GenomicAlignments::readGAlignments(aln)
    if(removeUN_Random_Chr){
       gal1 <- GenomeInfoDb::keepStandardChromosomes(gal1, pruning.mode="coarse")
    }
    pos1 =  GenomicRanges::start(gal1) +
      (GenomicRanges::strand(gal1)=="-") * (GenomicRanges::width(gal1)-1)
    res <- split(data.frame(pos=pos1, strand=factor(GenomicRanges::strand(gal1))),
                 seqnames(gal1), drop = TRUE)
    res <- res[names(res)!=""]
    res <- lapply(res, function(x) split(x[, "pos"], x$strand, drop = TRUE))

    return(res)
  }else{
    stop("Need a bam file path")
  }
}


bin.data <- function(chip.pos, input.pos, binsize,
                     shift.size=100,
                     shift.half.size=FALSE,
                     zero.filter=TRUE,
                     by.strand=FALSE,
                     chr.vec=NULL, chr.end.max=NULL,
                     by.chr=FALSE){

  if(is.null(chr.vec)){
    chip.name <- names(chip.pos)
    input.name <- names(input.pos)
    chr.vec <- intersect(chip.name, input.name)
    if(length(chr.vec)<length(chip.name)){
      cat("Control sample doesn't have chromosome", setdiff(chip.name, chr.vec), "which are in ChIP sample. These chromosomes are ignored.\n")
    }
    if(length(chr.vec)<length(input.name)){
      cat("ChIP sample doesn't have chromosome", setdiff(input.name, chr.vec), "which are in control sample. These chromosomes are ignored.\n")
    }
  }
  if(is.null(chr.end.max)){
    chr.end.max <- sapply(chr.vec, function(chr) max( c(max(chip.pos[[chr]][["+"]]), max(chip.pos[[chr]][["-"]]),
                                                        max(input.pos[[chr]][["+"]]), max(input.pos[[chr]][["-"]])) ))
  }
  chr.len <- ceiling((chr.end.max+shift.size)/binsize)

  chip.f <- list()
  chip.r <- list()
  input.f <- list()
  input.r <- list()
  if(!by.strand){
    chip <- list()
    input <- list()
  }

  for(chr in chr.vec){
    if(shift.half.size){
      bk <- c(0, seq(from=round(binsize/2), by=binsize, length.out=chr.len[[chr]]+1))
    }else{
      bk <- seq(from=0, by=binsize, length.out=chr.len[[chr]]+1)
    }
    bk.f <- bk-shift.size
    bk.r <- c(0, bk+shift.size)
    chip.f[[chr]] <- hist(chip.pos[[chr]][["+"]], breaks=bk.f, plot = FALSE)$counts
    chip.r[[chr]] <- hist(chip.pos[[chr]][["-"]], breaks=bk.r, plot = FALSE)$counts[-1]
    input.f[[chr]] <- hist(input.pos[[chr]][["+"]], breaks=bk.f, plot = FALSE)$counts
    input.r[[chr]] <- hist(input.pos[[chr]][["-"]], breaks=bk.r, plot = FALSE)$counts[-1]
    if(zero.filter){
      ind <- chip.f[[chr]]+chip.r[[chr]]+input.f[[chr]]+input.r[[chr]]>0
      chip.f[[chr]] <- chip.f[[chr]][ind]
      chip.r[[chr]] <- chip.r[[chr]][ind]
      input.f[[chr]] <- input.f[[chr]][ind]
      input.r[[chr]] <- input.r[[chr]][ind]
    }
    if(!by.strand){
      chip[[chr]] <- chip.f[[chr]]+chip.r[[chr]]
      input[[chr]] <- input.f[[chr]]+input.r[[chr]]
    }
  }
  if(by.strand){
    if(by.chr){
      return(list(chip.f=chip.f, chip.r=chip.r, input.f=input.f, input.r=input.r))
    }else{
      return(list(chip.f=unlist(chip.f, use.names = FALSE), chip.r=unlist(chip.r, use.names = FALSE),
                  input.f=unlist(input.f, use.names = FALSE), input.r=unlist(input.r, use.names = FALSE)))
    }
  }else{
    if(by.chr){
      return(list(chip=chip, input=input))
    }else{
      return(list(chip=unlist(chip, use.names = FALSE), input=unlist(input, use.names = FALSE)))
    }
  }
}


est.norm.quant.search <- function(chipD, inputD, quant=0.75){
  total <- chipD + inputD
  tbl <- table(total)
  total.count <- as.integer(names(tbl))
  cum.bc <- cumsum(tbl)
  cum.prop <- cum.bc/length(total)
  if(1==0){
    plot(chipD, inputD)
    plot(chipD, total)
    length(total.count)
    length(cum.prop)
    plot(total.count, cum.prop)
  }
  if(cum.prop[1] >= quant){
    threshold <- 1
  }else{
    #largest total before quant
    threshold <- max(total.count[cum.prop < quant])
  }
  ind <- total <= threshold
  chip.sum.low <- sum(chipD[ind])
  input.sum.low <- sum(inputD[ind])
  bin.count.low <- cum.bc[which(total.count==threshold)]

  chipD <- chipD[!ind]
  inputD <- inputD[!ind]
  od <- order(chipD + inputD)
  chipD <- chipD[od]
  inputD <- inputD[od]

  #start after threshold
  cum.bc.high <- cum.bc[total.count>threshold]-bin.count.low

  all.cum.chip <- cumsum(chipD)
  all.cum.input <- cumsum(inputD)
  cum.chip <- c(0, all.cum.chip[cum.bc.high])
  cum.input <- c(0, all.cum.input[cum.bc.high])
  cum.ratio <- (chip.sum.low+cum.chip)/(input.sum.low+cum.input)

  if(sum(diff(cum.ratio) >= 0) ==0) stop("No increase in cum.ratio, binding signal missing!")
  index <- min((2:length(cum.ratio))[diff(cum.ratio) >= 0])
  return(cum.ratio[index])
}