#NCIS (Normalization for ChIP-Seq) estimates normalizing factor between a ChIP sample and a control/input sample
#input parameters:
#chip.data ChIP data.
#input.data control data.
#data.type "MCS", "AlignedRead", "BED", "BAM"
#frag.len average fragment length. Default 200 bp.
#min.binsize minimum of binsize to search.
#max.binsize maximum of binsize to search.
#binsize.shift the threshold of binsize after which the normalization factor is computed as the average of two estimates, one on regular bins and the other on bins shifed half binsize.
#min.stop.binsize minimum of binsize to use (stop).
#chr.vec vector of chromosomes in the data. Only reads in chr.vec are considered for normalization purpose.
#chr.len.vec vector of chromosome lengths corresponding to chr.vec
#contact: kliang@stat.wisc.edu
#last modified: 2012.05.29
#' Estimates normalizing factor between a ChIP sample and a control/input sample
#'
#' @param chip.data ChIP data. For BAM file format, it is an string to represent bam file path
#' @param input.data control data. For BAM file format, it is an string to represent bam file path
#' @param data.type "MCS", "AlignedRead", "BED" or "BAM".
#' @param frag.len average fragment length. Default 200 bp.
#' @param min.binsize integer minimum of binsize to search.
#' @param max.binsize integer maximum of binsize to search.
#' @param binsize.shift integer the threshold of binsize after which the normalization factor is computed
#' as the average of two estimates, one on regular bins and the other on bins shifed half binsize.
#' @param min.stop.binsize integer minimum of binsize to use (stop).
#' @param chr.vec vector of chromosomes in the data. Only reads in chr.vec are considered for normalization purpose.
#' @param chr.len.vec vector of chromosome lengths corresponding to chr.vec
#' @param quant quantile to start the searching for total threshold. Default 0.75.
#' @param removeUN_Random_Chr boolean, default is FALSE, whether to remove unknown or random chromosomes
#' @details data.type
#' {MCS} Minimum ChIP-Seq format. data.frame with fields: chr (factor), pos (integer) and strand
#' (factor, "+" and "-"). pos is 5' location. This is different from eland default which use 3' location for reverse strand.
#' {AlignedRead} from Bioconductor ShortRead package (with support of commonly used formats,
#' including Eland, MAQ, Bowtie, SOAP and BAM).
#' {BED} with at least first 6 fields (chrom, start, end, name, score and strand),
#' http://genome.ucsc.edu/FAQ/FAQformat.html#format1.
#' {BAM} for bam files with unique reads
#'
#' @return list
#' - {est }{the estimated normalizing factor.}
#' - {binsize.est }{the estimated binsize.}
#' - {r.seq.depth}{sequencing depth ratio.}
#' - {pi0}{the estimated proportion of background reads among ChIP sample.}
#'
#' @references Liang, K and Keles, S (2012). Normalization of ChIP-seq data with control, to appear.
#'
#' @examples
#' data("SEG3")
#' res <- NCIS(chip.data, input.data, data.type="MCS")
#' res
#'
#' NCIS(chip.data, input.data, data.type = c("MCS", "BED", "AlignedRead", "BAM"),
#' frag.len = 200, min.binsize = 100, max.binsize = 20000, binsize.shift = 100,
#' min.stop.binsize = 100, chr.vec = NULL, chr.len.vec = NULL, quant=0.75)
#'
NCIS <- function(chip.data, input.data, data.type=c("MCS", "BED", "AlignedRead", "BAM"),
frag.len=200, min.binsize=100, max.binsize=20000,
binsize.shift=100, min.stop.binsize=100,
chr.vec=NULL, chr.len.vec=NULL, quant=0.75,
removeUN_Random_Chr = FALSE){
if(data.type=="MCS"){
chip <- read.MCS(chip.data)
input <- read.MCS(input.data)
}else if(data.type=="AlignedRead"){
require(ShortRead)
chip <- read.AlignedRead(chip.data)
input <- read.AlignedRead(input.data)
}else if(data.type=="BED"){
chip <- read.BED(chip.data)
input <- read.BED(input.data)
}else if(data.type=="BAM"){
library(GenomicAlignments)
chip <- read.BAM(chip.data, removeUN_Random_Chr = removeUN_Random_Chr)
input <- read.BAM(input.data, removeUN_Random_Chr = removeUN_Random_Chr)
}else{
stop("Unknown data format: type can only be 'NCIS', 'BED' or 'AlignedRead'")
}
shift.size <- round(frag.len/2)
NCIS.internal(chip.pos = chip,input.pos= input, shift.size=shift.size,
min.binsize=min.binsize, max.binsize=max.binsize,
binsize.shift= binsize.shift, min.stop.binsize=min.stop.binsize,
chr.vec=chr.vec, chr.len.vec=chr.len.vec, quant=quant)
}
#NCIS takes chip.pos and input.pos
NCIS.internal <- function(chip.pos, input.pos, shift.size=100, min.binsize=100,
max.binsize=20000, binsize.shift=100,
min.stop.binsize=100,
chr.vec=NULL, chr.len.vec=NULL, quant=0.75){
if(is.null(chr.vec)){
chip.name <- names(chip.pos)
input.name <- names(input.pos)
chr.vec <- intersect(chip.name, input.name)
if(length(chr.vec)<length(chip.name)){
cat("Control sample doesn't have chromosome", setdiff(chip.name, chr.vec), "which are in ChIP sample. These chromosomes are ignored.\n")
}
if(length(chr.vec)<length(input.name)){
cat("ChIP sample doesn't have chromosome", setdiff(chip.name, chr.vec), "which are in control sample. These chromosomes are ignored.\n")
}
}
nchip <- sum(sapply(chr.vec, function(x) {sum(sapply(chip.pos[[x]], length))}) )
ninput <- sum(sapply(chr.vec, function(x) {sum(sapply(input.pos[[x]], length))}) )
r.seq.depth <- nchip/ninput
sizevec <- rep(c(1,2,5), times=3)*10^rep(2:4, each=3)
sizevec <- sizevec[sizevec <= max.binsize]
sizevec <- sizevec[sizevec >= min.binsize]
norm.est <- rep(1000, length(sizevec))
#names(norm.est) <- sizevec
if(!is.null(chr.len.vec) & length(setdiff(chr.vec, names(chr.len.vec)))==0){
chr.end.max <- chr.len.vec[chr.vec]
}else{
chr.end.max <- sapply(chr.vec, function(chr) max( c(max(chip.pos[[chr]][["+"]]), max(chip.pos[[chr]][["-"]]),
max(input.pos[[chr]][["+"]]), max(input.pos[[chr]][["-"]])) ))
#names(chr.end.max) <- chr.vec
}
binsize.est <- -1
for(si in 1:length(sizevec)){
binsize <- sizevec[si]
bindata <- bin.data(chip.pos, input.pos, binsize, shift.size=shift.size,
chr.vec=chr.vec, chr.end.max=chr.end.max)
#length(bindata$chip)
res <- est.norm.quant.search(chipD=bindata$chip, inputD=bindata$input, quant=quant)
if(binsize < binsize.shift){
norm.est[si] <- res
}else{
#run it twice, 2nd time shift half binsize
bindata <- bin.data(chip.pos, input.pos, binsize, shift.size=shift.size, shift.half.size=TRUE, chr.vec=chr.vec, chr.end.max=chr.end.max)
res2 <- est.norm.quant.search(chipD = bindata$chip, inputD=bindata$input, quant=quant)
norm.est[si] <- (res+res2)/2
}#end else
#stopping criteria
if(si>1 & binsize.est<0){
if(norm.est[si]>=norm.est[si-1]){
est <- norm.est[si-1]
binsize.est <- sizevec[si-1]
}
if(si==length(sizevec) & binsize.est<0){ #the end of binsize, no converge yet
est <- norm.est[si]
binsize.est <- sizevec[si]
}
} #end if(si>1 & binsize.est<0)
if(binsize.est>0 & binsize>=min.stop.binsize){
break
}
} #end for(si in 1:length(sizevec))
return(list(est=est, binsize.est=binsize.est,
r.seq.depth=r.seq.depth, pi0=est/r.seq.depth))
}
#data is dataframe with fields: chr(factor), pos(integer) and strand(factor, "+" and "-")
#pos is 5' location; this is different from eland default which use 3' location for reverse strand.
read.MCS <- function(data){
if(is.data.frame(data)){
#check data has required field
if(!setequal(intersect(c("chr", "pos", "strand"), colnames(data)), c("chr", "pos", "strand"))) stop("MCS format need to be a data.frame with fields: chr, pos and strand")
res <- split(data[, c("pos", "strand")], data$chr, drop = TRUE)
res <- lapply(res, function(x) split(x[, "pos"], x$strand, drop = TRUE))
return(res)
}else{
if(!setequal(names(data[[1]]), c("+", "-"))) stop("MCS list format need to be a list of chromosomes, each of which is a list of two strands.")
return(data)
}
}
#BED file should have at least first 6 fields (chrom, start, end, name, score and strand), see
#http://genome.ucsc.edu/FAQ/FAQformat.html#format1 for more details
#chip <- read.BED("BED.file")
read.BED <- function(bed.file){
temp <- read.delim(bed.file,
header=FALSE, row.names=NULL)
if(ncol(temp)<6) stop("BED file should has at least 6 fields!")
colnames(temp)[1:6] <- c("chr", "pos", "posEnd", "t1", "t2", "strand")
index <- (1:nrow(temp))[temp$strand=="-"]
tt <- temp[index[1],]
if(tt$pos < tt$posEnd){
temp$pos[index] <- temp$posEnd[index]
}
res <- split(temp[, c("pos", "strand")], temp$chr, drop = TRUE)
res <- lapply(res, function(x) split(x[, "pos"], x$strand, drop = TRUE))
return(res)
}
#chip <- read.AlignedRead("AlignedRead.object")
read.AlignedRead <-
function(aln){
if(is(aln, "AlignedRead")){
res <- split(data.frame(
pos= position(aln)+(strand(aln)=="-")*(width(aln)-1),
strand=factor(strand(aln))),
chromosome(aln), drop = TRUE)
res <- res[names(res)!=""]
res <- lapply(res, function(x) split(x[, "pos"], x$strand, drop = TRUE))
return(res)
}else{
stop("Need an AlignedRead object.")
}
}
#chip <- read.BAM("bam file path")
read.BAM <- function(aln, removeUN_Random_Chr = FALSE){
if(is.character(aln)){
gal1 <- GenomicAlignments::readGAlignments(aln)
if(removeUN_Random_Chr){
gal1 <- GenomeInfoDb::keepStandardChromosomes(gal1, pruning.mode="coarse")
}
pos1 = GenomicRanges::start(gal1) +
(GenomicRanges::strand(gal1)=="-") * (GenomicRanges::width(gal1)-1)
res <- split(data.frame(pos=pos1, strand=factor(GenomicRanges::strand(gal1))),
seqnames(gal1), drop = TRUE)
res <- res[names(res)!=""]
res <- lapply(res, function(x) split(x[, "pos"], x$strand, drop = TRUE))
return(res)
}else{
stop("Need a bam file path")
}
}
bin.data <- function(chip.pos, input.pos, binsize,
shift.size=100,
shift.half.size=FALSE,
zero.filter=TRUE,
by.strand=FALSE,
chr.vec=NULL, chr.end.max=NULL,
by.chr=FALSE){
if(is.null(chr.vec)){
chip.name <- names(chip.pos)
input.name <- names(input.pos)
chr.vec <- intersect(chip.name, input.name)
if(length(chr.vec)<length(chip.name)){
cat("Control sample doesn't have chromosome", setdiff(chip.name, chr.vec), "which are in ChIP sample. These chromosomes are ignored.\n")
}
if(length(chr.vec)<length(input.name)){
cat("ChIP sample doesn't have chromosome", setdiff(input.name, chr.vec), "which are in control sample. These chromosomes are ignored.\n")
}
}
if(is.null(chr.end.max)){
chr.end.max <- sapply(chr.vec, function(chr) max( c(max(chip.pos[[chr]][["+"]]), max(chip.pos[[chr]][["-"]]),
max(input.pos[[chr]][["+"]]), max(input.pos[[chr]][["-"]])) ))
}
chr.len <- ceiling((chr.end.max+shift.size)/binsize)
chip.f <- list()
chip.r <- list()
input.f <- list()
input.r <- list()
if(!by.strand){
chip <- list()
input <- list()
}
for(chr in chr.vec){
if(shift.half.size){
bk <- c(0, seq(from=round(binsize/2), by=binsize, length.out=chr.len[[chr]]+1))
}else{
bk <- seq(from=0, by=binsize, length.out=chr.len[[chr]]+1)
}
bk.f <- bk-shift.size
bk.r <- c(0, bk+shift.size)
chip.f[[chr]] <- hist(chip.pos[[chr]][["+"]], breaks=bk.f, plot = FALSE)$counts
chip.r[[chr]] <- hist(chip.pos[[chr]][["-"]], breaks=bk.r, plot = FALSE)$counts[-1]
input.f[[chr]] <- hist(input.pos[[chr]][["+"]], breaks=bk.f, plot = FALSE)$counts
input.r[[chr]] <- hist(input.pos[[chr]][["-"]], breaks=bk.r, plot = FALSE)$counts[-1]
if(zero.filter){
ind <- chip.f[[chr]]+chip.r[[chr]]+input.f[[chr]]+input.r[[chr]]>0
chip.f[[chr]] <- chip.f[[chr]][ind]
chip.r[[chr]] <- chip.r[[chr]][ind]
input.f[[chr]] <- input.f[[chr]][ind]
input.r[[chr]] <- input.r[[chr]][ind]
}
if(!by.strand){
chip[[chr]] <- chip.f[[chr]]+chip.r[[chr]]
input[[chr]] <- input.f[[chr]]+input.r[[chr]]
}
}
if(by.strand){
if(by.chr){
return(list(chip.f=chip.f, chip.r=chip.r, input.f=input.f, input.r=input.r))
}else{
return(list(chip.f=unlist(chip.f, use.names = FALSE), chip.r=unlist(chip.r, use.names = FALSE),
input.f=unlist(input.f, use.names = FALSE), input.r=unlist(input.r, use.names = FALSE)))
}
}else{
if(by.chr){
return(list(chip=chip, input=input))
}else{
return(list(chip=unlist(chip, use.names = FALSE), input=unlist(input, use.names = FALSE)))
}
}
}
est.norm.quant.search <- function(chipD, inputD, quant=0.75){
total <- chipD + inputD
tbl <- table(total)
total.count <- as.integer(names(tbl))
cum.bc <- cumsum(tbl)
cum.prop <- cum.bc/length(total)
if(1==0){
plot(chipD, inputD)
plot(chipD, total)
length(total.count)
length(cum.prop)
plot(total.count, cum.prop)
}
if(cum.prop[1] >= quant){
threshold <- 1
}else{
#largest total before quant
threshold <- max(total.count[cum.prop < quant])
}
ind <- total <= threshold
chip.sum.low <- sum(chipD[ind])
input.sum.low <- sum(inputD[ind])
bin.count.low <- cum.bc[which(total.count==threshold)]
chipD <- chipD[!ind]
inputD <- inputD[!ind]
od <- order(chipD + inputD)
chipD <- chipD[od]
inputD <- inputD[od]
#start after threshold
cum.bc.high <- cum.bc[total.count>threshold]-bin.count.low
all.cum.chip <- cumsum(chipD)
all.cum.input <- cumsum(inputD)
cum.chip <- c(0, all.cum.chip[cum.bc.high])
cum.input <- c(0, all.cum.input[cum.bc.high])
cum.ratio <- (chip.sum.low+cum.chip)/(input.sum.low+cum.input)
if(sum(diff(cum.ratio) >= 0) ==0) stop("No increase in cum.ratio, binding signal missing!")
index <- min((2:length(cum.ratio))[diff(cum.ratio) >= 0])
return(cum.ratio[index])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.