#' @title ChIP-seq GC Effects Estimation
#'
#' @description
#' GC effects are estimated based on effective GC content and reads count
#' on genome-wide windows, using generalized linear mixture models. Genome
#' wide windows are randomly or supervised sampled with given proportions.
#' GC effects of background and foreground are estimated separately.
#'
#' @param coverage A list object returned by function \code{read5endCoverage}.
#'
#' @param bdwidth A non-negative integer vector with two elements
#' specifying ChIP-seq binding width and peak detection half window size.
#' Usually generated by function \code{bindWidth}. A bad estimation of
#' bdwidth results no meaning of downstream analysis.
#'
#' @param flank A non-negative integer specifying the flanking width of
#' ChIP-seq binding. This parameter provides the flexibility that reads
#' appear in flankings by decreased probabilities as increased distance
#' from binding region. This paramter helps to define effective GC
#' content calculation. Default is NULL, which means this paramater will
#' be calculated from \code{bdwidth}. However, if customized numbers
#' provided, there won't be recalucation for this parameter; instead, the
#' 2nd elements of \code{bdwidth} will be recalculated based on \code{flank}.
#'
#' @param plot A logical vector which, when TRUE (default), returns plots
#' of intermediate results.
#'
#' @param sampling A numeric vector with length 2. The first number specifies
#' the proportion of regions to be sampled for GC effects estimation.
#' The second number specifies the repeat times for sampling.
#' Default c(0.05,1) gives pretty robust estimation for human genome. However,
#' smaller genomes might need both higher proportion and more repeat times for
#' robust estimation.
#'
#' @param supervise A GRanges object specifying peak regions in the
#' studied data, such as peaks called by peak callers, e.g. MACS & SPP. These
#' peak regions provide supervised window sampling for both mixtures in the
#' generalized linear model. Default no supervising. Or, if provided peak
#' regions have too few covered windows, supervised sampling will be replaced
#' by random sampling automatically.
#'
#' @param gcrange A non-negative numeric vector with length 2. This vector
#' sets the range of GC content to filter regions for GC effect estimation.
#' For human, most regions have GC content between 0.3 and 0.8, which is
#' set as the default. Other regions with GC content beyond this range
#' will be ignored. This range is critical when very few foreground regions
#' are selected for mixture model fitting, since outliers could drive the
#' regression lines. Thus, if possible, first make a scatter plot between
#' counts and GC content to decide this parameter. Alternatively,
#' select a narrower range, e.g. c(0.35,0.7), to aviod outlier effects from
#' both high and low GC-content regions.
#'
#' @param emtrace A logical vector which, when TRUE (default), allows to
#' print the trace of log likelihood changes in EM iterations.
#'
#' @param model A character specifying the distribution model to be used in
#' generalized linear model fitting. The default is negative
#' binomial(\code{nbinom}), while \code{poisson} is also supported currently.
#' Based on our tests of multiple datasets, mostly poisson is a very good
#' approximation of negative binomial, and provides much faster model fitting.
#'
#' @param mu0 A non-negative numeric initiating read count signals for
#' background regions. This is treated as the starting value of background mean
#' for poisson/nbinom fitting. Default is 1.
#'
#' @param mu1 A non-negative numeric initiating read count signals for
#' foreground regions. This is treated as the starting value of foreground mean
#' for poisson/nbinom fitting, Default is 50.
#'
#' @param theta0 A non-negative numeric initiating the shape parameter of
#' negative binomial model for background regions. For more detail, see
#' theta in \code{\link[MASS]{glm.nb}} function.
#'
#' @param theta1 A non-negative numeric initiating the shape parameter of
#' negative binomial model for foreground regions. For more detail, see
#' theta in \code{\link[MASS]{glm.nb}} function.
#'
#' @param p A non-negative numeric specifying the proportion of foreground
#' regions in all estimated regions. This is treated as a starting value for
#' EM algorithm. Default is 0.02.
#'
#' @param converge A non-negative numeric specifying the condition of EM
#' algorithm termination. EM algorithm stops when the ratio of log likelihood
#' increment to whole log likelihood is less or equivalent to
#' \code{converge}.
#'
#' @param genome A \link[BSgenome]{BSgenome} object containing the sequences
#' of the reference genome that was used to align the reads, or the name of
#' this reference genome specified in a way that is accepted by the
#' \code{\link[BSgenome]{getBSgenome}} function defined in the \pkg{BSgenome}
#' software package. In that case the corresponding BSgenome data package
#' needs to be already installed (see \code{?\link[BSgenome]{getBSgenome}} in
#' the \pkg{BSgenome} package for the details).
#'
#' @param gctype A character vector specifying choice of method to calculate
#' effective GC content. Default \code{ladder} is based on uniformed fragment
#' distribution. A more smoother method based on tricube assumption is also
#' allowed. However, tricube should be not used if estimated peak half size
#' is 3 times or more larger than estimated bind width.
#'
#' @return A list of objects
#' \item{gc}{The GC contents at which GC effects are estimated.}
#' \item{mu0}{Predicted background signals at GC content \code{gc}.}
#' \item{mu1}{Predicted foreground signals at GC content \code{gc} .}
#' \item{mu0med0}{Median of predicted background signals.}
#' \item{mu1med1}{Median of predicted foreground signals.}
#' \item{mu0med1}{Median of predicted background signals at GC content of
#' foreground windows.}
#' \item{mu1med0}{Median of predicted foreground signals at GC content of
#' background windows.}
#'
#' @import S4Vectors
#' @import IRanges
#' @import GenomicRanges
#' @import Biostrings
#' @import MASS
#' @importFrom BSgenome getBSgenome
#' @importFrom BSgenome getSeq
#' @importFrom splines ns
#' @importFrom grDevices colorRampPalette
#' @importFrom graphics layout
#' @importFrom graphics plot
#' @importFrom graphics axis
#' @importFrom graphics lines
#' @importFrom stats dpois
#' @importFrom stats dnbinom
#' @importFrom stats glm
#' @importFrom stats predict
#'
#' @export
#' @examples
#' bam <- system.file("extdata", "chipseq.bam", package="gcapc")
#' cov <- read5endCoverage(bam)
#' bdw <- bindWidth(cov)
#' gcb <- gcEffects(cov, bdw, sampling = c(0.15,1))
gcEffects <- function(coverage,bdwidth,flank=NULL,plot=TRUE,
sampling=c(0.05,1),supervise=GRanges(),
gcrange=c(0.3,0.8),emtrace=TRUE,
model=c('nbinom','poisson'),
mu0=1,mu1=50,theta0=mu0,theta1=mu1,
p=0.02,converge=1e-3,
genome="hg19",gctype=c("ladder","tricube")){
### input sanity check
genome <- getBSgenome(genome)
model <- match.arg(model)
gctype <- match.arg(gctype)
bdw <- bdwidth[1]
halfbdw <- floor(bdw/2)
if((length(bdwidth)<2 & is.null(flank)) ||
(!is.null(flank) & length(bdwidth)<1))
stop("Parameter 'bdwidth' or 'flank' error.\n")
if(is.null(flank)){
pdwh <- bdwidth[2]
flank <- pdwh-bdw+halfbdw
}else{
pdwh <- flank+bdw-halfbdw
}
if(length(sampling)<2 || sampling[1]<=0 || sampling[1]>1 || sampling[2]<1)
stop("Parameter 'sampling' error.\n")
sampling[2] <- floor(sampling[2])
if(class(supervise)!="GRanges")
stop("Parameter 'supervise' error.\n")
if(sum(gcrange<0)>0 || sum(gcrange>1)>0 || sum(is.na(gcrange))>0)
stop("Parameter 'gcrange' error.\n")
if(mu0<=0 || mu1<=0 || mu0>=mu1)
stop("Parameter 'mu0' or 'mu1' error.\n")
if(model=='nbinom' && (theta1<=0 || theta0<=0))
stop("Parameter 'theta0' or 'theta1' error in nbinom model.\n")
if(p<=0 || p>=0.5)
stop("'p' must be in (0,0.5).\n")
if(converge<=0 || converge>0.1)
stop("'converge' must be in (0,0.1].\n")
if(gctype=="ladder"){
weight <- c(seq_len(flank),rep(flank+1,bdw),rev(seq_len(flank)))
weight <- weight/sum(weight)
}else if(gctype=="tricube"){
w <- flank+halfbdw
weight <- (1-abs(seq(-w,w)/w)^3)^3
weight <- weight/sum(weight)
}
### regions
cat("Starting to estimate GC effects\n")
seqs <- sapply(coverage$fwd,length)
seqs <- floor(seqs/bdw-10)*bdw
binl <- sapply(seqs, function(i) length(seq(1+bdw*10, i, bdw)))
starts <- unlist(lapply(seqs, function(i) seq(1+bdw*10, i, bdw)))
ends <- unlist(lapply(seqs, function(i) seq(bdw*11, i, bdw)))
chrs <- rep(names(seqs), times=binl)
region <- GRanges(chrs,IRanges(start=starts,end=ends))
rm(seqs,binl,starts,ends,chrs)
### sampling and counts
fo <- unique(subjectHits(findOverlaps(supervise,region)))
if(length(fo)<1000 || length(region)-length(fo) <1000){
cat("...... Too few/much ranges in 'supervise'.",
"Selecting random sampling\n")
sampidx <- sort(as.integer(sapply(seq_len(sampling[2]),function(x)
sample.int(length(region),floor(length(region)*sampling[1])))))
samptype <- 'random'
}else{
cat("...... Selecting supervised sampling\n")
num1 <- floor(length(fo) * sampling[1])
num2 <- floor((length(region)-length(fo)) * sampling[1])
num1 <- min(num1*ceiling(0.1/(num1/num2)),length(fo))
sampidx <- sort(as.integer(sapply(seq_len(sampling[2]),
function(x) c(sample(fo,num1),
sample(setdiff(seq_along(region),fo),num2)))))
samptype <- 'supervised'
}
region <- region[sampidx]
cat("......... Sampling",sampling[1]*100,"percent of regions by",
sampling[2], "times, total",length(region),"regions\n")
regionsp <- resize(split(region,seqnames(region)),pdwh)
cat("...... Counting reads\n")
rcfwd <- unlist(viewSums(Views(coverage$fwd,
ranges(shift(regionsp,-flank)))))
rcrev <- unlist(viewSums(Views(coverage$rev,
ranges(shift(regionsp,halfbdw)))))
rm(regionsp,sampidx,fo)
if(plot){
idxx <- sample.int(length(region),min(50000,length(region)))
plot(rcfwd[idxx],rcrev[idxx],
main=paste('RC: shifted',halfbdw,"; CORR:",
round(cor(rcfwd,rcrev),3),"; Flank:",flank),
ylab='Reverse strand',xlab='Forward strand')
rm(idxx)
}
### effective gc content
cat("...... Calculating GC content with flanking",flank,"\n")
rwidth <- width(region[1])
nr <- shift(resize(region,rwidth + flank*2),-flank)
seqs <- getSeq(genome,nr) # slow
gcpos <- startIndex(vmatchPattern("S", seqs, fixed="subject"))
gc <- round(sapply(gcpos,function(x) sum(weight[x])),3)
rm(rwidth,nr,seqs,gcpos,region)
### em algorithms
cat("...... Estimating GC effects\n")
gc <- rep(gc,2)
rc <- c(rcfwd,rcrev)
idx <- gc>=gcrange[1] & gc<=gcrange[2] & !is.na(rc) & !is.na(gc)
dat <- data.frame(y=rc[idx],gc=gc[idx])
if(model=='poisson'){
logp1 <- dpois(dat$y, lambda = mu1, log = TRUE)
logp0 <- dpois(dat$y, lambda = mu0, log = TRUE)
}else{
logp1 <- dnbinom(dat$y, size=theta1, mu=mu1, log = TRUE)
logp0 <- dnbinom(dat$y, size=theta0, mu=mu0, log = TRUE)
}
z <- 1/(1+exp(logp0-logp1)*(1-p)/p)
llf <- sum(z*(logp1+log(p)) + (1-z)*(logp0+log(1-p)))
llgap <- llf
i <- 0
rm(rcfwd,rcrev,rc,gc,idx)
while(abs(llgap) > (abs(llf) * converge) && i < 100){
p <- (2+sum(z))/(2*2+length(z))
dat1 <- dat[z>=0.5,]
dat0 <- dat[z<0.5,]
if(model=='poisson'){
lmns0 <- glm(y ~ ns(gc, df = 2), data=dat0, family="poisson")
lmns1 <- glm(y ~ ns(gc, df = 2), data=dat1, family="poisson")
predY0 <- predict(lmns0, data.frame(gc = dat$gc),type="response")
predY1 <- predict(lmns1, data.frame(gc = dat$gc),type="response")
logp1 <- dpois(dat$y, lambda = predY1, log = TRUE)
logp0 <- dpois(dat$y, lambda = predY0, log = TRUE)
}else{
# slow
lmns0 <- glm.nb(y ~ ns(gc, df = 2), data=dat0, init.theta=theta0)
lmns1 <- glm.nb(y ~ ns(gc, df = 2), data=dat1, init.theta=theta1)
predY0 <- predict(lmns0, data.frame(gc = dat$gc),type="response")
predY1 <- predict(lmns1, data.frame(gc = dat$gc),type="response")
theta1 <- lmns1$theta
theta0 <- lmns0$theta
logp1 <- dnbinom(dat$y, size=theta1, mu=predY1, log = TRUE)
logp0 <- dnbinom(dat$y, size=theta0, mu=predY0, log = TRUE)
}
z <- 1/(1+exp(logp0-logp1)*(1-p)/p)
if(sum(z>=0.5) < length(gc)*0.0005 | sum(z<0.5) < length(gc)*0.0005)
break;
lli <- sum(z*(logp1+log(p)) + (1-z)*(logp0+log(1-p)))
llgap <- lli - llf
llf <- lli
i <- i + 1
if(emtrace)
cat("......... Iteration",i,'\tll',llf,'\tincrement',llgap,'\n')
}
### gc effects
gcbase <- round(seq(0,1,0.001),3)
gcbias <- list(gc=gcbase,
mu0=predict(lmns0,data.frame(gc = gcbase),type="response"),
mu1=predict(lmns1,data.frame(gc = gcbase),type="response"),
mu0med0=median(predY0[z<0.5]),
mu1med1=median(predY1[z>=0.5]),
mu0med1=median(predY0[z>=0.5]),
mu1med0=median(predY1[z<0.5]))
if(plot){
tmp <- nrow(dat)
idx0 <- sample.int(tmp,min(100000,tmp))
rbPal <- colorRampPalette(c('green','orange'))
color <- rbPal(20)[as.numeric(cut(z[idx0],breaks = 20))]
plot(dat$gc[idx0],dat$y[idx0]+0.5,col=color,xlim=gcrange,pch=20,
main=paste(model,samptype),
xlab='Effective GC content',ylab="Read counts",log='y',yaxt='n')
idx00 <- sample.int(tmp,min(5000,tmp))
idx00 <- idx00[order(dat$gc[idx00])]
lines(dat$gc[idx00],predY1[idx00]+0.5,col='red',lwd=3)
lines(dat$gc[idx00],predY0[idx00]+0.5,col='blue',lwd=3)
axis(side=2, at=c(0,2^(0:10))+0.5, labels=c(0,2^(0:10)))
}
gcbias
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.