R/refineSites.r
In gcapc: GC Aware Peak Caller

Documented in refineSites

#' @title Adjust ChIP-seq Read Count Table
#'
#' @description
#' For a given set of sites with the same/comparable width, their
#' read count table from multiple samples are adjusted based on
#' potential GC effects. For each sample separately, GC effects are
#' estimated based on their effective GC content and
#' reads count using generalized linear mixture models. Then, count
#' table is adjusted based on estimated GC effects.
#' It it important that the given sites includes both foreground and
#' background regions, see \code{sites} below.
#'
#' @param counts A count matrix with each row corresponding to each element
#' in \code{sites} and each column corresponding to one sample. Every value
#' in the matrix indicates the read counts for one site in one sample. It is
#' noted that since effective GC content is used in this function, it is
#' important to extend either original reads or original \code{sites} to
#' consider reads that 5' starting in \code{flank} regions, when counting
#' sequencing reads.
#'
#' @param sites A GRanges object with length equivalent to number of rows
#' in \code{counts} matrix. It is preferable that every GRange have the same
#' width; otherwise, the mixture model is modeling different things with
#' wider GRanges certainly have more reads. However, it is OK if only a
#' minority of GRanges have different width, since the model is pretty robust
#' to outliers. Also, it is important that \code{sites} including both
#' foreground and background regions in each sample, otherwise the mixture
#' model will fail to fit two components. Fortunately, if you are inputing
#' a large collection of samples, foreground sites in one sample may play
#' the role as background in other samples. In this case, manually selecting
#' real background is not necessary.
#'
#' @param flank A non-negative integer specifying the flanking width of
#' ChIP-seq binding. This parameter provides the flexibility that reads
#' appear in flankings by decreased probabilities as increased distance
#' from binding region. This paramter helps to define effective GC
#' content calculation.
#'
#' @param outputidx A logical vector with the length equivalent to number
#' of rows in \code{counts}. This provides which subset of adjusted count
#' matrix should be outputed. This would be extremely useful if you have
#' manually collected background sites and want to only export the sites
#' you care about.
#'
#' @param gcrange A non-negative numeric vector with length 2. This vector
#' sets the range of GC content to filter regions for GC effect estimation.
#' For human, most regions have GC content between 0.3 and 0.8, which is
#' set as the default. Other regions with GC content beyond this range
#' will be ignored. This range is critical when very few foreground regions
#' are selected for mixture model fitting, since outliers could drive the
#' regression lines. Thus, if possible, first make a scatter plot between
#' counts and GC content to decide  this parameter. Alternatively,
#' select a narrower range, e.g. c(0.35,0.7), to aviod outlier effects from
#' both high and low GC-content regions.
#'
#' @param emtrace A logical vector which, when TRUE (default), allows to
#' print the trace of log likelihood changes in EM iterations.
#'
#' @param plot A logical vector which, when TRUE (default), returns miture
#' fitting plot.
#'
#' @param model A character specifying the distribution model to be used in
#' generalized linear model fitting. The default is negative
#' binomial(\code{nbinom}), while \code{poisson} is also supported currently.
#' More details see \code{gcEffects}.
#'
#' @param mu0 A non-negative numeric initiating read count signals for
#' background sites. This is treated as the starting value of background mean
#' for poisson/nbinom fitting.
#'
#' @param mu1 A non-negative numeric initiating read count signals for
#' foreground sites. This is treated as the starting value of foreground mean
#' for poisson/nbinom fitting.
#'
#' @param theta0 A non-negative numeric initiating the shape parameter of
#' negative binomial model for background sites. For more detail, see
#' theta in \code{\link[MASS]{glm.nb}} function.
#'
#' @param theta1 A non-negative numeric initiating the shape parameter of
#' negative binomial model for foreground sites. For more detail, see
#' theta in \code{\link[MASS]{glm.nb}} function.
#'
#' @param p A non-negative numeric specifying the proportion of foreground
#' sites in all estimated sites. This is treated as a starting value for
#' EM algorithm.
#'
#' @param converge A non-negative numeric specifying the condition of EM
#' algorithm termination. EM algorithm stops when the ratio of log likelihood
#' increment to whole log likelihood is less or equivalent to
#' \code{converge}.
#'
#' @param genome A \link[BSgenome]{BSgenome} object containing the sequences
#' of the reference genome that was used to align the reads, or the name of
#' this reference genome specified in a way that is accepted by the
#' \code{\link[BSgenome]{getBSgenome}} function defined in the \pkg{BSgenome}
#' software package. In that case the corresponding BSgenome data package
#' needs to be already installed (see \code{?\link[BSgenome]{getBSgenome}} in
#' the \pkg{BSgenome} package for the details).
#'
#' @param gctype A character vector specifying choice of method to calculate
#' effective GC content. Default \code{ladder} is based on uniformed fragment
#' distribution. A more smoother method based on tricube assumption is also
#' allowed. However, tricube should be not used if \code{flank} is too large.
#'
#' @return The count matrix after GC adjustment. The matrix values are not
#' integer any more.
#'
#' @import S4Vectors
#' @import IRanges
#' @import GenomicRanges
#' @import Biostrings
#' @import MASS
#' @importFrom BSgenome getBSgenome
#' @importFrom BSgenome getSeq
#' @importFrom matrixStats colMedians
#' @importFrom splines ns
#' @importFrom grDevices rgb
#' @importFrom graphics plot
#' @importFrom graphics axis
#' @importFrom graphics lines
#' @importFrom stats dpois
#' @importFrom stats dnbinom
#' @importFrom stats glm
#' @importFrom stats predict
#'
#' @export


refineSites <- function(counts,sites,flank=250L,
                        outputidx=rep(TRUE,nrow(counts)),
                        gcrange=c(0.3,0.8),emtrace=TRUE,plot=TRUE,
                        model=c('nbinom','poisson'),
                        mu0=1,mu1=50,theta0=mu0,theta1=mu1,
                        p=0.2,converge=1e-4,
                        genome="hg19",gctype=c("ladder","tricube")){
    ### input sanity check
    genome <- getBSgenome(genome)
    model <- match.arg(model)
    gctype <- match.arg(gctype)
    sitew <- median(width(sites))
    if(sum(gcrange<0)>0 || sum(gcrange>1)>0 || sum(is.na(gcrange))>0)
        stop("Parameter 'gcrange' error.\n")
    if(mu0<=0 || mu1<=0 || mu0>=mu1)
        stop("Parameter 'mu0' or 'mu1' error.\n")
    if(model=='nbinom' && (theta1<=0 || theta0<=0))
        stop("Parameter 'theta0' or 'theta1' error in nbinom model.\n")
    if(p<=0 || p>=1)
        stop("'p' must be in (0,1).\n")
    if(converge<=0 || converge>0.1)
        stop("'converge' must be in (0,0.1].\n")
    if(gctype=="ladder"){
        weight <- c(seq_len(flank),rep(flank+1,sitew),rev(seq_len(flank)))
        weight <- weight/sum(weight)
    }else if(gctype=="tricube"){
        w <- flank+floor(sitew/2)
        weight <- (1-abs(seq(-w,w)/w)^3)^3
        weight <- weight/sum(weight)
    }
    ### effective gc content
    cat("Start to estimate GC effects.\n")
    cat("...... Calculating GC content with flanking",flank,"\n")
    nr <- shift(resize(sites,width(sites) + flank*2),-flank)
    seqs <- getSeq(genome,nr)
    gcpos <- startIndex(vmatchPattern("S", seqs, fixed="subject"))
    gc <- round(sapply(gcpos,function(x) sum(weight[x])),3)
    rm(nr,seqs,gcpos)
    ### em algorithms
    cat("...... Estimating GC effects\n")
    fitmu0 <- fitmu1 <- fitz <- matrix(NA,sum(outputidx),ncol(counts))
    for(rep in seq_len(ncol(counts))){
        cat("......... Estimating sample",rep,"\n")
        rc <- counts[,rep]
        idx <- gc>=gcrange[1] & gc<=gcrange[2] & !is.na(rc) & !is.na(gc)
        dat <- data.frame(y=rc[idx],gc=gc[idx])
        theta1E <- theta1
        theta0E <- theta0
        if(model=='poisson'){
            logp1 <- dpois(dat$y, lambda = mu1, log = TRUE)
            logp0 <- dpois(dat$y, lambda = mu0, log = TRUE)
        }else{
            logp1 <- dnbinom(dat$y, size=theta1E, mu=mu1, log = TRUE)
            logp0 <- dnbinom(dat$y, size=theta0E, mu=mu0, log = TRUE)
        }
        z <- 1/(1+exp(logp0-logp1)*(1-p)/p)
        llf <- sum(z*(logp1+log(p)) + (1-z)*(logp0+log(1-p)))
        llgap <- llf
        i <- 0
        while(abs(llgap) > (abs(llf) * converge) && i < 100){
            p <- (2+sum(z))/(2*2+length(z))
            dat1 <- dat[z>=0.5,]
            dat0 <- dat[z<0.5,]
            if(model=='poisson'){
                lmns0 <- glm(y ~ ns(gc, df = 2), data=dat0, family="poisson")
                lmns1 <- glm(y ~ ns(gc, df = 2), data=dat1, family="poisson")
                predY0 <- predict(lmns0,data.frame(gc=dat$gc),type="response")
                predY1 <- predict(lmns1,data.frame(gc=dat$gc),type="response")
                logp1 <- dpois(dat$y, lambda = predY1, log = TRUE)
                logp0 <- dpois(dat$y, lambda = predY0, log = TRUE)
            }else{
                lmns0 <- glm.nb(y ~ ns(gc, df=2),data=dat0,init.theta=theta0E)
                lmns1 <- glm.nb(y ~ ns(gc, df=2),data=dat1,init.theta=theta1E)
                predY0 <- predict(lmns0,data.frame(gc=dat$gc),type="response")
                predY1 <- predict(lmns1,data.frame(gc=dat$gc),type="response")
                theta1E <- lmns1$theta
                theta0E <- lmns0$theta
                logp1 <- dnbinom(dat$y, size=theta1E, mu=predY1, log = TRUE)
                logp0 <- dnbinom(dat$y, size=theta0E, mu=predY0, log = TRUE)
            }
            z <- 1/(1+exp(logp0-logp1)*(1-p)/p)
            if(sum(z>=0.5) < length(gc)*0.0005 | sum(z<0.5) < length(gc)*0.0005)
                break;
            lli <- sum(z*(logp1+log(p)) + (1-z)*(logp0+log(1-p)))
            llgap <- lli - llf
            llf <- lli
            i <- i + 1
            if(emtrace)
                cat("......... Iteration",i,'\tll',llf,'\tincrement',llgap,'\n')
        }
        if(plot){
            idx0 <- sample.int(nrow(dat),min(50000,nrow(dat)))
            plot(dat$gc[idx0],dat$y[idx0]+0.5,col=rgb(0,0,0,alpha=0.1),
                 xlim=gcrange,pch=20,main=paste("rep",rep),log='y',yaxt='n',
                 xlab='Effective GC content',ylab="Read counts")
            idx00 <- sample.int(nrow(dat),min(1000,nrow(dat)))
            idx00 <- idx00[order(dat$gc[idx00])]
            lines(dat$gc[idx00],predY1[idx00]+0.5,col='red',lwd=3)
            lines(dat$gc[idx00],predY0[idx00]+0.5,col='blue',lwd=3)
            axis(side=2, at=c(0,2^(0:10))+0.5, labels=c(0,2^(0:10)))
        }
        ### gc effects
        fitmu0[idx[outputidx],rep] <- predY0[outputidx[idx]]
        fitmu1[idx[outputidx],rep] <- predY1[outputidx[idx]]
        fitz[idx[outputidx],rep] <- z[outputidx[idx]]
        cat("......... Sample",rep,"finished\n")
    }
    med1 <- fitmu1
    med0 <- fitmu0
    med1[fitz<0.5] <- NA
    med0[fitz<0.5] <- NA
    gce <- log2(t(t(fitmu1)/colMedians(med1,na.rm=TRUE))) * fitz +
           log2(t(t(fitmu0)/colMedians(med0,na.rm=TRUE))) * (1-fitz)
    counts[outputidx,] / 2^gce
}