poisson.R
In chipseq: chipseq: A package for analyzing chipseq data

library("chipseq")

if (file.exists("alignInfo.rda")) load("alignInfo.rda") else
{

    library("BSgenome.Mmusculus.UCSC.mm9")

    ##lanes 1, 3, 6 are Myoblasts
    ##lanes 2, 4, 7 are Myotubes
    ##lane 8 is a reference lane

    lanes <- c(1, 2, 3, 4, 6, 7, 8)

    reads = vector("list", length = length(lanes))
    names(reads) = as.character(lanes)

    for (i in seq_along(lanes) ) {
        lane <- lanes[i]
        message("Starting Lane ", lane)
        pat <- paste("s_", lane, ".map", sep="")
        ## we drop the sex chromosomes and mitochondria.
        reads[[i]] <- readAndClean("/home/jdavison/ycao/26-06-2008/binary",
                                   pattern = pat, exclude = "[MXY]|rand")
    }

    lreads <- lapply(reads, as.list) # same info (no quality etc) as nested list

    ## mouse has 19 chromosomes

    chrom.list <- paste("chr", c(1:19), sep = "")

    ## nchrom <- length(chrom.list)
    ## chromLens = rep(NA, nchrom)
    ## names(chromLens) = chrom.list
    ## for( i in 1:nchrom) 
    ##     chromLens[i] = nchar(unmasked(Mmusculus[[chrom.list[i]]])) 

    chromLens <-
        sapply(chrom.list,
               function(chr) {
                   nchar(unmasked(Mmusculus[[chr]]))
                   ## same as 'length(Mmusculus[[chr]])' ?
               },
               simplify = TRUE)


    save(lreads, chromLens, file = "alignInfo.rda")

    ## system.time(seqRanges.old <- lapply(reads, extendReads), gcFirst=TRUE)
} 


## basically same, but retains order of chromosomes
system.time(seqRanges <- lapply(lreads, extendReads), gcFirst=TRUE)

cblasts = combineLanes(seqRanges[c(1,3,5)])
ctubes = combineLanes(seqRanges[c(2,4,6)])


## GOALS:

## Background: Suppose all reads could be represented by a point that
## is "close" to the underlying binding site (this could be the center
## of our extended intervals).  Without loss of generality, the
## locations of these points could be modeled as a non-uniform Poisson
## process.  Comparisons between lanes (or unions of lanes), say lane
## A and lane B, could be viewed as the null hypothesis that the
## intensities of the corresponding processes are multiples of each
## other (the constant factor c reflecting differences in the overall
## number of reads).  In this situation, suppose we had an interval I
## where A had exactly k points.  Then conditional on I, the number of
## points in I from B would be Poisson(c * |I|), where |I| is some
## measure of the length of I.  The unconditional distribution would
## be a Poisson mixture over the distribution of |I|, which would
## hopefully depend only on k and not the unknown Poisson process
## intensity.

## Anyway, so one pseudo-logical choice of I is as follows: within
## each ``island'', choose the shortest interval covering k reads.

## We are doing things in terms of islands, not point estimates.  In
## terms of coverage x on an island, viewSums(x) / 200L gives the
## number of reads in the island.  For now, we will assume this is
## also approximately true (and meaningful) for parts of an island.
## So we could think of finding shortest subintervals in an island so
## that viewSums(x) / 200L is close to k.  We would then count
## viewSums(x) for the other lane.

## For now, we won't even do that.  We will just slice coverage of A
## at k, and for the resulting "peaks", do viewSums() on both A and B.
## We can then think of the values on A as predictor and those on B as
## response, and maybe come up with some sort of variance model.

covblasts = laneCoverage(cblasts[1:4], chromLens)
covtubes = laneCoverage(ctubes[1:4], chromLens)
covctrl = laneCoverage(seqRanges[["8"]][1:4], chromLens)


## coverage() is potentially memory intensive

if (file.exists("peakSummary.rda")) load("peakSummary.rda") else
{

    ref.peaks <- lapply(covtubes, slice, lower = 10)
    ref.peaks.in.blasts <-
        copyIRangesbyChr(ref.peaks, covblasts)
    ref.peaks.in.ctrl <-
        copyIRangesbyChr(ref.peaks, covctrl)


    ## Switch roles (ref vs response).  Beware of wrong column names
    if (FALSE)
    {
        ref.peaks <- lapply(covblasts, slice, lower = 10)
        ref.peaks.in.blasts <-
            copyIRangesbyChr(ref.peaks, covtubes)
        ref.peaks.in.ctrl <-
            copyIRangesbyChr(ref.peaks, covctrl)
    }


    peakSummary <-
        do.call(make.groups,
                sapply(names(ref.peaks),
                       function(chr) {
                           data.frame(start = start(ref.peaks[[chr]]),
                                      end = end(ref.peaks[[chr]]),
                                      reads.tubes = viewSums(ref.peaks[[chr]]) / 200,
                                      reads.blasts = viewSums(ref.peaks.in.blasts[[chr]]) / 200,
                                      reads.ctrl = viewSums(ref.peaks.in.ctrl[[chr]]) / 200)
                       },
                       simplify = FALSE))
    names(peakSummary)[names(peakSummary) == "which"] <- "chromosome"


    save(peakSummary, file = "peakSummary.rda")
}


xyplot(sqrt(reads.blasts) + sqrt(3 * reads.ctrl) ~ sqrt(reads.tubes) | chromosome,
       data = peakSummary, auto.key = TRUE,
       subset = chromosome %in% c("chr1", "chr2", "chr3", "chr4"),
       par.settings = simpleTheme(pch = 16, alpha = 0.4),
       type = c("p", "g", "r"), col.line = "black", aspect = "iso")


xyplot(sqrt(reads.blasts) ~ sqrt(reads.tubes) | chromosome,
       data = peakSummary, auto.key = TRUE,
       subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4") &
                 reads.ctrl < quantile(reads.ctrl, 0.99)),
       par.settings = simpleTheme(pch = ".", cex = 3),
       type = c("p", "g", "r"), col.line = "black", aspect = "iso")


xyplot(log2(1+reads.blasts) ~ log2(1+reads.tubes) | chromosome,
       data = peakSummary, auto.key = TRUE,
       subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4") &
                 reads.ctrl < quantile(reads.ctrl, 0.99)),
       par.settings = simpleTheme(pch = ".", cex = 3),
       type = c("p", "g", "r"), col.line = "black", aspect = "iso")


## Naive regression: y|x ~ Poisson(cx), i.e., mean=cx, variance=cx.
## We fit a model with y ~ 0 + x with
## weights=1/sqrt(variance)=1/sqrt(x).  This ignores that Poisson has
## no scale factor to estimate, but we probably have some variance
## inflation anyway.  The other option, of course, is to use glm().
## UPDATE: the proper weights seem to be 1/x, even with sqrt(y) as the
## response.

peakSummary.lm <- 
    within(peakSummary,
       {
           fm <- lm(sqrt(reads.blasts) ~ 0 + chromosome:sqrt(reads.tubes),
                    weights = 1/reads.tubes)
           ## print(summary(fm))
           pred.blasts <- predict(fm)
           resid.blasts <- residuals(fm, type = "deviance")
           rstandard.blasts <- rstandard(fm)
           rstudent.blasts <- rstudent(fm)
           rm(fm)
       })

xyplot(resid.blasts ~ log2(reads.tubes) | chromosome,
       data = peakSummary.lm, auto.key = TRUE,
       subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
       par.settings = simpleTheme(pch = ".", cex = 3),
       type = c("p", "g", "smooth"), col.line = "black")

xyplot(abs(resid.blasts) ~ log2(reads.tubes) | chromosome,
       data = peakSummary.lm, auto.key = TRUE,
       subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
       par.settings = simpleTheme(pch = ".", cex = 3),
       type = c("p", "g", "smooth"), col.line = "black")

qqmath(~resid.blasts | chromosome,
       data = peakSummary.lm, 
       subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
       par.settings = simpleTheme(pch = ".", cex = 3),
       panel = function(...) {
           panel.abline(0, 1)
           panel.qqmath(...)
       },
       type = c("p", "g"), col.line = "black", aspect = "iso")

bwplot(chromosome ~ resid.blasts, data = peakSummary.lm)


## library(robustbase)

## A desirable alternative is robust regression, but lmrob in
## robustbase doesn't seem to handle weights.  The log2 responses seem
## reasoanbly homoskedastic.  


xyplot(abs(log2(reads.blasts)-log2(reads.tubes)) ~ log2(reads.tubes) | chromosome,
       data = peakSummary.lm, auto.key = TRUE,
       subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
       par.settings = simpleTheme(pch = ".", cex = 3),
       type = c("p", "g", "smooth"), col.line = "black")

## Our model of y ~ c.x translates to log(y) ~ log(c) + log(x), so all
## we need to do is estimate log(c). A robust estimate is
## median(log(y)-log(x)).  So residuals are
##  [ (log(y)-log(x)) - median(log(y)-log(x)) ]


peakSummary.rob <- 
    within(peakSummary,
       {
           diffs <- log2(reads.blasts)-log2(reads.tubes)
           resids <-  # prefer per-chromosome?
               (diffs - median(diffs)) / mad(diffs)
           resids.mean <- diffs - mean(diffs[is.finite(diffs)])
       })

median(peakSummary.rob$diffs)

2^median(peakSummary.rob$diffs)
sum(sapply(cblasts, length)) / sum(sapply(ctubes, length))

xyplot(resids ~ log2(reads.tubes) | chromosome,
       data = peakSummary.rob, auto.key = TRUE,
       subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
       par.settings = simpleTheme(pch = ".", cex = 3),
       type = c("p", "g", "smooth"), col.line = "black")


xyplot(abs(resids) ~ log2(reads.tubes) | chromosome,
       data = peakSummary.rob, auto.key = TRUE,
       subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
       par.settings = simpleTheme(pch = ".", cex = 3),
       type = c("p", "g", "smooth"), col.line = "black")

qqmath(~resids | chromosome,
       data = peakSummary.rob, 
       subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
       par.settings = simpleTheme(pch = ".", cex = 3),
       panel = function(...) {
           panel.abline(0, 1)
           panel.qqmath(...)
       },
       type = c("p", "g"), col.line = "black", aspect = "iso")

bwplot(chromosome ~ resids, data = peakSummary.rob)

toppeaks <- subset(peakSummary.rob, abs(resids) > 4)
rownames(toppeaks) <- NULL
toppeaks[rev(order(abs(toppeaks$resids))), 1:6]