Nothing
library("chipseq")
if (file.exists("alignInfo.rda")) load("alignInfo.rda") else
{
library("BSgenome.Mmusculus.UCSC.mm9")
##lanes 1, 3, 6 are Myoblasts
##lanes 2, 4, 7 are Myotubes
##lane 8 is a reference lane
lanes <- c(1, 2, 3, 4, 6, 7, 8)
reads = vector("list", length = length(lanes))
names(reads) = as.character(lanes)
for (i in seq_along(lanes) ) {
lane <- lanes[i]
message("Starting Lane ", lane)
pat <- paste("s_", lane, ".map", sep="")
## we drop the sex chromosomes and mitochondria.
reads[[i]] <- readAndClean("/home/jdavison/ycao/26-06-2008/binary",
pattern = pat, exclude = "[MXY]|rand")
}
lreads <- lapply(reads, as.list) # same info (no quality etc) as nested list
## mouse has 19 chromosomes
chrom.list <- paste("chr", c(1:19), sep = "")
## nchrom <- length(chrom.list)
## chromLens = rep(NA, nchrom)
## names(chromLens) = chrom.list
## for( i in 1:nchrom)
## chromLens[i] = nchar(unmasked(Mmusculus[[chrom.list[i]]]))
chromLens <-
sapply(chrom.list,
function(chr) {
nchar(unmasked(Mmusculus[[chr]]))
## same as 'length(Mmusculus[[chr]])' ?
},
simplify = TRUE)
save(lreads, chromLens, file = "alignInfo.rda")
## system.time(seqRanges.old <- lapply(reads, extendReads), gcFirst=TRUE)
}
## basically same, but retains order of chromosomes
system.time(seqRanges <- lapply(lreads, extendReads), gcFirst=TRUE)
cblasts = combineLanes(seqRanges[c(1,3,5)])
ctubes = combineLanes(seqRanges[c(2,4,6)])
## GOALS:
## Background: Suppose all reads could be represented by a point that
## is "close" to the underlying binding site (this could be the center
## of our extended intervals). Without loss of generality, the
## locations of these points could be modeled as a non-uniform Poisson
## process. Comparisons between lanes (or unions of lanes), say lane
## A and lane B, could be viewed as the null hypothesis that the
## intensities of the corresponding processes are multiples of each
## other (the constant factor c reflecting differences in the overall
## number of reads). In this situation, suppose we had an interval I
## where A had exactly k points. Then conditional on I, the number of
## points in I from B would be Poisson(c * |I|), where |I| is some
## measure of the length of I. The unconditional distribution would
## be a Poisson mixture over the distribution of |I|, which would
## hopefully depend only on k and not the unknown Poisson process
## intensity.
## Anyway, so one pseudo-logical choice of I is as follows: within
## each ``island'', choose the shortest interval covering k reads.
## We are doing things in terms of islands, not point estimates. In
## terms of coverage x on an island, viewSums(x) / 200L gives the
## number of reads in the island. For now, we will assume this is
## also approximately true (and meaningful) for parts of an island.
## So we could think of finding shortest subintervals in an island so
## that viewSums(x) / 200L is close to k. We would then count
## viewSums(x) for the other lane.
## For now, we won't even do that. We will just slice coverage of A
## at k, and for the resulting "peaks", do viewSums() on both A and B.
## We can then think of the values on A as predictor and those on B as
## response, and maybe come up with some sort of variance model.
covblasts = laneCoverage(cblasts[1:4], chromLens)
covtubes = laneCoverage(ctubes[1:4], chromLens)
covctrl = laneCoverage(seqRanges[["8"]][1:4], chromLens)
## coverage() is potentially memory intensive
if (file.exists("peakSummary.rda")) load("peakSummary.rda") else
{
ref.peaks <- lapply(covtubes, slice, lower = 10)
ref.peaks.in.blasts <-
copyIRangesbyChr(ref.peaks, covblasts)
ref.peaks.in.ctrl <-
copyIRangesbyChr(ref.peaks, covctrl)
## Switch roles (ref vs response). Beware of wrong column names
if (FALSE)
{
ref.peaks <- lapply(covblasts, slice, lower = 10)
ref.peaks.in.blasts <-
copyIRangesbyChr(ref.peaks, covtubes)
ref.peaks.in.ctrl <-
copyIRangesbyChr(ref.peaks, covctrl)
}
peakSummary <-
do.call(make.groups,
sapply(names(ref.peaks),
function(chr) {
data.frame(start = start(ref.peaks[[chr]]),
end = end(ref.peaks[[chr]]),
reads.tubes = viewSums(ref.peaks[[chr]]) / 200,
reads.blasts = viewSums(ref.peaks.in.blasts[[chr]]) / 200,
reads.ctrl = viewSums(ref.peaks.in.ctrl[[chr]]) / 200)
},
simplify = FALSE))
names(peakSummary)[names(peakSummary) == "which"] <- "chromosome"
save(peakSummary, file = "peakSummary.rda")
}
xyplot(sqrt(reads.blasts) + sqrt(3 * reads.ctrl) ~ sqrt(reads.tubes) | chromosome,
data = peakSummary, auto.key = TRUE,
subset = chromosome %in% c("chr1", "chr2", "chr3", "chr4"),
par.settings = simpleTheme(pch = 16, alpha = 0.4),
type = c("p", "g", "r"), col.line = "black", aspect = "iso")
xyplot(sqrt(reads.blasts) ~ sqrt(reads.tubes) | chromosome,
data = peakSummary, auto.key = TRUE,
subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4") &
reads.ctrl < quantile(reads.ctrl, 0.99)),
par.settings = simpleTheme(pch = ".", cex = 3),
type = c("p", "g", "r"), col.line = "black", aspect = "iso")
xyplot(log2(1+reads.blasts) ~ log2(1+reads.tubes) | chromosome,
data = peakSummary, auto.key = TRUE,
subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4") &
reads.ctrl < quantile(reads.ctrl, 0.99)),
par.settings = simpleTheme(pch = ".", cex = 3),
type = c("p", "g", "r"), col.line = "black", aspect = "iso")
## Naive regression: y|x ~ Poisson(cx), i.e., mean=cx, variance=cx.
## We fit a model with y ~ 0 + x with
## weights=1/sqrt(variance)=1/sqrt(x). This ignores that Poisson has
## no scale factor to estimate, but we probably have some variance
## inflation anyway. The other option, of course, is to use glm().
## UPDATE: the proper weights seem to be 1/x, even with sqrt(y) as the
## response.
peakSummary.lm <-
within(peakSummary,
{
fm <- lm(sqrt(reads.blasts) ~ 0 + chromosome:sqrt(reads.tubes),
weights = 1/reads.tubes)
## print(summary(fm))
pred.blasts <- predict(fm)
resid.blasts <- residuals(fm, type = "deviance")
rstandard.blasts <- rstandard(fm)
rstudent.blasts <- rstudent(fm)
rm(fm)
})
xyplot(resid.blasts ~ log2(reads.tubes) | chromosome,
data = peakSummary.lm, auto.key = TRUE,
subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
par.settings = simpleTheme(pch = ".", cex = 3),
type = c("p", "g", "smooth"), col.line = "black")
xyplot(abs(resid.blasts) ~ log2(reads.tubes) | chromosome,
data = peakSummary.lm, auto.key = TRUE,
subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
par.settings = simpleTheme(pch = ".", cex = 3),
type = c("p", "g", "smooth"), col.line = "black")
qqmath(~resid.blasts | chromosome,
data = peakSummary.lm,
subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
par.settings = simpleTheme(pch = ".", cex = 3),
panel = function(...) {
panel.abline(0, 1)
panel.qqmath(...)
},
type = c("p", "g"), col.line = "black", aspect = "iso")
bwplot(chromosome ~ resid.blasts, data = peakSummary.lm)
## library(robustbase)
## A desirable alternative is robust regression, but lmrob in
## robustbase doesn't seem to handle weights. The log2 responses seem
## reasoanbly homoskedastic.
xyplot(abs(log2(reads.blasts)-log2(reads.tubes)) ~ log2(reads.tubes) | chromosome,
data = peakSummary.lm, auto.key = TRUE,
subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
par.settings = simpleTheme(pch = ".", cex = 3),
type = c("p", "g", "smooth"), col.line = "black")
## Our model of y ~ c.x translates to log(y) ~ log(c) + log(x), so all
## we need to do is estimate log(c). A robust estimate is
## median(log(y)-log(x)). So residuals are
## [ (log(y)-log(x)) - median(log(y)-log(x)) ]
peakSummary.rob <-
within(peakSummary,
{
diffs <- log2(reads.blasts)-log2(reads.tubes)
resids <- # prefer per-chromosome?
(diffs - median(diffs)) / mad(diffs)
resids.mean <- diffs - mean(diffs[is.finite(diffs)])
})
median(peakSummary.rob$diffs)
2^median(peakSummary.rob$diffs)
sum(sapply(cblasts, length)) / sum(sapply(ctubes, length))
xyplot(resids ~ log2(reads.tubes) | chromosome,
data = peakSummary.rob, auto.key = TRUE,
subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
par.settings = simpleTheme(pch = ".", cex = 3),
type = c("p", "g", "smooth"), col.line = "black")
xyplot(abs(resids) ~ log2(reads.tubes) | chromosome,
data = peakSummary.rob, auto.key = TRUE,
subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
par.settings = simpleTheme(pch = ".", cex = 3),
type = c("p", "g", "smooth"), col.line = "black")
qqmath(~resids | chromosome,
data = peakSummary.rob,
subset = (chromosome %in% c("chr1", "chr2", "chr3", "chr4")),
par.settings = simpleTheme(pch = ".", cex = 3),
panel = function(...) {
panel.abline(0, 1)
panel.qqmath(...)
},
type = c("p", "g"), col.line = "black", aspect = "iso")
bwplot(chromosome ~ resids, data = peakSummary.rob)
toppeaks <- subset(peakSummary.rob, abs(resids) > 4)
rownames(toppeaks) <- NULL
toppeaks[rev(order(abs(toppeaks$resids))), 1:6]
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.