hmm.R
In chipseq: chipseq: A package for analyzing chipseq data

## rdafiles <- list.files(".", pattern = "locs.*rda")

## for (file in rdafiles)
## {
##     local({
##         cat(file, fill = TRUE)
##         load(file)
##         assign(gsub(".rda", "", file, fixed = TRUE),
##                locs,
##                envir = .GlobalEnv)
##     })
## }


if (FALSE)
{

library("org.Mm.eg.db")
eg.sym <- 
    toTable(org.Mm.egSYMBOL2EG[c("Tnnc2", "Des", "Myh3", "Myog", "Mylpf",
                                 "Myl1", "Acta1", "Ckm", "Cdh15", "Myl4",
                                 "Itga7")])
eg.chrloc <-
    toTable(org.Mm.egCHRLOC[ eg.sym$gene_id ])
merge(eg.sym, eg.chrloc)

}


library(lattice)
library(simplehmm)


allLocs <- function(chr, lane, jitter = TRUE)
    ## jitter to make the locations more or less unique (otherwise we
    ## have problems downstream)
{
    load(sprintf("locs_%s_%s.rda", lane, chr))
    locs <- with(locs, c(forward, reverse))
    if (jitter) locs + runif(length(locs))
    else locs
}


base.lane <- 8
data.lane <- 2

target.hits <- 30
initial.states <- c(5, 20, 40)

## create list suitable for subsequent work.  We use sapply instead of
## lapply because it uses the first argument as names.

chrs.to.use <-
    c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", ## "chr8",
      "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15",
      "chr16", "chr17", "chr18", "chr19")


base.locs <-
    sapply(chrs.to.use,
           allLocs, lane = base.lane,
           simplify = FALSE)

data.locs <-
    sapply(chrs.to.use,
           allLocs, lane = data.lane,
           simplify = FALSE)

names(data.locs)
chromosomes <- names(data.locs)

if (FALSE)
{
    histogram(~ data/1e6 | which,
              data =
              make.groups(base = base.locs$chr6,
                          data = data.locs$chr6),
              nint = 1000,
              layout = c(1, 2), type = "count",
              col = "magenta", border = "transparent")



    QAhist <- function(file = "locs_3_chr6.rda", xlab = "Location (mb)", ...)
    {
        load(file)
        foo <-
            histogram(~ I(data/1e6) | which,
                      data = do.call(make.groups, locs),
                      nint = 1000,
                      main = file,
                      xlab = xlab,
                      layout = c(1, 2), type = "count",
                      col = "magenta", border = "transparent", ...)
        plot(foo)
    }

    pdf("QA-hist.pdf")
    QAhist("locs_1_chr6.rda")
    QAhist("locs_2_chr6.rda")
    QAhist("locs_3_chr6.rda")
    QAhist("locs_4_chr6.rda")
    QAhist("locs_5_chr6.rda")
    QAhist("locs_6_chr6.rda")
    QAhist("locs_7_chr6.rda")
    QAhist("locs_8_chr6.rda")
    dev.off()

}



base.hits <- sapply(base.locs, length)
data.hits <- sapply(data.locs, length)

rel <- data.hits / base.hits


getIntervals <-
    function(reflocs, obslocs,
             target = 15,
             prop = length(obslocs) / length(reflocs))
{
    nbins <- ceiling(length(reflocs) * prop / target)
    quantile(reflocs, prob = ppoints(nbins, a = 1), names = FALSE)
}


bin.list <-
    sapply(chromosomes,
           function(chr, ...)
           getIntervals(base.locs[[chr]], data.locs[[chr]], ...),
           target = target.hits,
           prop = median(rel),
           simplify = FALSE)

## str(bin.list)

datahits.list <-
    sapply(chromosomes,
           function(chr)
           as.numeric(table(cut(x = data.locs[[chr]],
                                breaks = bin.list[[chr]], labels = NULL))),
           simplify = FALSE)

str(datahits.list)


if (FALSE)
{
    ## just a histogram of the raw counts, with no smoothing

    countList <- 
        mapply(FUN =
               function(counts, bins) {
                   stopifnot(length(bins) == length(counts) + 1)
                   data.frame(bin.center = 0.5 * (bins[-length(bins)] + bins[-1]) / 1e6,
                              counts = counts)
               }, datahits.list, bin.list, SIMPLIFY = FALSE)
    countDF <- do.call(make.groups, countList)

    pdf("spikes.pdf", width = 20, height = 24)

##     xyplot(counts ~ bin.center | which, countDF, type = c("l", "g"),
##            layout = c(1, length(chromosomes)), lwd = 0.5,
##            main = sprintf("Lane %g normalized by lane %g", data.lane, base.lane),
##            scales =
##            list(x = list(tick.number = 20),
##                 y = list(relation = "free", rot = 0)),
##            strip = FALSE, strip.left = TRUE)

    xyplot(counts ~ bin.center | cut(bin.center, 10),
           countDF, subset = (which == "chr2"),
           type = c("l", "g"),
           layout = c(1, 10),
           lwd = 0.5, col = "black",
           main = sprintf("Chr 2, Lane %g normalized by lane %g", data.lane, base.lane),
           scales =
           list(x = list(relation = "sliced", tick.number = 20),
                y = list(relation = "same", rot = 0)),
           strip = FALSE)

    
    dev.off()


}



fm3 <-
    coverageHmm(datahits.list,
                family =
                hmm.family("nbinom",
                           mu = 1,
                           states.scale = initial.states,
                           size = 20,
                           states.free = TRUE))

fm3 <- update(fm3, iterations = 150, verbose = TRUE)
fm3 <- update(fm3, iterations = 2, verbose = TRUE)

summary(fm3)

pdf(sprintf("comparison_%g_%g.pdf", data.lane, base.lane),
    width = 10, height = 14)

dotplot(sort(rel), xlab = "Relative coverage per chromosome")


xyplot(fm3, decode = "viterbi",
       abscissa =
       lapply(bin.list,
             function(x) (x[-1] + x[-length(x)]) / 2e6 ),
       col = c('darkgrey', 'black'),
       ylim = c(0, 100),
       scales = list(x = list(tick.number = 20, axs = "r")),
       xlab = "Location (Mb)", main = "Decoded path (Viterbi)",
       lty = 1,
       strip.left = TRUE)

dev.off()



## print(rootogram(fm3, xlim = c(0, 50), strip = TRUE,
##                 main = "Marginal rootogram"))


## print(rootogram(fm3, marginal = FALSE,
##                 main = "Conditional rootogram",
##                 xlim = c(0, 50), strip = TRUE))



## long.output <- paste(prefix, "hmm-results-long.csv", sep = "-")
## short.output <- paste(prefix, "hmm-results-short.csv", sep = "-")


## write.hmm(fm3,
##           bins = bin.list,
##           decode = "viterbi",
##           file = long.output)

## write.hmm(fm3,
##           bins = bin.list,
##           decode = "viterbi",
##           combine = TRUE,
##           file = short.output)





## densityplot(~data, do.call(make.groups, locs),
##             groups = which,
##             n = 1000, bw = 5000,
##             plot.points = FALSE)