PeakSegJoint: Joint Peak Detection in Several ChIP-Seq Samples

library(testthat)
context("real data")

library(PeakSegJoint)
library(ggplot2)

data(H3K27ac.TDH.MMM4)
small.prob <- subset(H3K27ac.TDH.MMM4, chromStart < 7144233)
small.prob <- H3K27ac.TDH.MMM4
before.chromStart <- min(small.prob$chromStart)-1L
after.chromEnd <- max(small.prob$chromEnd)+1L
small.list <- split(small.prob, small.prob$sample.id)
zero <- function(sample.id, chrom, chromStart, chromEnd){
  data.frame(sample.id, chrom, chromStart, chromEnd, count=0L)
}
with.zero.list <- list()
for(sample.id in names(small.list)){
  sample.counts <- small.list[[sample.id]]
  with.zero.list[[sample.id]] <-
    rbind(zero(sample.id, sample.counts$chrom[1],
               before.chromStart, min(sample.counts$chromStart)),
          sample.counts,
          zero(sample.id, sample.counts$chrom[1],
               max(sample.counts$chromEnd), after.chromEnd))
}
with.zero <- do.call(rbind, with.zero.list)
fit <- PeakSegJointSeveral(with.zero)
converted <- ConvertModelList(fit)
segs <- subset(converted$segments, peaks==10)
breaks <- subset(segs, min(chromStart) < chromStart)

ggplot()+
  theme_bw()+
  geom_vline(xintercept=7141905/1e3)+
  ##coord_cartesian(xlim=c(7121907, 7144233)/1e3)+
  theme(panel.margin=grid::unit(0, "cm"))+
  facet_grid(sample.id ~ ., labeller=function(var, val){
    sub("McGill0", "", sub(" ", "\n", val))
  }, scales="free")+
  geom_rect(aes(xmin=chromStart/1e3, xmax=chromEnd/1e3,
                ymin=0, ymax=count),
            fill="grey",
            data=with.zero)+
  geom_segment(aes(chromStart/1e3, mean,
                   xend=chromEnd/1e3, yend=mean),
               color="green",
               data=segs)+
  geom_vline(aes(xintercept=chromStart/1e3),
             linetype="dashed",
             color="green",
             data=breaks)

data(peak.at.profile.end)

fit <- PeakSegJointSeveral(peak.at.profile.end)
sapply(fit$models, "[[", "loss")
converted <- ConvertModelList(fit)

test_that("there is no peakEnd equal to data_end", {
  for(model in fit$models){
    has.peak <- !is.null(model$peak_start_end)
    do.check <- is.finite(model$loss) && has.peak
    if(do.check){
      expect_true(all(is.finite(model$seg3_mean_vec)))
      expect_true(model$peak_start_end[2] < fit$data_start_end[2])
    }
  }
  counts.list <- list()
  for(sample.id in names(peak.at.profile.end)){
    sample.counts <- peak.at.profile.end[[sample.id]]
    counts.list[[sample.id]] <- data.frame(sample.id, sample.counts)
  }
  counts <- do.call(rbind, counts.list)
  segs <- subset(converted$segments, peaks == max(peaks))
  breaks <- subset(segs, min(chromStart) < chromStart)
  ggplot()+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., labeller=function(var, val){
      sub("McGill0", "", sub(" ", "\n", val))
    }, scales="free")+
    geom_rect(aes(xmin=chromStart/1e3, xmax=chromEnd/1e3,
                  ymin=0, ymax=count),
              fill="grey",
              data=counts)+
    geom_segment(aes(chromStart/1e3, mean,
                     xend=chromEnd/1e3, yend=mean),
                 color="green",
                 data=segs)+
    geom_vline(aes(xintercept=chromStart/1e3),
               linetype="dashed",
               color="green",
               data=breaks)
})

test_that("ConvertModelList stops when segments make no sense", {
  fit$models[[2]]$peak_start_end[1] <- fit$data_start_end[1]
  expect_error({
    converted <- ConvertModelList(fit)
  }, "solver reported segment less than 1 base")
})

data(peak1.infeasible)

test_that("peak1 infeasible does not corrupt bigger models", {
  for(bp in 2:7){
    fit <- PeakSegJointHeuristic(peak1.infeasible, bp)
    peak.models <- fit$models[-1]
    peak.mat <- sapply(peak.models, "[[", "peak_start_end")
    peak.ok <- peak.mat[1,] < peak.mat[2,]
    loss <- sapply(peak.models, "[[", "loss")
    infinite.loss <- loss == Inf
    is.ok <- infinite.loss | peak.ok
    expect_true(all(is.ok))
  }
})

test_that("Several detects peaks", {
  fit <- PeakSegJointSeveral(peak1.infeasible)
  peak.models <- fit$models[-1]
  loss <- sapply(peak.models, "[[", "loss")
  expect_true(all(is.finite(loss)))
})

data(H3K4me3.PGP.immune.chunk2)

test_that("all samples with peaks can not be selected", {
  for(bp in c(2, 5)){
    fit <- PeakSegJointHeuristic(H3K4me3.PGP.immune.chunk2, bp)
    converted <- ConvertModelList(fit)
    segs27 <- subset(converted$segments, peaks == max(peaks))
    breaks27 <- subset(segs27, min(chromStart) < chromStart)
    ggplot()+
      theme_bw()+
      theme(panel.margin=grid::unit(0, "cm"))+
      facet_grid(sample.id ~ ., labeller=function(var, val){
        sub("McGill0", "", sub(" ", "\n", val))
      }, scales="free")+
      geom_step(aes(chromStart/1e3, count),
                data=H3K4me3.PGP.immune.chunk2,
                color="grey50")+
      geom_segment(aes(chromStart/1e3, mean,
                       xend=chromEnd/1e3, yend=mean),
                   data=segs27,
                   color="green",
                   size=1)+
      geom_vline(aes(xintercept=chromStart/1e3),
                 data=breaks27,
                 color="green",
                 linetype="dashed")
    
    ggplot()+
      geom_segment(aes(chromStart/1e3, peaks,
                       xend=chromEnd/1e3, yend=peaks),
                   data=converted$peaks)+
      scale_y_continuous(limits=c(1, 27), breaks=c(1, 27))
    loss <- converted$loss
    loss$cummin <- cummin(loss$loss)
    ## TDH 24 July 2015 do not allow infeasible models.
    ##with(loss, expect_equal(loss, cummin))
    ##exact <- with(loss, exactModelSelection(loss, peaks, peaks))
    ##expect_true(27 %in% exact$peaks)
  }
})

data(H3K36me3.AM.immune.chunk21)

test_that("21 peak loss < 20 peak loss", {
  fit <- PeakSegJointHeuristicStep2(H3K36me3.AM.immune.chunk21)
  converted <- ConvertModelList(fit)
  with(converted$loss, {
    expect_that(loss[22], is_less_than(loss[21]))
  })

  segs <- subset(converted$segments, peaks %in% c(20, 21))
  segs$peaks.str <- paste(segs$peaks)

  ggplot()+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., labeller=function(var, val){
      sub("McGill0", "", sub(" ", "\n", val))
    }, scales="free")+
    geom_step(aes(chromStart/1e3, count),
              data=H3K36me3.AM.immune.chunk21,
              color="grey50")+
    geom_segment(aes(chromStart/1e3, mean,
                     xend=chromEnd/1e3, yend=mean,
                     size=peaks.str,
                     color=peaks.str),
                 data=segs)+
    scale_size_manual(values=c("21"=1, "20"=2))
  
  ## Begin R implementation of multiple sample constrained
  ## segmentation heuristic. Input: profiles data.frame.
  profiles <- H3K36me3.AM.immune.chunk21
  unfilled.profile.list <- split(profiles, profiles$sample.id, drop=TRUE)
  unfilled.chromStart <- min(sapply(unfilled.profile.list, with, chromStart[1]))
  unfilled.chromEnd <-
    max(sapply(unfilled.profile.list, with, chromEnd[length(chromEnd)]))
  unfilled.bases <- unfilled.chromEnd-unfilled.chromStart
  bin.factor <- 2L
  bases.per.bin <- 1L
  while(unfilled.bases/bases.per.bin/bin.factor >= 4){
    bases.per.bin <- bases.per.bin * bin.factor
  }
  n.bins <- as.integer(unfilled.bases %/% bases.per.bin + 1L)
  expect_equal(fit$n_bins, n.bins)
  expect_equal(fit$bases_per_bin, bases.per.bin)
  expect_equal(fit$bin_factor, bin.factor)
  R.data.start.end <- c(unfilled.chromStart, unfilled.chromEnd)
  expect_equal(fit$data_start_end, R.data.start.end)

  extra.bases <- n.bins * bases.per.bin - unfilled.bases
  extra.before <- as.integer(extra.bases/2)
  extra.after <- extra.bases - extra.before
  max.chromStart <- unfilled.chromStart-extra.before
  min.chromEnd <- unfilled.chromEnd + extra.after
  profile.list <- list()
  for(sample.id in names(unfilled.profile.list)){
    one.sample <- 
      subset(unfilled.profile.list[[sample.id]],
             unfilled.chromStart < chromEnd &
               chromStart < unfilled.chromEnd)
    profile.list[[sample.id]] <- one.sample
  }
  bases <- min.chromEnd-max.chromStart
  ## End pre-processing to add zeros.

  expect_identical(fit$bin_start_end[1], max.chromStart)
  expect_identical(fit$bin_start_end[2], min.chromEnd)
  
  ## Small bins are just for testing the computation of the loss
  ## function in the R implementation, and should not be ported to C
  ## code.
  n.samples <- length(profile.list)
  small.chromEnd <- (max.chromStart+1):min.chromEnd
  small.bins <-
    matrix(NA, bases, n.samples,
           dimnames=list(position=small.chromEnd,
             sample.id=names(profile.list)))
  for(sample.id in names(profile.list)){
    one <- profile.list[[sample.id]]
    bins <- binSum(one, max.chromStart, n.bins=bases, empty.as.zero=TRUE)
    stopifnot(bins$chromEnd == small.chromEnd)
    small.bins[, sample.id] <- bins$count
  }

  na.mat <- 
    matrix(NA, n.bins, n.samples,
           dimnames=list(bin=NULL, sample.id=names(profile.list)))
  first.cumsums <- list(count=na.mat)
  bin.list <- list()
  norm.list <- list()
  for(sample.i in seq_along(profile.list)){
    sample.id <- names(profile.list)[sample.i]
    one <- profile.list[[sample.i]]
    max.count <- max(one$count)
    bins <- binSum(one, max.chromStart, bases.per.bin, n.bins)
    stopifnot(n.bins == nrow(bins))
    bins$mean <- with(bins, count/(chromEnd-chromStart))
    bins$mean.norm <- bins$mean/max.count
    bin.list[[sample.id]] <- data.frame(sample.id, rbind(bins, NA))
    bases.vec <- with(bins, chromEnd-chromStart)
    stopifnot(bins$count >= 0)
    first.cumsums$count[, sample.i] <- cumsum(bins$count)
    one$count.norm <- one$count/max.count
    norm.list[[sample.i]] <- one
  }
  bin.df <- do.call(rbind, bin.list)
  norm.df <- do.call(rbind, norm.list)

  ggplot()+
    scale_color_manual(values=c(data="grey50",
                         bins="black", segments="green"))+
    geom_step(aes(chromStart/1e3, count, color=what),
              data=data.frame(norm.df, what="data"))+
    geom_segment(aes(chromStart/1e3, mean,
                     xend=chromEnd/1e3, yend=mean,
                     color=what),
                 data=data.frame(bin.df, what="bins"))+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
      sub("McGill0", "", val)
    })
  
  ## The formula for the optimal Poisson loss 
  ## for 1 segment with d integer data points x_j is
  ## \sum_{j=1}^d m - x_j \log m_j =
  ##   ( \sum_{j=1}^d x_j ) (1-\log m)
  ## where the segment mean m = (\sum x_j)/d,
  OptimalPoissonLoss <- function(mean.value, cumsum.value){
    ifelse(mean.value == 0, 0, cumsum.value * (1-log(mean.value)))
  }
  loss.list <- list()
  peak.list <- list()
  seg.list <- list()
  best.loss.list <- list()
  flat.cumsums <- first.cumsums$count[n.bins, ]
  flat.means <- flat.cumsums/unfilled.bases

  expect_equal(fit$sample_mean_vec, as.numeric(flat.means))
  
  flat.loss.vec <- OptimalPoissonLoss(flat.means, flat.cumsums)
  best.loss.list[["0"]] <- sum(flat.loss.vec)
  for(seg1.last in 1:(n.bins-2)){
    seg1.cumsums <- first.cumsums$count[seg1.last, ]
    seg1.bases <- seg1.last*bases.per.bin
    seg1.chromEnd <- seg1.bases + max.chromStart
    seg1.corrected <- seg1.bases - extra.before
    seg1.means <- seg1.cumsums/seg1.corrected
    seg1.loss.vec <- OptimalPoissonLoss(seg1.means, seg1.cumsums)
    seg1.loss <- sum(seg1.loss.vec)
    for(seg2.last in (seg1.last+1):(n.bins-1)){
      cumsum.seg2.end <- first.cumsums$count[seg2.last, ]
      seg2.cumsums <- cumsum.seg2.end-seg1.cumsums
      seg12.bases <- seg2.last*bases.per.bin
      seg2.bases <- seg12.bases - seg1.bases
      seg2.chromEnd <- seg1.chromEnd + seg2.bases
      seg2.means <- seg2.cumsums/seg2.bases
      seg2.loss.vec <- OptimalPoissonLoss(seg2.means, seg2.cumsums)
      seg2.loss <- sum(seg2.loss.vec)
      
      seg3.cumsums <- first.cumsums$count[n.bins, ]-cumsum.seg2.end
      seg3.bases <- bases - seg12.bases
      seg3.corrected <- seg3.bases - extra.after
      seg3.means <- seg3.cumsums/seg3.corrected

      mean.mat <- rbind(seg1.means, seg2.means, seg3.means)

      this.seg.list <- list()
      for(sample.id in colnames(mean.mat)){
        this.seg.list[[sample.id]] <-
          data.frame(sample.id,
                     chromStart=c(max.chromStart, seg1.chromEnd, seg2.chromEnd,
                                  max.chromStart),
                     chromEnd=c(seg1.chromEnd, seg2.chromEnd, min.chromEnd,
                                min.chromEnd),
                     mean=c(mean.mat[, sample.id], flat.means[[sample.id]]),
                     segments=c(3, 3, 3, 1))
      }
      these.segs <- do.call(rbind, this.seg.list)
      ggplot()+
        scale_color_manual(values=c(data="grey50",
                             bins="black", segments="green"))+
        geom_step(aes(chromStart/1e3, count, color=what),
                  data=data.frame(norm.df, what="data"))+
        geom_segment(aes(chromStart/1e3, mean,
                         xend=chromEnd/1e3, yend=mean,
                         color=what),
                     size=1,
                     data=data.frame(these.segs, what="segments"))+
        geom_segment(aes(chromStart/1e3, mean,
                         xend=chromEnd/1e3, yend=mean,
                         color=what),
                     data=data.frame(bin.df, what="bins"))+
        theme_bw()+
        theme(panel.margin=grid::unit(0, "cm"))+
        facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
          sub("McGill0", "", val)
        })
      
      seg3.loss.vec <- OptimalPoissonLoss(seg3.means, seg3.cumsums)
      seg3.loss <- sum(seg3.loss.vec)
      seg123.loss.vec <- seg1.loss.vec + seg2.loss.vec + seg3.loss.vec

      peak.feasible <- seg1.means < seg2.means & seg2.means > seg3.means
      if(any(peak.feasible)){
        diff.loss.vec <- flat.loss.vec - seg123.loss.vec
        possible.df <- 
          data.frame(flat.loss.vec, seg123.loss.vec,
                     diff.loss.vec, peak.feasible)
        feasible.df <- subset(possible.df, peak.feasible)
        ordered.df <- 
          feasible.df[order(feasible.df$diff.loss.vec, decreasing = TRUE), ]
        for(peaks in 1:nrow(ordered.df)){
          with.peaks <- ordered.df[1:peaks, ]
          with.segs <-
            subset(these.segs,
                   sample.id %in% rownames(with.peaks) & segments==3)
          without.segs <-
            subset(these.segs,
                   !sample.id %in% rownames(with.peaks) & segments==1)
          without.peaks <-
            possible.df[!rownames(possible.df) %in% rownames(with.peaks),]
          with.loss <- with.peaks$seg123.loss.vec
          without.loss <- without.peaks$flat.loss.vec
          total.loss <- sum(with.loss, without.loss)
          loss.list[[paste(peaks)]][[paste(seg1.last, seg2.last)]] <-
            data.frame(seg1.last, seg2.last, peaks, total.loss)
          peak.list[[paste(peaks)]][[paste(seg1.last, seg2.last)]] <- 
            data.frame(sample.id=rownames(with.peaks),
                       chromStart=seg1.last*bases.per.bin+max.chromStart,
                       chromEnd=seg2.last*bases.per.bin+max.chromStart)
          seg.list[[paste(peaks)]][[paste(seg1.last, seg2.last)]] <-
            rbind(without.segs, with.segs)
        }#peaks
      }#any(peak.feasible)
    }#seg2.last
  }#seg1.last
  best.seg.list <- list()
  best.peak.list <- list()
  best.indices.list <- list()
  for(peaks.str in names(loss.list)){
    loss.df <- do.call(rbind, loss.list[[peaks.str]])
    loss.best <- loss.df[which.min(loss.df$total.loss), ]
    best.indices.list[[peaks.str]] <- loss.best
    last.str <- with(loss.best, paste(seg1.last, seg2.last))
    peaks <- as.numeric(peaks.str)
    model.i <- peaks + 1
    model <- fit$models[[model.i]]
    
    peak.df <- peak.list[[peaks.str]][[last.str]]

    sample.i <- model$samples_with_peaks_vec + 1
    C.sample.id <- fit$sample.id[sample.i]
    C.sample.sorted <- sort(paste(C.sample.id))
    R.sample.sorted <- sort(paste(peak.df$sample.id))
    expect_identical(C.sample.sorted, R.sample.sorted)

    seg.df <- seg.list[[peaks.str]][[last.str]]

    ggplot()+
      ggtitle(paste0("best model with ", peaks,
                    " peak", ifelse(peaks==1, "", "s")))+
      scale_color_manual(values=c(data="grey50",
                           bins="black", segments="green"))+
      geom_step(aes(chromStart/1e3, count, color=what),
                data=data.frame(norm.df, what="data"))+
      geom_segment(aes(chromStart/1e3, mean,
                       xend=chromEnd/1e3, yend=mean,
                       color=what),
                   size=1,
                   data=data.frame(seg.df, what="segments"))+
      geom_segment(aes(chromStart/1e3, mean,
                       xend=chromEnd/1e3, yend=mean,
                       color=what),
                   data=data.frame(bin.df, what="bins"))+
      theme_bw()+
      theme(panel.margin=grid::unit(0, "cm"))+
      facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
        sub("McGill0", "", val)
      })

    best.seg.list[[peaks.str]] <- data.frame(peaks, seg.df)
    best.peak.list[[peaks.str]] <-
      data.frame(peaks, y=peaks*-0.1, peak.df)
    best.loss.list[[peaks.str]] <- loss.best$total.loss
  }
  
  best.peaks <- do.call(rbind, best.peak.list)
  by.sample.loc <-
    split(best.peaks, with(best.peaks, paste(sample.id, chromStart, chromEnd)))
  short.label.list <- list()
  for(sample.loc.name in names(by.sample.loc)){
    sample.loc <- by.sample.loc[[sample.loc.name]]
    peaks.txt <- paste(sample.loc$peaks, collapse=",")
    short.label.list[[sample.loc.name]] <-
      data.frame(sample.loc[1,], peaks.txt)
  }
  short.labels <- do.call(rbind, short.label.list)
  best.peaks$sample.id <- factor(best.peaks$sample.id, names(profile.list))
  sample.counts <- table(best.peaks$sample.id)
  dftype <- function(what, df){
    df$sample.id <- factor(df$sample.id, names(sample.counts))
    data.frame(what, df)
  }
  ggplot()+
    scale_color_manual(values=c(data="grey50",
                         bins="black", peaks="deepskyblue"))+
    geom_step(aes(chromStart/1e3, count.norm, color=what),
              data=dftype("data", norm.df))+
    geom_segment(aes(chromStart/1e3, y,
                     xend=chromEnd/1e3, yend=y,
                     color=what),
                 size=1,
                 data=dftype("peaks", best.peaks))+
    geom_text(aes(chromStart/1e3, y,
                     label=paste0(peaks, " peak",
                       ifelse(peaks==1, "", "s"), " "),
                     color=what),
              hjust=1,
              size=3,
              vjust=0.5,
              data=dftype("peaks", best.peaks))+
    geom_segment(aes(chromStart/1e3, mean.norm,
                     xend=chromEnd/1e3, yend=mean.norm,
                     color=what),
                 data=dftype("bins", bin.df))+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
      sub("McGill0", "", val)
    })
  ## for 0, 1, ..., maxPeaks, run the bin pyramid grid search,
  ## around the peaks found in this first step.
  library(PeakError)
  zoom.peak.list <- list("0"=Peaks())
  zoom.loss.list <-
    list("0"=data.frame(peaks=0, loss=sum(flat.loss.vec)))
  peaks.str <- "21"  
  for(peaks.str in names(best.indices.list)){
    loss.best <- best.indices.list[[peaks.str]]
    best.peak.df <- best.peak.list[[peaks.str]]
    samples.with.peaks <- paste(best.peak.df$sample.id)
    flat.without.peaks <- sum(flat.loss.vec)-
      sum(flat.loss.vec[samples.with.peaks])
    sub.norm.df <- subset(norm.df, sample.id %in% samples.with.peaks)
    sub.bin.df <- subset(bin.df, sample.id %in% samples.with.peaks)
    n.samples <- length(samples.with.peaks)
    last.cumsums <- list()
    before.cumsums <- list(left=list(), right=list())
    for(data.type in names(first.cumsums)){
      data.mat <- first.cumsums[[data.type]]
      last.cumsums[[data.type]] <-
        data.mat[nrow(data.mat),][samples.with.peaks]
      before.cumsums$left[[data.type]] <- if(loss.best$seg1.last == 1){
        structure(rep(0, n.samples), names=samples.with.peaks)
      }else{
        data.mat[loss.best$seg1.last-1,][samples.with.peaks]
      }
      before.cumsums$right[[data.type]] <-
        data.mat[loss.best$seg2.last-1,][samples.with.peaks]
    }
    last.chromEnd <- min.chromEnd
    n.bins.zoom <- bin.factor * 2L
    n.cumsum <- n.bins.zoom + 1L

    ## These will change at the end of each iteration.
    peakStart <- best.peak.df$chromStart[1]
    peakEnd <- best.peak.df$chromEnd[1]
    search.list <- list()
    best.list <- list()
    bases.per.bin.zoom <- bases.per.bin
    while(bases.per.bin.zoom > 1){
      left.chromStart <- peakStart - bases.per.bin.zoom
      right.chromStart <- peakEnd-bases.per.bin.zoom
      bases.per.bin.zoom <- as.integer(bases.per.bin.zoom / bin.factor)

      cumsum.list <- function(chromStart){
        limits <- bases.per.bin.zoom*(0:(n.cumsum-1))+chromStart
        chromStart.vec <- limits[-length(limits)]
        chromEnd.vec <- limits[-1]
        intervals <-
          paste0(chromStart.vec, "-", chromEnd.vec)
        dn <- list(bin=c("before", intervals), sample.id=samples.with.peaks)
        m <- matrix(NA, n.cumsum, n.samples, dimnames=dn)
        list(count=m, 
             chromStart=chromStart.vec, chromEnd=chromEnd.vec)
      }
      cumsum.mats <-
        list(left=cumsum.list(left.chromStart),
             right=cumsum.list(right.chromStart))

      for(sample.i in seq_along(samples.with.peaks)){
        sample.id <- samples.with.peaks[[sample.i]]
        one <- profile.list[[sample.id]]
        lr.list <-
          list(left=binSum(one, left.chromStart,
                 bases.per.bin.zoom, n.bins.zoom),
               right=binSum(one, right.chromStart,
                 bases.per.bin.zoom, n.bins.zoom,
                 empty.as.zero=TRUE))
        for(lr in names(lr.list)){
          lr.bins <- lr.list[[lr]]
          stopifnot(nrow(lr.bins) == n.bins.zoom)
          lr.bases <- with(lr.bins, chromEnd-chromStart)
          lr.before <- before.cumsums[[lr]]
          lr.counts <- list(count=lr.bins$count)
          for(data.type in names(lr.counts)){
            lr.count.vec <-
              c(lr.before[[data.type]][[sample.id]], lr.counts[[data.type]])
            cumsum.mats[[lr]][[data.type]][, sample.id] <-
              cumsum(lr.count.vec)
          }
        }
      }
      left.mat <- t(cumsum.mats$left$count[-1,])
      colnames(left.mat) <- NULL
      right.mat <- t(cumsum.mats$right$count[-1,])
      colnames(right.mat) <- NULL
      ## print(bases.per.bin.zoom)
      ## print(left.mat)
      ## print(right.mat)
      ## print(as.integer(cumsum.mats$right$count[1,]))
      possible.grid <- 
        expand.grid(left.cumsum.row=3:n.cumsum, right.cumsum.row=2:n.cumsum)
      possible.grid$left.chromStart <-
        cumsum.mats$left$chromStart[possible.grid$left.cumsum.row-1]
      possible.grid$left.chromEnd <-
        cumsum.mats$left$chromEnd[possible.grid$left.cumsum.row-1]
      possible.grid$right.chromStart <-
        cumsum.mats$right$chromStart[possible.grid$right.cumsum.row-1]
      possible.grid$right.chromEnd <-
        cumsum.mats$right$chromEnd[possible.grid$right.cumsum.row-1]
      feasible.grid <-
        subset(possible.grid,
               left.chromEnd <= right.chromStart &
                 right.chromEnd < last.chromEnd)
      feasible.grid$model.i <- 1:nrow(feasible.grid)
      model.list <- list()
      seg.list <- list()
      sample.loss.list <- list()
      for(model.i in feasible.grid$model.i){
        model.row <- feasible.grid[model.i, ]

        seg1.i <- model.row$left.cumsum.row-1
        seg1.cumsums <- cumsum.mats$left$count[seg1.i, ]
        seg1.chromEnd <- cumsum.mats$left$chromStart[seg1.i]
        seg1.bases <- seg1.chromEnd-max.chromStart
        seg1.corrected <- seg1.bases - extra.before
        seg1.means <- seg1.cumsums/seg1.corrected
        seg1.loss <- OptimalPoissonLoss(seg1.means, seg1.cumsums)
        seg.list[[paste(model.i, 1)]] <-
          data.frame(chromStart=max.chromStart, chromEnd=seg1.chromEnd,
                     mean=seg1.means, sample.id=samples.with.peaks,
                     model.i,
                     row.names=NULL)

        seg1.mat <-
          small.bins[small.chromEnd <= seg1.chromEnd,
                     samples.with.peaks,
                     drop=FALSE]
        stopifnot(nrow(seg1.mat) == seg1.bases)
        ## stopifnot(all.equal(as.numeric(colMeans(seg1.mat)),
        ##                     as.numeric(seg1.means)))

        cumsum.seg2.end <-
          cumsum.mats$right$count[model.row$right.cumsum.row, ]
        seg2.cumsums <- cumsum.seg2.end-seg1.cumsums
        seg2.chromEnd <-
          cumsum.mats$right$chromEnd[model.row$right.cumsum.row-1]
        seg2.bases <- seg2.chromEnd-seg1.chromEnd
        seg2.means <- seg2.cumsums/seg2.bases
        seg2.loss <- OptimalPoissonLoss(seg2.means, seg2.cumsums)
        seg.list[[paste(model.i, 2)]] <-
          data.frame(chromStart=seg1.chromEnd, chromEnd=seg2.chromEnd,
                     mean=seg2.means, sample.id=samples.with.peaks,
                     model.i,
                     row.names=NULL)

        is.seg2 <-
          seg1.chromEnd < small.chromEnd &
            small.chromEnd <= seg2.chromEnd
        stopifnot(sum(is.seg2) == seg2.bases)
        seg2.mat <-
          small.bins[is.seg2,
                     samples.with.peaks,
                     drop=FALSE]
        stopifnot(all.equal(as.numeric(colMeans(seg2.mat)),
                            as.numeric(seg2.means)))
        
        seg3.cumsums <- last.cumsums$count-cumsum.seg2.end
        seg3.bases <- last.chromEnd-seg2.chromEnd
        seg3.corrected <- seg3.bases - extra.after
        seg3.means <- seg3.cumsums/seg3.corrected
        seg3.loss <- OptimalPoissonLoss(seg3.means, seg3.cumsums)
        seg.list[[paste(model.i, 3)]] <-
          data.frame(chromStart=seg2.chromEnd, chromEnd=last.chromEnd,
                     mean=seg3.means, sample.id=samples.with.peaks,
                     model.i,
                     row.names=NULL)

        is.seg3 <-
          seg2.chromEnd < small.chromEnd &
            small.chromEnd <= last.chromEnd
        stopifnot(sum(is.seg3) == seg3.bases)
        seg3.mat <-
          small.bins[is.seg3,
                     samples.with.peaks,
                     drop=FALSE]
        ## stopifnot(all.equal(as.numeric(colMeans(seg3.mat)),
        ##                     as.numeric(seg3.means)))

        total.bases <- sum(seg1.bases + seg2.bases + seg3.bases)
        stopifnot(all.equal(total.bases, last.chromEnd - max.chromStart))
        
        total.loss <- sum(seg1.loss + seg2.loss + seg3.loss)
        with.without.loss <- flat.without.peaks + total.loss
        total.loss.vec <- seg1.loss+seg2.loss+seg3.loss
        feasible.vec <- seg1.means < seg2.means & seg2.means > seg3.means
        model.list[[model.i]] <-
          data.frame(model.row, total.loss, with.without.loss,
                     feasible=all(feasible.vec))
        sample.loss.list[[model.i]] <-
          data.frame(sample.id=samples.with.peaks, loss=total.loss.vec)
      }
      ## Plot the segment means as a reality check.
      seg.df <- do.call(rbind, seg.list)
      segPlot <- 
      ggplot()+
        geom_step(aes(chromStart/1e3, count),
                  data=data.frame(sub.norm.df, what="data"),
                  color="grey")+
        geom_segment(aes(chromStart/1e3, mean,
                         xend=chromEnd/1e3, yend=mean),
                     data=data.frame(seg.df, what="models"),
                     size=1, color="green")+
        theme_bw()+
        theme(panel.margin=grid::unit(0, "cm"))+
        facet_grid(sample.id ~ model.i, scales="free")

      ## Then plot the peaks only, colored by total cost of the model.
      model.df <- do.call(rbind, model.list)
      model.df$y <- with(model.df, model.i/max(model.i))
      feasible.models <- subset(model.df, feasible)
      best.model <- feasible.models[which.min(feasible.models$total.loss), ]

      ggplot()+
        coord_equal()+
        geom_point(aes(with.without.loss, total.loss),
                   data=model.df)

      binPlot <- 
      ggplot()+
        xlab("position on chromosome (kilobases = kb)")+
        scale_y_continuous("", breaks=NULL)+
        geom_step(aes(chromStart/1e3, count.norm),
                  data=data.frame(sub.norm.df, what="data"),
                  color="grey")+
        geom_segment(aes(chromStart/1e3, mean.norm,
                         xend=chromEnd/1e3, yend=mean.norm),
                     data=data.frame(sub.bin.df, what="bins"),
                     color="black")+
        geom_segment(aes(peakStart/1e3, 0,
                         xend=peakEnd/1e3, yend=0),
                     data=data.frame(loss.best, what="peak"),
                     color="green")+
        scale_linetype_manual(values=c("TRUE"=1, "FALSE"=2))+
        geom_segment(aes(left.chromStart/1e3, y,
                         color=total.loss,
                         linetype=feasible,
                         xend=right.chromEnd/1e3, yend=y),
                     data=data.frame(model.df, sample.id="peaks"),
                     size=1)+
        geom_text(aes(left.chromStart/1e3, y,
                      label="optimal "),
                  data=data.frame(best.model, sample.id="peaks"),
                  hjust=1)+
        theme_bw()+
        theme(panel.margin=grid::unit(0, "cm"))+
        facet_grid(sample.id ~ .)

      search.list[[paste(bases.per.bin.zoom)]] <-
        data.frame(bases.per.bin.zoom, model.df)
      best.list[[paste(bases.per.bin.zoom)]] <-
        data.frame(bases.per.bin.zoom, best.model)

      peakStart <- best.model$left.chromStart
      peakEnd <- best.model$right.chromEnd
      ## cat(sprintf("\nn_peaks=%s bases_per_bin=%d [%d,%d] loss=%15.6f\n",
      ##             peaks.str, bases.per.bin.zoom,
      ##             peakStart, peakEnd, best.model$total.loss))
      before.i.list <-
        list(left=best.model$left.cumsum.row-2,
             right=best.model$right.cumsum.row-1)
      for(lr in names(before.i.list)){
        before.i <- before.i.list[[lr]]
        mats <- cumsum.mats[[lr]]
        before.cumsums[[lr]]$count <-
          structure(mats$count[before.i,],
                    names=colnames(mats$count))
      }
    }##while(bases.per.bin.zoom)
    samplefac <- function(L){
      df <- do.call(rbind, L)
      sample.ids <- unique(norm.df$sample.id)
      bases.num <- sort(unique(df$bases.per.bin.zoom), decreasing=TRUE)
      levs <- c(paste(sample.ids), paste(bases.num))
      df$sample.id <- factor(df$bases.per.bin.zoom, levs)
      df
    }
    search.df <- samplefac(search.list)
    best.df <- samplefac(best.list)
    C.cols <-
      c("bases.per.bin.zoom", "left.chromStart",
        "right.chromEnd", "with.without.loss")
    best.df[, C.cols]
    searchPlot <- 
      ggplot()+
        xlab("position on chromosome (kilobases = kb)")+
        scale_y_continuous("", breaks=NULL)+
        geom_step(aes(chromStart/1e3, count.norm),
                  data=data.frame(sub.norm.df, what="data"),
                  color="grey")+
        geom_segment(aes(chromStart/1e3, mean.norm,
                         xend=chromEnd/1e3, yend=mean.norm),
                     data=data.frame(sub.bin.df, what="bins"),
                     color="black")+
        geom_segment(aes(left.chromStart/1e3, y,
                         color=total.loss,
                         xend=right.chromEnd/1e3, yend=y),
                     data=search.df,
                     size=1)+
        geom_text(aes(left.chromStart/1e3, y,
                      label="selected"),
                  data=best.df,
                  hjust=1)+
        theme_bw()+
        theme(panel.margin=grid::unit(0, "cm"))+
        facet_grid(sample.id ~ ., labeller=function(var, val){
          sub("McGill0", "", val)
        })
    ## Check R=C implementation.
    peaks <- as.numeric(peaks.str)
    model.i <- peaks + 1
    model <- fit$models[[model.i]]
    sample.i <- model$samples_with_peaks_vec + 1
    C.sample.id <- fit$sample.id[sample.i]
    for(seg.i in 1:3){
      model.name <- paste(best.model$model.i, seg.i)
      seg.i.df <- seg.list[[model.name]]
      rownames(seg.i.df) <- seg.i.df$sample.id
      R.mean.vec <- as.numeric(seg.i.df[C.sample.id, "mean"])
      C.mean.vec <- model[[sprintf("seg%d_mean_vec", seg.i)]]
      expect_equal(C.mean.vec, R.mean.vec)
    }
    C.start.end <- model$peak_start_end
    R.start.end <- c(peakStart, peakEnd)
    expect_identical(C.start.end, R.start.end)
    
    loss.with.peaks <- sample.loss.list[[best.model$model.i]]
    samples.without.peaks <-
      names(profile.list)[!names(profile.list) %in% samples.with.peaks]
    loss.without.peaks <-
      data.frame(sample.id=samples.without.peaks,
                 loss=flat.loss.vec[samples.without.peaks])
    sample.loss.df <- rbind(loss.with.peaks, loss.without.peaks)
    peaks <- as.numeric(peaks.str)
    zoom.loss.list[[peaks.str]] <- 
      data.frame(peaks, loss=sum(sample.loss.df$loss))
    zoom.peak.list[[peaks.str]] <-
      data.frame(peaks, sample.id=samples.with.peaks,
                 chromStart=peakStart, chromEnd=peakEnd)
  }#peaks.str
  zoom.peaks <- do.call(rbind, zoom.peak.list)
  zoom.loss <- do.call(rbind, zoom.loss.list)
  R.loss.vec <- as.numeric(zoom.loss$loss)
  C.loss.vec <- sapply(fit$models, "[[", "loss")
  expect_equal(C.loss.vec, R.loss.vec)
  ggplot()+
    scale_color_manual(values=c(data="grey50",
                         bins="black", peaks="deepskyblue"))+
    geom_step(aes(chromStart/1e3, count.norm, color=what),
              data=dftype("data", norm.df))+
    geom_segment(aes(chromStart/1e3, -peaks*0.1,
                     xend=chromEnd/1e3, yend=-peaks*0.1,
                     color=what),
                 size=1,
                 data=dftype("peaks", zoom.peaks))+
    geom_text(aes(chromStart/1e3, -peaks*0.1,
                  label=paste0(peaks,
                    " peak",
                    ifelse(peaks==1, "", "s"),
                    " "),
                  color=what),
              hjust=1,
              vjust=0.5,
              size=3,
              data=dftype("peaks", zoom.peaks))+
    geom_segment(aes(chromStart/1e3, mean.norm,
                     xend=chromEnd/1e3, yend=mean.norm,
                     color=what),
                 data=dftype("bins", bin.df))+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
      sub("McGill0", "", val)
    })
  ggplot()+
    geom_segment(aes(chromStart/1e3, peaks,
                     xend=chromEnd/1e3, yend=peaks),
                 data=zoom.peaks)
  ggplot(zoom.loss, aes(peaks, loss))+
    geom_point()+
    geom_line()
})

test_that("Step1 C result agrees with R", {
  profiles <- H3K36me3.AM.immune.chunk21
  fit <- PeakSegJointHeuristicStep1(profiles)
  unfilled.profile.list <- split(profiles, profiles$sample.id, drop=TRUE)
  unfilled.chromStart <- min(sapply(unfilled.profile.list, with, chromStart[1]))
  unfilled.chromEnd <-
    max(sapply(unfilled.profile.list, with, chromEnd[length(chromEnd)]))
  unfilled.bases <- unfilled.chromEnd-unfilled.chromStart
  bin.factor <- 2L
  bases.per.bin <- 1L
  while(unfilled.bases/bases.per.bin/bin.factor >= 4){
    bases.per.bin <- bases.per.bin * bin.factor
  }
  n.bins <- as.integer(unfilled.bases %/% bases.per.bin + 1L)
  expect_equal(fit$n_bins, n.bins)
  expect_equal(fit$bases_per_bin, bases.per.bin)
  expect_equal(fit$bin_factor, bin.factor)
  R.data.start.end <- c(unfilled.chromStart, unfilled.chromEnd)
  expect_equal(fit$data_start_end, R.data.start.end)

  extra.bases <- n.bins * bases.per.bin - unfilled.bases
  extra.before <- as.integer(extra.bases/2)
  extra.after <- extra.bases - extra.before
  max.chromStart <- unfilled.chromStart-extra.before
  min.chromEnd <- unfilled.chromEnd + extra.after
  profile.list <- list()
  for(sample.id in names(unfilled.profile.list)){
    one.sample <- 
      subset(unfilled.profile.list[[sample.id]],
             unfilled.chromStart < chromEnd &
               chromStart < unfilled.chromEnd)
    profile.list[[sample.id]] <- one.sample
  }
  bases <- min.chromEnd-max.chromStart
  ## End pre-processing to add zeros.

  expect_identical(fit$bin_start_end[1], max.chromStart)
  expect_identical(fit$bin_start_end[2], min.chromEnd)
  
  ## Small bins are just for testing the computation of the loss
  ## function in the R implementation, and should not be ported to C
  ## code.
  n.samples <- length(profile.list)
  small.chromEnd <- (max.chromStart+1):min.chromEnd
  small.bins <-
    matrix(NA, bases, n.samples,
           dimnames=list(position=small.chromEnd,
             sample.id=names(profile.list)))
  for(sample.id in names(profile.list)){
    one <- profile.list[[sample.id]]
    bins <- binSum(one, max.chromStart, n.bins=bases, empty.as.zero=TRUE)
    stopifnot(bins$chromEnd == small.chromEnd)
    small.bins[, sample.id] <- bins$count
  }

  na.mat <- 
    matrix(NA, n.bins, n.samples,
           dimnames=list(bin=NULL, sample.id=names(profile.list)))
  first.cumsums <- list(count=na.mat)
  bin.list <- list()
  norm.list <- list()
  for(sample.i in seq_along(profile.list)){
    sample.id <- names(profile.list)[sample.i]
    one <- profile.list[[sample.i]]
    max.count <- max(one$count)
    bins <- binSum(one, max.chromStart, bases.per.bin, n.bins)
    stopifnot(n.bins == nrow(bins))
    bins$mean <- with(bins, count/(chromEnd-chromStart))
    bins$mean.norm <- bins$mean/max.count
    bin.list[[sample.id]] <- data.frame(sample.id, rbind(bins, NA))
    bases.vec <- with(bins, chromEnd-chromStart)
    stopifnot(bins$count >= 0)
    first.cumsums$count[, sample.i] <- cumsum(bins$count)
    one$count.norm <- one$count/max.count
    norm.list[[sample.i]] <- one
  }
  bin.df <- do.call(rbind, bin.list)
  norm.df <- do.call(rbind, norm.list)

  ggplot()+
    scale_color_manual(values=c(data="grey50",
                         bins="black", segments="green"))+
    geom_step(aes(chromStart/1e3, count, color=what),
              data=data.frame(norm.df, what="data"))+
    geom_segment(aes(chromStart/1e3, mean,
                     xend=chromEnd/1e3, yend=mean,
                     color=what),
                 data=data.frame(bin.df, what="bins"))+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
      sub("McGill0", "", val)
    })
  
  ## The formula for the optimal Poisson loss 
  ## for 1 segment with d integer data points x_j is
  ## \sum_{j=1}^d m - x_j \log m_j =
  ##   ( \sum_{j=1}^d x_j ) (1-\log m)
  ## where the segment mean m = (\sum x_j)/d,
  OptimalPoissonLoss <- function(mean.value, cumsum.value){
    ifelse(mean.value == 0, 0, cumsum.value * (1-log(mean.value)))
  }
  loss.list <- list()
  peak.list <- list()
  seg.list <- list()
  best.loss.list <- list()
  flat.cumsums <- first.cumsums$count[n.bins, ]
  flat.means <- flat.cumsums/unfilled.bases

  expect_equal(fit$sample_mean_vec, as.numeric(flat.means))
  
  flat.loss.vec <- OptimalPoissonLoss(flat.means, flat.cumsums)
  R.flat.loss <- as.numeric(flat.loss.vec[fit$sample.id])
  expect_equal(fit$flat_loss_vec, R.flat.loss)
  
  best.loss.list[["0"]] <- sum(flat.loss.vec)
  for(seg1.last in 1:(n.bins-2)){
    seg1.cumsums <- first.cumsums$count[seg1.last, ]
    seg1.bases <- seg1.last*bases.per.bin
    seg1.chromEnd <- seg1.bases + max.chromStart
    seg1.corrected <- seg1.bases - extra.before
    seg1.means <- seg1.cumsums/seg1.corrected
    seg1.loss.vec <- OptimalPoissonLoss(seg1.means, seg1.cumsums)
    seg1.loss <- sum(seg1.loss.vec)
    for(seg2.last in (seg1.last+1):(n.bins-1)){
      cumsum.seg2.end <- first.cumsums$count[seg2.last, ]
      seg2.cumsums <- cumsum.seg2.end-seg1.cumsums
      seg12.bases <- seg2.last*bases.per.bin
      seg2.bases <- seg12.bases - seg1.bases
      seg2.chromEnd <- seg1.chromEnd + seg2.bases
      seg2.means <- seg2.cumsums/seg2.bases
      seg2.loss.vec <- OptimalPoissonLoss(seg2.means, seg2.cumsums)
      seg2.loss <- sum(seg2.loss.vec)
      
      seg3.cumsums <- first.cumsums$count[n.bins, ]-cumsum.seg2.end
      seg3.bases <- bases - seg12.bases
      seg3.corrected <- seg3.bases - extra.after
      seg3.means <- seg3.cumsums/seg3.corrected

      mean.mat <- rbind(seg1.means, seg2.means, seg3.means)

      this.seg.list <- list()
      for(sample.id in colnames(mean.mat)){
        this.seg.list[[sample.id]] <-
          data.frame(sample.id,
                     chromStart=c(max.chromStart, seg1.chromEnd, seg2.chromEnd,
                                  max.chromStart),
                     chromEnd=c(seg1.chromEnd, seg2.chromEnd, min.chromEnd,
                                min.chromEnd),
                     mean=c(mean.mat[, sample.id], flat.means[[sample.id]]),
                     segments=c(3, 3, 3, 1))
      }
      these.segs <- do.call(rbind, this.seg.list)
      ggplot()+
        scale_color_manual(values=c(data="grey50",
                             bins="black", segments="green"))+
        geom_step(aes(chromStart/1e3, count, color=what),
                  data=data.frame(norm.df, what="data"))+
        geom_segment(aes(chromStart/1e3, mean,
                         xend=chromEnd/1e3, yend=mean,
                         color=what),
                     size=1,
                     data=data.frame(these.segs, what="segments"))+
        geom_segment(aes(chromStart/1e3, mean,
                         xend=chromEnd/1e3, yend=mean,
                         color=what),
                     data=data.frame(bin.df, what="bins"))+
        theme_bw()+
        theme(panel.margin=grid::unit(0, "cm"))+
        facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
          sub("McGill0", "", val)
        })
      
      seg3.loss.vec <- OptimalPoissonLoss(seg3.means, seg3.cumsums)
      seg3.loss <- sum(seg3.loss.vec)
      seg123.loss.vec <- seg1.loss.vec + seg2.loss.vec + seg3.loss.vec

      peak.feasible <- seg1.means < seg2.means & seg2.means > seg3.means
      if(any(peak.feasible)){
        diff.loss.vec <- flat.loss.vec - seg123.loss.vec
        possible.df <- 
          data.frame(flat.loss.vec, seg123.loss.vec,
                     diff.loss.vec, peak.feasible)
        feasible.df <- subset(possible.df, peak.feasible)
        ordered.df <- 
          feasible.df[order(feasible.df$diff.loss.vec, decreasing = TRUE), ]
        for(peaks in 1:nrow(ordered.df)){
          with.peaks <- ordered.df[1:peaks, ]
          with.segs <-
            subset(these.segs,
                   sample.id %in% rownames(with.peaks) & segments==3)
          without.segs <-
            subset(these.segs,
                   !sample.id %in% rownames(with.peaks) & segments==1)
          without.peaks <-
            possible.df[!rownames(possible.df) %in% rownames(with.peaks),]
          with.loss <- with.peaks$seg123.loss.vec
          without.loss <- without.peaks$flat.loss.vec
          total.loss <- sum(with.loss, without.loss)
          loss.list[[paste(peaks)]][[paste(seg1.last, seg2.last)]] <-
            data.frame(seg1.last, seg2.last, peaks, total.loss)
          peak.list[[paste(peaks)]][[paste(seg1.last, seg2.last)]] <- 
            data.frame(sample.id=rownames(with.peaks),
                       chromStart=seg1.last*bases.per.bin+max.chromStart,
                       chromEnd=seg2.last*bases.per.bin+max.chromStart)
          seg.list[[paste(peaks)]][[paste(seg1.last, seg2.last)]] <-
            rbind(without.segs, with.segs)
        }#peaks
      }#any(peak.feasible)
    }#seg2.last
  }#seg1.last
  best.seg.list <- list()
  best.peak.list <- list()
  best.indices.list <- list()
  for(peaks.str in names(loss.list)){
    loss.df <- do.call(rbind, loss.list[[peaks.str]])
    loss.best <- loss.df[which.min(loss.df$total.loss), ]
    best.indices.list[[peaks.str]] <- loss.best
    last.str <- with(loss.best, paste(seg1.last, seg2.last))
    peaks <- as.numeric(peaks.str)
    model.i <- peaks + 1
    model <- fit$models[[model.i]]
    
    peak.df <- peak.list[[peaks.str]][[last.str]]
    C.start.end <- model$peak_start_end
    R.start.end <- as.integer(peak.df[1, c("chromStart", "chromEnd")])
    expect_equal(C.start.end, R.start.end)

    sample.i <- model$samples_with_peaks_vec + 1
    C.sample.id <- fit$sample.id[sample.i]
    C.sample.sorted <- sort(paste(C.sample.id))
    R.sample.sorted <- sort(paste(peak.df$sample.id))
    expect_identical(C.sample.sorted, R.sample.sorted)

    seg.df <- seg.list[[peaks.str]][[last.str]]
    segs.by.chromStart <- split(seg.df, seg.df$chromStart)
    for(seg.i in seq_along(segs.by.chromStart)){
      seg.i.df <- segs.by.chromStart[[seg.i]]
      rownames(seg.i.df) <- seg.i.df$sample.id
      R.mean.vec <- as.numeric(seg.i.df[C.sample.id, "mean"])
      C.mean.vec <- model[[sprintf("seg%d_mean_vec", seg.i)]]
      expect_equal(C.mean.vec, R.mean.vec)
    }

    ggplot()+
      ggtitle(paste0("best model with ", peaks,
                    " peak", ifelse(peaks==1, "", "s")))+
      scale_color_manual(values=c(data="grey50",
                           bins="black", segments="green"))+
      geom_step(aes(chromStart/1e3, count, color=what),
                data=data.frame(norm.df, what="data"))+
      geom_segment(aes(chromStart/1e3, mean,
                       xend=chromEnd/1e3, yend=mean,
                       color=what),
                   size=1,
                   data=data.frame(seg.df, what="segments"))+
      geom_segment(aes(chromStart/1e3, mean,
                       xend=chromEnd/1e3, yend=mean,
                       color=what),
                   data=data.frame(bin.df, what="bins"))+
      theme_bw()+
      theme(panel.margin=grid::unit(0, "cm"))+
      facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
        sub("McGill0", "", val)
      })

    best.seg.list[[peaks.str]] <- data.frame(peaks, seg.df)
    best.peak.list[[peaks.str]] <-
      data.frame(peaks, y=peaks*-0.1, peak.df)
    best.loss.list[[peaks.str]] <- loss.best$total.loss
  }
  R.loss.vec <- as.numeric(best.loss.list)
  C.loss.vec <- sapply(fit$models, "[[", "loss")
  expect_equal(C.loss.vec, R.loss.vec)
  
  best.peaks <- do.call(rbind, best.peak.list)
  by.sample.loc <-
    split(best.peaks, with(best.peaks, paste(sample.id, chromStart, chromEnd)))
  short.label.list <- list()
  for(sample.loc.name in names(by.sample.loc)){
    sample.loc <- by.sample.loc[[sample.loc.name]]
    peaks.txt <- paste(sample.loc$peaks, collapse=",")
    short.label.list[[sample.loc.name]] <-
      data.frame(sample.loc[1,], peaks.txt)
  }
  short.labels <- do.call(rbind, short.label.list)
  best.peaks$sample.id <- factor(best.peaks$sample.id, names(profile.list))
  sample.counts <- table(best.peaks$sample.id)
  dftype <- function(what, df){
    df$sample.id <- factor(df$sample.id, names(sample.counts))
    data.frame(what, df)
  }
  ggplot()+
    scale_color_manual(values=c(data="grey50",
                         bins="black", peaks="deepskyblue"))+
    geom_step(aes(chromStart/1e3, count.norm, color=what),
              data=dftype("data", norm.df))+
    geom_segment(aes(chromStart/1e3, y,
                     xend=chromEnd/1e3, yend=y,
                     color=what),
                 size=1,
                 data=dftype("peaks", best.peaks))+
    geom_text(aes(chromStart/1e3, y,
                     label=paste0(peaks, " peak",
                       ifelse(peaks==1, "", "s"), " "),
                     color=what),
              hjust=1,
              size=3,
              vjust=0.5,
              data=dftype("peaks", best.peaks))+
    geom_segment(aes(chromStart/1e3, mean.norm,
                     xend=chromEnd/1e3, yend=mean.norm,
                     color=what),
                 data=dftype("bins", bin.df))+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
      sub("McGill0", "", val)
    })
  for(peaks.str in names(best.indices.list)){
    loss.best <- best.indices.list[[peaks.str]]
    best.peak.df <- best.peak.list[[peaks.str]]
    samples.with.peaks <- paste(best.peak.df$sample.id)
    n.samples <- length(samples.with.peaks)
    last.cumsums <- list()
    before.cumsums <- list(left=list(), right=list())
    for(data.type in names(first.cumsums)){
      data.mat <- first.cumsums[[data.type]]
      last.cumsums[[data.type]] <-
        data.mat[nrow(data.mat),][samples.with.peaks]
      before.cumsums$left[[data.type]] <- if(loss.best$seg1.last == 1){
        structure(rep(0, n.samples), names=samples.with.peaks)
      }else{
        data.mat[loss.best$seg1.last-1,][samples.with.peaks]
      }
      before.cumsums$right[[data.type]] <-
        data.mat[loss.best$seg2.last-1,][samples.with.peaks]
    }
    peaks <- as.integer(peaks.str)
    model.i <- peaks+1L
    model <- fit$models[[model.i]]
    C.sample.i <- model$samples_with_peaks_vec + 1L
    C.sample.id <- fit$sample.id[C.sample.i]
    for(lr in names(before.cumsums)){
      R.cumsums <- as.integer(before.cumsums[[lr]]$count[C.sample.id])
      C.cumsums <- model[[paste0(lr, "_cumsum_vec")]]
      expect_equal(C.cumsums, R.cumsums)
    }
  }
  R.last.cumsums <- as.integer(last.cumsums$count[fit$sample.id])
  C.last.cumsums <- fit$last_cumsum_vec
  expect_equal(C.last.cumsums, R.last.cumsums)
})

data(H3K4me3.TDH.other.chunk8)

test_that("8 peaks are feasible", {
  fit <- PeakSegJointHeuristicStep2(H3K4me3.TDH.other.chunk8, 3)
  converted <- ConvertModelList(fit)

  peaks <- unique(converted$peaks[, c("peaks", "chromStart", "chromEnd")])

  peaks7 <- subset(converted$peaks, peaks == 7)
  expect_false("McGill0267" %in% peaks7$sample.id)

  segs <- subset(converted$segments, peaks == 7)
  breaks <- subset(segs, chromStart > min(chromStart))
  
  ggplot()+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., labeller=function(var, val){
      sub("McGill0", "", sub(" ", "\n", val))
    }, scales="free")+
    geom_step(aes(chromStart/1e3, count),
              data=H3K4me3.TDH.other.chunk8,
              color="grey50")+
    geom_vline(aes(xintercept=chromStart/1e3),
               linetype="dashed",
               color="green",
               data=breaks)+
    geom_segment(aes(chromStart/1e3, mean,
                     xend=chromEnd/1e3, yend=mean),
                 color="green",
                 size=1,
                 data=segs)
  
  ## Begin R implementation of multiple sample constrained
  ## segmentation heuristic. Input: profiles data.frame.
  profiles <- H3K4me3.TDH.other.chunk8
  unfilled.profile.list <- split(profiles, profiles$sample.id, drop=TRUE)
  unfilled.chromStart <- min(sapply(unfilled.profile.list, with, chromStart[1]))
  unfilled.chromEnd <-
    max(sapply(unfilled.profile.list, with, chromEnd[length(chromEnd)]))
  unfilled.bases <- unfilled.chromEnd-unfilled.chromStart
  bin.factor <- 3L
  bases.per.bin <- 1L
  while(unfilled.bases/bases.per.bin/bin.factor >= 4){
    bases.per.bin <- bases.per.bin * bin.factor
  }
  n.bins <- as.integer(unfilled.bases %/% bases.per.bin + 1L)
  expect_equal(fit$n_bins, n.bins)
  expect_equal(fit$bases_per_bin, bases.per.bin)
  expect_equal(fit$bin_factor, bin.factor)
  R.data.start.end <- c(unfilled.chromStart, unfilled.chromEnd)
  expect_equal(fit$data_start_end, R.data.start.end)

  extra.bases <- n.bins * bases.per.bin - unfilled.bases
  extra.before <- as.integer(extra.bases/2)
  extra.after <- extra.bases - extra.before
  max.chromStart <- unfilled.chromStart-extra.before
  min.chromEnd <- unfilled.chromEnd + extra.after
  profile.list <- list()
  for(sample.id in names(unfilled.profile.list)){
    one.sample <- 
      subset(unfilled.profile.list[[sample.id]],
             unfilled.chromStart < chromEnd &
               chromStart < unfilled.chromEnd)
    profile.list[[sample.id]] <- one.sample
  }
  bases <- min.chromEnd-max.chromStart
  ## End pre-processing to add zeros.

  expect_identical(fit$bin_start_end[1], max.chromStart)
  expect_identical(fit$bin_start_end[2], min.chromEnd)
  
  ## Small bins are just for testing the computation of the loss
  ## function in the R implementation, and should not be ported to C
  ## code.
  n.samples <- length(profile.list)
  small.chromEnd <- (max.chromStart+1):min.chromEnd
  small.bins <-
    matrix(NA, bases, n.samples,
           dimnames=list(position=small.chromEnd,
             sample.id=names(profile.list)))
  for(sample.id in names(profile.list)){
    one <- profile.list[[sample.id]]
    bins <- binSum(one, max.chromStart, n.bins=bases, empty.as.zero=TRUE)
    stopifnot(bins$chromEnd == small.chromEnd)
    small.bins[, sample.id] <- bins$count
  }

  na.mat <- 
    matrix(NA, n.bins, n.samples,
           dimnames=list(bin=NULL, sample.id=names(profile.list)))
  first.cumsums <- list(count=na.mat)
  bin.list <- list()
  norm.list <- list()
  for(sample.i in seq_along(profile.list)){
    sample.id <- names(profile.list)[sample.i]
    one <- profile.list[[sample.i]]
    max.count <- max(one$count)
    bins <- binSum(one, max.chromStart, bases.per.bin, n.bins)
    stopifnot(n.bins == nrow(bins))
    bins$mean <- with(bins, count/(chromEnd-chromStart))
    bins$mean.norm <- bins$mean/max.count
    bin.list[[sample.id]] <- data.frame(sample.id, rbind(bins, NA))
    bases.vec <- with(bins, chromEnd-chromStart)
    stopifnot(bins$count >= 0)
    first.cumsums$count[, sample.i] <- cumsum(bins$count)
    one$count.norm <- one$count/max.count
    norm.list[[sample.i]] <- one
  }
  bin.df <- do.call(rbind, bin.list)
  norm.df <- do.call(rbind, norm.list)

  ggplot()+
    scale_color_manual(values=c(data="grey50",
                         bins="black", segments="green"))+
    geom_step(aes(chromStart/1e3, count, color=what),
              data=data.frame(norm.df, what="data"))+
    geom_segment(aes(chromStart/1e3, mean,
                     xend=chromEnd/1e3, yend=mean,
                     color=what),
                 data=data.frame(bin.df, what="bins"))+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
      sub("McGill0", "", val)
    })
  
  ## The formula for the optimal Poisson loss 
  ## for 1 segment with d integer data points x_j is
  ## \sum_{j=1}^d m - x_j \log m_j =
  ##   ( \sum_{j=1}^d x_j ) (1-\log m)
  ## where the segment mean m = (\sum x_j)/d,
  OptimalPoissonLoss <- function(mean.value, cumsum.value){
    ifelse(mean.value == 0, 0, cumsum.value * (1-log(mean.value)))
  }
  loss.list <- list()
  peak.list <- list()
  seg.list <- list()
  best.loss.list <- list()
  flat.cumsums <- first.cumsums$count[n.bins, ]
  flat.means <- flat.cumsums/unfilled.bases

  expect_equal(fit$sample_mean_vec, as.numeric(flat.means))
  
  flat.loss.vec <- OptimalPoissonLoss(flat.means, flat.cumsums)
  best.loss.list[["0"]] <- sum(flat.loss.vec)
  for(seg1.last in 1:(n.bins-2)){
    seg1.cumsums <- first.cumsums$count[seg1.last, ]
    seg1.bases <- seg1.last*bases.per.bin
    seg1.chromEnd <- seg1.bases + max.chromStart
    seg1.means <- seg1.cumsums/seg1.bases
    seg1.loss.vec <- OptimalPoissonLoss(seg1.means, seg1.cumsums)
    seg1.loss <- sum(seg1.loss.vec)
    for(seg2.last in (seg1.last+1):(n.bins-1)){
      cumsum.seg2.end <- first.cumsums$count[seg2.last, ]
      seg2.cumsums <- cumsum.seg2.end-seg1.cumsums
      seg12.bases <- seg2.last*bases.per.bin
      seg2.bases <- seg12.bases - seg1.bases
      seg2.chromEnd <- seg1.chromEnd + seg2.bases
      seg2.means <- seg2.cumsums/seg2.bases
      seg2.loss.vec <- OptimalPoissonLoss(seg2.means, seg2.cumsums)
      seg2.loss <- sum(seg2.loss.vec)
      
      seg3.cumsums <- first.cumsums$count[n.bins, ]-cumsum.seg2.end
      seg3.bases <- bases - seg12.bases
      seg3.means <- seg3.cumsums/seg3.bases

      mean.mat <- rbind(seg1.means, seg2.means, seg3.means)

      this.seg.list <- list()
      for(sample.id in colnames(mean.mat)){
        this.seg.list[[sample.id]] <-
          data.frame(sample.id,
                     chromStart=c(max.chromStart, seg1.chromEnd, seg2.chromEnd,
                                  max.chromStart),
                     chromEnd=c(seg1.chromEnd, seg2.chromEnd, min.chromEnd,
                                min.chromEnd),
                     mean=c(mean.mat[, sample.id], flat.means[[sample.id]]),
                     segments=c(3, 3, 3, 1))
      }
      these.segs <- do.call(rbind, this.seg.list)
      ggplot()+
        scale_color_manual(values=c(data="grey50",
                             bins="black", segments="green"))+
        geom_step(aes(chromStart/1e3, count, color=what),
                  data=data.frame(norm.df, what="data"))+
        geom_segment(aes(chromStart/1e3, mean,
                         xend=chromEnd/1e3, yend=mean,
                         color=what),
                     size=1,
                     data=data.frame(these.segs, what="segments"))+
        geom_segment(aes(chromStart/1e3, mean,
                         xend=chromEnd/1e3, yend=mean,
                         color=what),
                     data=data.frame(bin.df, what="bins"))+
        theme_bw()+
        theme(panel.margin=grid::unit(0, "cm"))+
        facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
          sub("McGill0", "", val)
        })
      
      seg3.loss.vec <- OptimalPoissonLoss(seg3.means, seg3.cumsums)
      seg3.loss <- sum(seg3.loss.vec)
      seg123.loss.vec <- seg1.loss.vec + seg2.loss.vec + seg3.loss.vec

      peak.feasible <- seg1.means < seg2.means & seg2.means > seg3.means
      if(any(peak.feasible)){
        diff.loss.vec <- flat.loss.vec - seg123.loss.vec
        possible.df <- 
          data.frame(flat.loss.vec, seg123.loss.vec,
                     diff.loss.vec, peak.feasible)
        feasible.df <- subset(possible.df, peak.feasible)
        ordered.df <- 
          feasible.df[order(feasible.df$diff.loss.vec, decreasing = TRUE), ]
        for(peaks in 1:nrow(ordered.df)){
          with.peaks <- ordered.df[1:peaks, ]
          with.segs <-
            subset(these.segs,
                   sample.id %in% rownames(with.peaks) & segments==3)
          without.segs <-
            subset(these.segs,
                   !sample.id %in% rownames(with.peaks) & segments==1)
          without.peaks <-
            possible.df[!rownames(possible.df) %in% rownames(with.peaks),]
          with.loss <- with.peaks$seg123.loss.vec
          without.loss <- without.peaks$flat.loss.vec
          total.loss <- sum(with.loss, without.loss)
          loss.list[[paste(peaks)]][[paste(seg1.last, seg2.last)]] <-
            data.frame(seg1.last, seg2.last, peaks, total.loss)
          peak.list[[paste(peaks)]][[paste(seg1.last, seg2.last)]] <- 
            data.frame(sample.id=rownames(with.peaks),
                       chromStart=seg1.last*bases.per.bin+max.chromStart,
                       chromEnd=seg2.last*bases.per.bin+max.chromStart)
          seg.list[[paste(peaks)]][[paste(seg1.last, seg2.last)]] <-
            rbind(without.segs, with.segs)
        }#peaks
      }#any(peak.feasible)
    }#seg2.last
  }#seg1.last
  best.seg.list <- list()
  best.peak.list <- list()
  best.indices.list <- list()
  for(peaks.str in names(loss.list)){
    loss.df <- do.call(rbind, loss.list[[peaks.str]])
    loss.best <- loss.df[which.min(loss.df$total.loss), ]
    best.indices.list[[peaks.str]] <- loss.best
    last.str <- with(loss.best, paste(seg1.last, seg2.last))
    peaks <- as.numeric(peaks.str)
    model.i <- peaks + 1
    model <- fit$models[[model.i]]
    
    peak.df <- peak.list[[peaks.str]][[last.str]]

    sample.i <- model$samples_with_peaks_vec + 1
    C.sample.id <- fit$sample.id[sample.i]
    C.sample.sorted <- sort(paste(C.sample.id))
    R.sample.sorted <- sort(paste(peak.df$sample.id))
    expect_identical(C.sample.sorted, R.sample.sorted)

    seg.df <- seg.list[[peaks.str]][[last.str]]

    ggplot()+
      ggtitle(paste0("best model with ", peaks,
                    " peak", ifelse(peaks==1, "", "s")))+
      scale_color_manual(values=c(data="grey50",
                           bins="black", segments="green"))+
      geom_step(aes(chromStart/1e3, count, color=what),
                data=data.frame(norm.df, what="data"))+
      geom_segment(aes(chromStart/1e3, mean,
                       xend=chromEnd/1e3, yend=mean,
                       color=what),
                   size=1,
                   data=data.frame(seg.df, what="segments"))+
      geom_segment(aes(chromStart/1e3, mean,
                       xend=chromEnd/1e3, yend=mean,
                       color=what),
                   data=data.frame(bin.df, what="bins"))+
      theme_bw()+
      theme(panel.margin=grid::unit(0, "cm"))+
      facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
        sub("McGill0", "", val)
      })

    best.seg.list[[peaks.str]] <- data.frame(peaks, seg.df)
    best.peak.list[[peaks.str]] <-
      data.frame(peaks, y=peaks*-0.1, peak.df)
    best.loss.list[[peaks.str]] <- loss.best$total.loss
  }
  
  best.peaks <- do.call(rbind, best.peak.list)
  by.sample.loc <-
    split(best.peaks, with(best.peaks, paste(sample.id, chromStart, chromEnd)))
  short.label.list <- list()
  for(sample.loc.name in names(by.sample.loc)){
    sample.loc <- by.sample.loc[[sample.loc.name]]
    peaks.txt <- paste(sample.loc$peaks, collapse=",")
    short.label.list[[sample.loc.name]] <-
      data.frame(sample.loc[1,], peaks.txt)
  }
  short.labels <- do.call(rbind, short.label.list)
  best.peaks$sample.id <- factor(best.peaks$sample.id, names(profile.list))
  sample.counts <- table(best.peaks$sample.id)
  dftype <- function(what, df){
    df$sample.id <- factor(df$sample.id, names(sample.counts))
    data.frame(what, df)
  }
  ggplot()+
    scale_color_manual(values=c(data="grey50",
                         bins="black", peaks="deepskyblue"))+
    geom_step(aes(chromStart/1e3, count.norm, color=what),
              data=dftype("data", norm.df))+
    geom_segment(aes(chromStart/1e3, y,
                     xend=chromEnd/1e3, yend=y,
                     color=what),
                 size=1,
                 data=dftype("peaks", best.peaks))+
    geom_text(aes(chromStart/1e3, y,
                     label=paste0(peaks, " peak",
                       ifelse(peaks==1, "", "s"), " "),
                     color=what),
              hjust=1,
              size=3,
              vjust=0.5,
              data=dftype("peaks", best.peaks))+
    geom_segment(aes(chromStart/1e3, mean.norm,
                     xend=chromEnd/1e3, yend=mean.norm,
                     color=what),
                 data=dftype("bins", bin.df))+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
      sub("McGill0", "", val)
    })
  ## for 0, 1, ..., maxPeaks, run the bin pyramid grid search,
  ## around the peaks found in this first step.
  library(PeakError)
  zoom.peak.list <- list("0"=Peaks())
  zoom.loss.list <-
    list("0"=data.frame(peaks=0, loss=sum(flat.loss.vec)))
  peaks.str <- "10" #problematic!
  for(peaks.str in paste(1:9)){
    loss.best <- best.indices.list[[peaks.str]]
    best.peak.df <- best.peak.list[[peaks.str]]
    samples.with.peaks <- paste(best.peak.df$sample.id)
    flat.without.peaks <- sum(flat.loss.vec)-
      sum(flat.loss.vec[samples.with.peaks])
    sub.norm.df <- subset(norm.df, sample.id %in% samples.with.peaks)
    sub.bin.df <- subset(bin.df, sample.id %in% samples.with.peaks)
    n.samples <- length(samples.with.peaks)
    last.cumsums <- list()
    before.cumsums <- list(left=list(), right=list())
    for(data.type in names(first.cumsums)){
      data.mat <- first.cumsums[[data.type]]
      last.cumsums[[data.type]] <-
        data.mat[nrow(data.mat),][samples.with.peaks]
      before.cumsums$left[[data.type]] <- if(loss.best$seg1.last == 1){
        structure(rep(0, n.samples), names=samples.with.peaks)
      }else{
        data.mat[loss.best$seg1.last-1,][samples.with.peaks]
      }
      before.cumsums$right[[data.type]] <-
        data.mat[loss.best$seg2.last-1,][samples.with.peaks]
    }
    last.chromEnd <- min.chromEnd
    n.bins.zoom <- bin.factor * 2L
    n.cumsum <- n.bins.zoom + 1L

    ## These will change at the end of each iteration.
    peakStart <- best.peak.df$chromStart[1]
    peakEnd <- best.peak.df$chromEnd[1]
    search.list <- list()
    best.list <- list()
    bases.per.bin.zoom <- bases.per.bin
    while(bases.per.bin.zoom > 1){
      left.chromStart <- peakStart - bases.per.bin.zoom
      right.chromStart <- peakEnd-bases.per.bin.zoom
      bases.per.bin.zoom <- as.integer(bases.per.bin.zoom / bin.factor)

      cumsum.list <- function(chromStart){
        limits <- bases.per.bin.zoom*(0:(n.cumsum-1))+chromStart
        chromStart.vec <- limits[-length(limits)]
        chromEnd.vec <- limits[-1]
        intervals <-
          paste0(chromStart.vec, "-", chromEnd.vec)
        dn <- list(bin=c("before", intervals), sample.id=samples.with.peaks)
        m <- matrix(NA, n.cumsum, n.samples, dimnames=dn)
        list(count=m, 
             chromStart=chromStart.vec, chromEnd=chromEnd.vec)
      }
      cumsum.mats <-
        list(left=cumsum.list(left.chromStart),
             right=cumsum.list(right.chromStart))

      for(sample.i in seq_along(samples.with.peaks)){
        sample.id <- samples.with.peaks[[sample.i]]
        one <- profile.list[[sample.id]]
        lr.list <-
          list(left=binSum(one, left.chromStart,
                 bases.per.bin.zoom, n.bins.zoom),
               right=binSum(one, right.chromStart,
                 bases.per.bin.zoom, n.bins.zoom,
                 empty.as.zero=TRUE))
        for(lr in names(lr.list)){
          lr.bins <- lr.list[[lr]]
          stopifnot(nrow(lr.bins) == n.bins.zoom)
          lr.bases <- with(lr.bins, chromEnd-chromStart)
          lr.before <- before.cumsums[[lr]]
          lr.counts <- list(count=lr.bins$count)
          for(data.type in names(lr.counts)){
            lr.count.vec <-
              c(lr.before[[data.type]][[sample.id]], lr.counts[[data.type]])
            cumsum.mats[[lr]][[data.type]][, sample.id] <-
              cumsum(lr.count.vec)
          }
        }
      }
      left.mat <- t(cumsum.mats$left$count[-1,])
      colnames(left.mat) <- NULL
      right.mat <- t(cumsum.mats$right$count[-1,])
      colnames(right.mat) <- NULL
      ## print(bases.per.bin.zoom)
      ## print(as.integer(cumsum.mats$left$count[1,]))
      ## print(left.mat)
      possible.grid <- 
        expand.grid(left.cumsum.row=3:n.cumsum, right.cumsum.row=2:n.cumsum)
      possible.grid$left.chromStart <-
        cumsum.mats$left$chromStart[possible.grid$left.cumsum.row-1]
      possible.grid$left.chromEnd <-
        cumsum.mats$left$chromEnd[possible.grid$left.cumsum.row-1]
      possible.grid$right.chromStart <-
        cumsum.mats$right$chromStart[possible.grid$right.cumsum.row-1]
      possible.grid$right.chromEnd <-
        cumsum.mats$right$chromEnd[possible.grid$right.cumsum.row-1]
      feasible.grid <-
        subset(possible.grid,
               left.chromEnd <= right.chromStart &
                 right.chromEnd <= last.chromEnd)
      feasible.grid$model.i <- 1:nrow(feasible.grid)
      model.list <- list()
      seg.list <- list()
      sample.loss.list <- list()
      for(model.i in feasible.grid$model.i){
        model.row <- feasible.grid[model.i, ]

        seg1.i <- model.row$left.cumsum.row-1
        seg1.cumsums <- cumsum.mats$left$count[seg1.i, ]
        seg1.chromEnd <- cumsum.mats$left$chromStart[seg1.i]
        seg1.bases <- seg1.chromEnd-max.chromStart
        seg1.corrected <- seg1.bases - extra.before
        seg1.means <- seg1.cumsums/seg1.corrected
        seg1.loss <- OptimalPoissonLoss(seg1.means, seg1.cumsums)
        seg.list[[paste(model.i, 1)]] <-
          data.frame(chromStart=max.chromStart, chromEnd=seg1.chromEnd,
                     mean=seg1.means, sample.id=samples.with.peaks,
                     model.i,
                     row.names=NULL)

        seg1.mat <-
          small.bins[small.chromEnd <= seg1.chromEnd,
                     samples.with.peaks,
                     drop=FALSE]
        stopifnot(nrow(seg1.mat) == seg1.bases)
        ## stopifnot(all.equal(as.numeric(colMeans(seg1.mat)),
        ##                     as.numeric(seg1.means)))

        cumsum.seg2.end <-
          cumsum.mats$right$count[model.row$right.cumsum.row, ]
        seg2.cumsums <- cumsum.seg2.end-seg1.cumsums
        seg2.chromEnd <-
          cumsum.mats$right$chromEnd[model.row$right.cumsum.row-1]
        seg2.bases <- seg2.chromEnd-seg1.chromEnd
        seg2.means <- seg2.cumsums/seg2.bases
        seg2.loss <- OptimalPoissonLoss(seg2.means, seg2.cumsums)
        seg.list[[paste(model.i, 2)]] <-
          data.frame(chromStart=seg1.chromEnd, chromEnd=seg2.chromEnd,
                     mean=seg2.means, sample.id=samples.with.peaks,
                     model.i,
                     row.names=NULL)

        is.seg2 <-
          seg1.chromEnd < small.chromEnd &
            small.chromEnd <= seg2.chromEnd
        stopifnot(sum(is.seg2) == seg2.bases)
        seg2.mat <-
          small.bins[is.seg2,
                     samples.with.peaks,
                     drop=FALSE]
        stopifnot(all.equal(as.numeric(colMeans(seg2.mat)),
                            as.numeric(seg2.means)))
        
        seg3.cumsums <- last.cumsums$count-cumsum.seg2.end
        seg3.bases <- last.chromEnd-seg2.chromEnd
        seg3.corrected <- seg3.bases - extra.after
        seg3.means <- seg3.cumsums/seg3.corrected
        seg3.loss <- OptimalPoissonLoss(seg3.means, seg3.cumsums)
        seg.list[[paste(model.i, 3)]] <-
          data.frame(chromStart=seg2.chromEnd, chromEnd=last.chromEnd,
                     mean=seg3.means, sample.id=samples.with.peaks,
                     model.i,
                     row.names=NULL)

        is.seg3 <-
          seg2.chromEnd < small.chromEnd &
            small.chromEnd <= last.chromEnd
        stopifnot(sum(is.seg3) == seg3.bases)
        seg3.mat <-
          small.bins[is.seg3,
                     samples.with.peaks,
                     drop=FALSE]
        ## stopifnot(all.equal(as.numeric(colMeans(seg3.mat)),
        ##                     as.numeric(seg3.means)))

        total.bases <- sum(seg1.bases + seg2.bases + seg3.bases)
        stopifnot(all.equal(total.bases, last.chromEnd - max.chromStart))

        ## Total loss is actually just the total of the samples with
        ## peaks, and with.without.loss is the grand total of samples
        ## with and without peaks.
        total.loss <- sum(seg1.loss + seg2.loss + seg3.loss)
        with.without.loss <- flat.without.peaks + total.loss
        total.loss.vec <- seg1.loss+seg2.loss+seg3.loss
        feasible.vec <- seg1.means < seg2.means & seg2.means > seg3.means
        model.list[[model.i]] <-
          data.frame(model.row, total.loss, with.without.loss,
                     feasible=all(feasible.vec))
        sample.loss.list[[model.i]] <-
          data.frame(sample.id=samples.with.peaks, loss=total.loss.vec)
      }
      ## Plot the segment means as a reality check.
      seg.df <- do.call(rbind, seg.list)
      segPlot <- 
      ggplot()+
        geom_step(aes(chromStart/1e3, count),
                  data=data.frame(sub.norm.df, what="data"),
                  color="grey")+
        geom_segment(aes(chromStart/1e3, mean,
                         xend=chromEnd/1e3, yend=mean),
                     data=data.frame(seg.df, what="models"),
                     size=1, color="green")+
        theme_bw()+
        theme(panel.margin=grid::unit(0, "cm"))+
        facet_grid(sample.id ~ model.i, scales="free")

      ## Then plot the peaks only, colored by total cost of the model.
      model.df <- do.call(rbind, model.list)
      model.df$y <- with(model.df, model.i/max(model.i))
      feasible.models <- subset(model.df, feasible)
      best.model <- feasible.models[which.min(feasible.models$total.loss), ]

      ggplot()+
        coord_equal()+
        geom_point(aes(with.without.loss, total.loss),
                   data=model.df)

      binPlot <- 
      ggplot()+
        xlab("position on chromosome (kilobases = kb)")+
        scale_y_continuous("", breaks=NULL)+
        geom_step(aes(chromStart/1e3, count.norm),
                  data=data.frame(sub.norm.df, what="data"),
                  color="grey")+
        geom_segment(aes(chromStart/1e3, mean.norm,
                         xend=chromEnd/1e3, yend=mean.norm),
                     data=data.frame(sub.bin.df, what="bins"),
                     color="black")+
        geom_segment(aes(peakStart/1e3, 0,
                         xend=peakEnd/1e3, yend=0),
                     data=data.frame(loss.best, what="peak"),
                     color="green")+
        scale_linetype_manual(values=c("TRUE"=1, "FALSE"=2))+
        geom_segment(aes(left.chromStart/1e3, y,
                         color=total.loss,
                         linetype=feasible,
                         xend=right.chromEnd/1e3, yend=y),
                     data=data.frame(model.df, sample.id="peaks"),
                     size=1)+
        geom_text(aes(left.chromStart/1e3, y,
                      label="optimal "),
                  data=data.frame(best.model, sample.id="peaks"),
                  hjust=1)+
        theme_bw()+
        theme(panel.margin=grid::unit(0, "cm"))+
        facet_grid(sample.id ~ .)

      search.list[[paste(bases.per.bin.zoom)]] <-
        data.frame(bases.per.bin.zoom, model.df)
      best.list[[paste(bases.per.bin.zoom)]] <-
        data.frame(bases.per.bin.zoom, best.model)

      peakStart <- best.model$left.chromStart
      peakEnd <- best.model$right.chromEnd
      ## cat(sprintf("\nn_peaks=%s bases_per_bin=%d [%d,%d] loss=%15.6f\n",
      ##             peaks.str, bases.per.bin.zoom,
      ##             peakStart, peakEnd, best.model$with.without.loss))
      before.i.list <-
        list(left=best.model$left.cumsum.row-2,
             right=best.model$right.cumsum.row-1)
      for(lr in names(before.i.list)){
        before.i <- before.i.list[[lr]]
        mats <- cumsum.mats[[lr]]
        before.cumsums[[lr]]$count <-
          structure(mats$count[before.i,],
                    names=colnames(mats$count))
      }
    }##while(bases.per.bin.zoom)
    samplefac <- function(L){
      df <- do.call(rbind, L)
      sample.ids <- unique(norm.df$sample.id)
      bases.num <- sort(unique(df$bases.per.bin.zoom), decreasing=TRUE)
      levs <- c(paste(sample.ids), paste(bases.num))
      df$sample.id <- factor(df$bases.per.bin.zoom, levs)
      df
    }
    search.df <- samplefac(search.list)
    best.df <- samplefac(best.list)
    C.cols <-
      c("bases.per.bin.zoom", "left.chromStart",
        "right.chromEnd", "with.without.loss")
    best.df[, C.cols]
    searchPlot <- 
      ggplot()+
        xlab("position on chromosome (kilobases = kb)")+
        scale_y_continuous("", breaks=NULL)+
        geom_step(aes(chromStart/1e3, count.norm),
                  data=data.frame(sub.norm.df, what="data"),
                  color="grey")+
        geom_segment(aes(chromStart/1e3, mean.norm,
                         xend=chromEnd/1e3, yend=mean.norm),
                     data=data.frame(sub.bin.df, what="bins"),
                     color="black")+
        geom_segment(aes(left.chromStart/1e3, y,
                         color=total.loss,
                         xend=right.chromEnd/1e3, yend=y),
                     data=search.df,
                     size=1)+
        geom_text(aes(left.chromStart/1e3, y,
                      label="selected"),
                  data=best.df,
                  hjust=1)+
        theme_bw()+
        theme(panel.margin=grid::unit(0, "cm"))+
        facet_grid(sample.id ~ ., labeller=function(var, val){
          sub("McGill0", "", val)
        })
    ## Check R=C implementation.
    peaks <- as.numeric(peaks.str)
    model.i <- peaks + 1
    model <- fit$models[[model.i]]

    C.start.end <- model$peak_start_end
    R.start.end <- c(peakStart, peakEnd)
    expect_true(all(C.start.end - R.start.end == 0))
    
    sample.i <- model$samples_with_peaks_vec + 1
    C.sample.id <- fit$sample.id[sample.i]
    for(seg.i in 1:3){
      model.name <- paste(best.model$model.i, seg.i)
      seg.i.df <- seg.list[[model.name]]
      rownames(seg.i.df) <- seg.i.df$sample.id
      R.mean.vec <- as.numeric(seg.i.df[C.sample.id, "mean"])
      C.mean.vec <- model[[sprintf("seg%d_mean_vec", seg.i)]]
      expect_equal(C.mean.vec, R.mean.vec)
    }
    
    loss.with.peaks <- sample.loss.list[[best.model$model.i]]
    samples.without.peaks <-
      names(profile.list)[!names(profile.list) %in% samples.with.peaks]
    loss.without.peaks <-
      data.frame(sample.id=samples.without.peaks,
                 loss=flat.loss.vec[samples.without.peaks])
    sample.loss.df <- rbind(loss.with.peaks, loss.without.peaks)
    peaks <- as.numeric(peaks.str)
    zoom.loss.list[[peaks.str]] <- 
      data.frame(peaks, loss=sum(sample.loss.df$loss))
    zoom.peak.list[[peaks.str]] <-
      data.frame(peaks, sample.id=samples.with.peaks,
                 chromStart=peakStart, chromEnd=peakEnd)
  }#peaks.str
  zoom.peaks <- do.call(rbind, zoom.peak.list)
  zoom.loss <- do.call(rbind, zoom.loss.list)
  C.loss.vec <- sapply(fit$models, "[[", "loss")
  R.loss.vec <- zoom.loss$loss
  some.C.loss <- C.loss.vec[1:length(R.loss.vec)]
  expect_equal(some.C.loss, R.loss.vec)
  ggplot()+
    scale_color_manual(values=c(data="grey50",
                         bins="black", peaks="deepskyblue"))+
    geom_step(aes(chromStart/1e3, count.norm, color=what),
              data=dftype("data", norm.df))+
    geom_segment(aes(chromStart/1e3, -peaks*0.1,
                     xend=chromEnd/1e3, yend=-peaks*0.1,
                     color=what),
                 size=1,
                 data=dftype("peaks", zoom.peaks))+
    geom_text(aes(chromStart/1e3, -peaks*0.1,
                  label=paste0(peaks,
                    " peak",
                    ifelse(peaks==1, "", "s"),
                    " "),
                  color=what),
              hjust=1,
              vjust=0.5,
              size=3,
              data=dftype("peaks", zoom.peaks))+
    geom_segment(aes(chromStart/1e3, mean.norm,
                     xend=chromEnd/1e3, yend=mean.norm,
                     color=what),
                 data=dftype("bins", bin.df))+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
      sub("McGill0", "", val)
    })
  ggplot()+
    geom_segment(aes(chromStart/1e3, peaks,
                     xend=chromEnd/1e3, yend=peaks),
                 data=zoom.peaks)
  ggplot(zoom.loss, aes(peaks, loss))+
    geom_point()+
    geom_line()
})

test_that("Step1 C result agrees with R", {
  profiles <- H3K4me3.TDH.other.chunk8
  fit <- PeakSegJointHeuristicStep1(profiles, 3)
  unfilled.profile.list <- split(profiles, profiles$sample.id, drop=TRUE)
  unfilled.chromStart <- min(sapply(unfilled.profile.list, with, chromStart[1]))
  unfilled.chromEnd <-
    max(sapply(unfilled.profile.list, with, chromEnd[length(chromEnd)]))
  unfilled.bases <- unfilled.chromEnd-unfilled.chromStart
  bin.factor <- 3L
  bases.per.bin <- 1L
  while(unfilled.bases/bases.per.bin/bin.factor >= 4){
    bases.per.bin <- bases.per.bin * bin.factor
  }
  n.bins <- as.integer(unfilled.bases %/% bases.per.bin + 1L)
  expect_equal(fit$n_bins, n.bins)
  expect_equal(fit$bases_per_bin, bases.per.bin)
  expect_equal(fit$bin_factor, bin.factor)
  R.data.start.end <- c(unfilled.chromStart, unfilled.chromEnd)
  expect_equal(fit$data_start_end, R.data.start.end)

  extra.bases <- n.bins * bases.per.bin - unfilled.bases
  extra.before <- as.integer(extra.bases/2)
  extra.after <- extra.bases - extra.before
  max.chromStart <- unfilled.chromStart-extra.before
  min.chromEnd <- unfilled.chromEnd + extra.after
  profile.list <- list()
  for(sample.id in names(unfilled.profile.list)){
    one.sample <- 
      subset(unfilled.profile.list[[sample.id]],
             unfilled.chromStart < chromEnd &
               chromStart < unfilled.chromEnd)
    profile.list[[sample.id]] <- one.sample
  }
  bases <- min.chromEnd-max.chromStart
  ## End pre-processing to add zeros.

  expect_identical(fit$bin_start_end[1], max.chromStart)
  expect_identical(fit$bin_start_end[2], min.chromEnd)
  
  ## Small bins are just for testing the computation of the loss
  ## function in the R implementation, and should not be ported to C
  ## code.
  n.samples <- length(profile.list)
  small.chromEnd <- (max.chromStart+1):min.chromEnd
  small.bins <-
    matrix(NA, bases, n.samples,
           dimnames=list(position=small.chromEnd,
             sample.id=names(profile.list)))
  for(sample.id in names(profile.list)){
    one <- profile.list[[sample.id]]
    bins <- binSum(one, max.chromStart, n.bins=bases, empty.as.zero=TRUE)
    stopifnot(bins$chromEnd == small.chromEnd)
    small.bins[, sample.id] <- bins$count
  }

  na.mat <- 
    matrix(NA, n.bins, n.samples,
           dimnames=list(bin=NULL, sample.id=names(profile.list)))
  first.cumsums <- list(count=na.mat)
  bin.list <- list()
  norm.list <- list()
  for(sample.i in seq_along(profile.list)){
    sample.id <- names(profile.list)[sample.i]
    one <- profile.list[[sample.i]]
    max.count <- max(one$count)
    bins <- binSum(one, max.chromStart, bases.per.bin, n.bins)
    stopifnot(n.bins == nrow(bins))
    bins$mean <- with(bins, count/(chromEnd-chromStart))
    bins$mean.norm <- bins$mean/max.count
    bin.list[[sample.id]] <- data.frame(sample.id, rbind(bins, NA))
    bases.vec <- with(bins, chromEnd-chromStart)
    stopifnot(bins$count >= 0)
    first.cumsums$count[, sample.i] <- cumsum(bins$count)
    one$count.norm <- one$count/max.count
    norm.list[[sample.i]] <- one
  }
  bin.df <- do.call(rbind, bin.list)
  norm.df <- do.call(rbind, norm.list)

  ggplot()+
    scale_color_manual(values=c(data="grey50",
                         bins="black", segments="green"))+
    geom_step(aes(chromStart/1e3, count, color=what),
              data=data.frame(norm.df, what="data"))+
    geom_segment(aes(chromStart/1e3, mean,
                     xend=chromEnd/1e3, yend=mean,
                     color=what),
                 data=data.frame(bin.df, what="bins"))+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
      sub("McGill0", "", val)
    })
  
  ## The formula for the optimal Poisson loss 
  ## for 1 segment with d integer data points x_j is
  ## \sum_{j=1}^d m - x_j \log m_j =
  ##   ( \sum_{j=1}^d x_j ) (1-\log m)
  ## where the segment mean m = (\sum x_j)/d,
  OptimalPoissonLoss <- function(mean.value, cumsum.value){
    ifelse(mean.value == 0, 0, cumsum.value * (1-log(mean.value)))
  }
  loss.list <- list()
  peak.list <- list()
  seg.list <- list()
  best.loss.list <- list()
  flat.cumsums <- first.cumsums$count[n.bins, ]
  flat.means <- flat.cumsums/unfilled.bases

  expect_equal(fit$sample_mean_vec, as.numeric(flat.means))
  
  flat.loss.vec <- OptimalPoissonLoss(flat.means, flat.cumsums)
  R.flat.loss <- as.numeric(flat.loss.vec[fit$sample.id])
  expect_equal(fit$flat_loss_vec, R.flat.loss)

  best.loss.list[["0"]] <- sum(flat.loss.vec)
  for(seg1.last in 1:(n.bins-2)){
    seg1.cumsums <- first.cumsums$count[seg1.last, ]
    seg1.bases <- seg1.last*bases.per.bin
    seg1.chromEnd <- seg1.bases + max.chromStart
    seg1.corrected <- seg1.bases - extra.before
    seg1.means <- seg1.cumsums/seg1.corrected
    seg1.loss.vec <- OptimalPoissonLoss(seg1.means, seg1.cumsums)
    seg1.loss <- sum(seg1.loss.vec)
    for(seg2.last in (seg1.last+1):(n.bins-1)){
      cumsum.seg2.end <- first.cumsums$count[seg2.last, ]
      seg2.cumsums <- cumsum.seg2.end-seg1.cumsums
      seg12.bases <- seg2.last*bases.per.bin
      seg2.bases <- seg12.bases - seg1.bases
      seg2.chromEnd <- seg1.chromEnd + seg2.bases
      seg2.means <- seg2.cumsums/seg2.bases
      seg2.loss.vec <- OptimalPoissonLoss(seg2.means, seg2.cumsums)
      seg2.loss <- sum(seg2.loss.vec)
      
      seg3.cumsums <- first.cumsums$count[n.bins, ]-cumsum.seg2.end
      seg3.bases <- bases - seg12.bases
      seg3.corrected <- seg3.bases - extra.after
      seg3.means <- seg3.cumsums/seg3.corrected

      mean.mat <- rbind(seg1.means, seg2.means, seg3.means)

      this.seg.list <- list()
      for(sample.id in colnames(mean.mat)){
        this.seg.list[[sample.id]] <-
          data.frame(sample.id,
                     chromStart=c(max.chromStart, seg1.chromEnd, seg2.chromEnd,
                                  max.chromStart),
                     chromEnd=c(seg1.chromEnd, seg2.chromEnd, min.chromEnd,
                                min.chromEnd),
                     mean=c(mean.mat[, sample.id], flat.means[[sample.id]]),
                     segments=c(3, 3, 3, 1))
      }
      these.segs <- do.call(rbind, this.seg.list)
      ggplot()+
        scale_color_manual(values=c(data="grey50",
                             bins="black", segments="green"))+
        geom_step(aes(chromStart/1e3, count, color=what),
                  data=data.frame(norm.df, what="data"))+
        geom_segment(aes(chromStart/1e3, mean,
                         xend=chromEnd/1e3, yend=mean,
                         color=what),
                     size=1,
                     data=data.frame(these.segs, what="segments"))+
        geom_segment(aes(chromStart/1e3, mean,
                         xend=chromEnd/1e3, yend=mean,
                         color=what),
                     data=data.frame(bin.df, what="bins"))+
        theme_bw()+
        theme(panel.margin=grid::unit(0, "cm"))+
        facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
          sub("McGill0", "", val)
        })
      
      seg3.loss.vec <- OptimalPoissonLoss(seg3.means, seg3.cumsums)
      seg3.loss <- sum(seg3.loss.vec)
      seg123.loss.vec <- seg1.loss.vec + seg2.loss.vec + seg3.loss.vec

      peak.feasible <- seg1.means < seg2.means & seg2.means > seg3.means
      if(any(peak.feasible)){
        diff.loss.vec <- flat.loss.vec - seg123.loss.vec
        possible.df <- 
          data.frame(flat.loss.vec, seg123.loss.vec,
                     diff.loss.vec, peak.feasible)
        feasible.df <- subset(possible.df, peak.feasible)
        ordered.df <- 
          feasible.df[order(feasible.df$diff.loss.vec, decreasing = TRUE), ]
        for(peaks in 1:nrow(ordered.df)){
          with.peaks <- ordered.df[1:peaks, ]
          with.segs <-
            subset(these.segs,
                   sample.id %in% rownames(with.peaks) & segments==3)
          without.segs <-
            subset(these.segs,
                   !sample.id %in% rownames(with.peaks) & segments==1)
          without.peaks <-
            possible.df[!rownames(possible.df) %in% rownames(with.peaks),]
          with.loss <- with.peaks$seg123.loss.vec
          without.loss <- without.peaks$flat.loss.vec
          total.loss <- sum(with.loss, without.loss)
          loss.list[[paste(peaks)]][[paste(seg1.last, seg2.last)]] <-
            data.frame(seg1.last, seg2.last, peaks, total.loss)
          peak.list[[paste(peaks)]][[paste(seg1.last, seg2.last)]] <- 
            data.frame(sample.id=rownames(with.peaks),
                       chromStart=seg1.last*bases.per.bin+max.chromStart,
                       chromEnd=seg2.last*bases.per.bin+max.chromStart)
          seg.list[[paste(peaks)]][[paste(seg1.last, seg2.last)]] <-
            rbind(without.segs, with.segs)
        }#peaks
      }#any(peak.feasible)
    }#seg2.last
  }#seg1.last
  best.seg.list <- list()
  best.peak.list <- list()
  best.indices.list <- list()
  for(peaks.str in names(loss.list)){
    loss.df <- do.call(rbind, loss.list[[peaks.str]])
    loss.best <- loss.df[which.min(loss.df$total.loss), ]
    best.indices.list[[peaks.str]] <- loss.best
    last.str <- with(loss.best, paste(seg1.last, seg2.last))
    peaks <- as.numeric(peaks.str)
    model.i <- peaks + 1
    model <- fit$models[[model.i]]
    
    peak.df <- peak.list[[peaks.str]][[last.str]]
    C.start.end <- model$peak_start_end
    R.start.end <- as.integer(peak.df[1, c("chromStart", "chromEnd")])
    expect_equal(C.start.end, R.start.end)

    sample.i <- model$samples_with_peaks_vec + 1
    C.sample.id <- fit$sample.id[sample.i]
    C.sample.sorted <- sort(paste(C.sample.id))
    R.sample.sorted <- sort(paste(peak.df$sample.id))
    expect_identical(C.sample.sorted, R.sample.sorted)

    seg.df <- seg.list[[peaks.str]][[last.str]]
    segs.by.chromStart <- split(seg.df, seg.df$chromStart)
    for(seg.i in seq_along(segs.by.chromStart)){
      seg.i.df <- segs.by.chromStart[[seg.i]]
      rownames(seg.i.df) <- seg.i.df$sample.id
      R.mean.vec <- as.numeric(seg.i.df[C.sample.id, "mean"])
      C.mean.vec <- model[[sprintf("seg%d_mean_vec", seg.i)]]
      expect_equal(C.mean.vec, R.mean.vec)
    }

    ggplot()+
      ggtitle(paste0("best model with ", peaks,
                    " peak", ifelse(peaks==1, "", "s")))+
      scale_color_manual(values=c(data="grey50",
                           bins="black", segments="green"))+
      geom_step(aes(chromStart/1e3, count, color=what),
                data=data.frame(norm.df, what="data"))+
      geom_segment(aes(chromStart/1e3, mean,
                       xend=chromEnd/1e3, yend=mean,
                       color=what),
                   size=1,
                   data=data.frame(seg.df, what="segments"))+
      geom_segment(aes(chromStart/1e3, mean,
                       xend=chromEnd/1e3, yend=mean,
                       color=what),
                   data=data.frame(bin.df, what="bins"))+
      theme_bw()+
      theme(panel.margin=grid::unit(0, "cm"))+
      facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
        sub("McGill0", "", val)
      })

    best.seg.list[[peaks.str]] <- data.frame(peaks, seg.df)
    best.peak.list[[peaks.str]] <-
      data.frame(peaks, y=peaks*-0.1, peak.df)
    best.loss.list[[peaks.str]] <- loss.best$total.loss
  }
  R.loss.list <- rep(Inf, n.samples+1)
  names(R.loss.list) <- 0:n.samples
  R.loss.list[names(best.loss.list)] <- best.loss.list
  R.loss.vec <- as.numeric(unlist(R.loss.list))
  C.loss.vec <- sapply(fit$models, "[[", "loss")
  expect_equal(C.loss.vec, R.loss.vec)
  
  best.peaks <- do.call(rbind, best.peak.list)
  by.sample.loc <-
    split(best.peaks, with(best.peaks, paste(sample.id, chromStart, chromEnd)))
  short.label.list <- list()
  for(sample.loc.name in names(by.sample.loc)){
    sample.loc <- by.sample.loc[[sample.loc.name]]
    peaks.txt <- paste(sample.loc$peaks, collapse=",")
    short.label.list[[sample.loc.name]] <-
      data.frame(sample.loc[1,], peaks.txt)
  }
  short.labels <- do.call(rbind, short.label.list)
  best.peaks$sample.id <- factor(best.peaks$sample.id, names(profile.list))
  sample.counts <- table(best.peaks$sample.id)
  dftype <- function(what, df){
    df$sample.id <- factor(df$sample.id, names(sample.counts))
    data.frame(what, df)
  }
  ggplot()+
    scale_color_manual(values=c(data="grey50",
                         bins="black", peaks="deepskyblue"))+
    geom_step(aes(chromStart/1e3, count.norm, color=what),
              data=dftype("data", norm.df))+
    geom_segment(aes(chromStart/1e3, y,
                     xend=chromEnd/1e3, yend=y,
                     color=what),
                 size=1,
                 data=dftype("peaks", best.peaks))+
    geom_text(aes(chromStart/1e3, y,
                     label=paste0(peaks, " peak",
                       ifelse(peaks==1, "", "s"), " "),
                     color=what),
              hjust=1,
              size=3,
              vjust=0.5,
              data=dftype("peaks", best.peaks))+
    geom_segment(aes(chromStart/1e3, mean.norm,
                     xend=chromEnd/1e3, yend=mean.norm,
                     color=what),
                 data=dftype("bins", bin.df))+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., scales="free", labeller=function(var, val){
      sub("McGill0", "", val)
    })
  for(peaks.str in names(best.indices.list)){
    loss.best <- best.indices.list[[peaks.str]]
    best.peak.df <- best.peak.list[[peaks.str]]
    samples.with.peaks <- paste(best.peak.df$sample.id)
    n.samples <- length(samples.with.peaks)
    last.cumsums <- list()
    before.cumsums <- list(left=list(), right=list())
    for(data.type in names(first.cumsums)){
      data.mat <- first.cumsums[[data.type]]
      last.cumsums[[data.type]] <-
        data.mat[nrow(data.mat),][samples.with.peaks]
      before.cumsums$left[[data.type]] <- if(loss.best$seg1.last == 1){
        structure(rep(0, n.samples), names=samples.with.peaks)
      }else{
        data.mat[loss.best$seg1.last-1,][samples.with.peaks]
      }
      before.cumsums$right[[data.type]] <-
        data.mat[loss.best$seg2.last-1,][samples.with.peaks]
    }
    peaks <- as.integer(peaks.str)
    model.i <- peaks+1L
    model <- fit$models[[model.i]]
    C.sample.i <- model$samples_with_peaks_vec + 1L
    C.sample.id <- fit$sample.id[C.sample.i]
    for(lr in names(before.cumsums)){
      R.cumsums <- as.integer(before.cumsums[[lr]]$count[C.sample.id])
      C.cumsums <- model[[paste0(lr, "_cumsum_vec")]]
      expect_equal(C.cumsums, R.cumsums)
    }
  }
  R.last.cumsums <- as.integer(last.cumsums$count[fit$sample.id])
  C.last.cumsums <- fit$last_cumsum_vec
  C.last.cumsums[is.na(R.last.cumsums)] <- NA
  expect_equal(C.last.cumsums, R.last.cumsums)
})
tdhock/PeakSegJoint documentation built on Jan. 27, 2024, 9 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
tdhock/PeakSegJoint
Joint Peak Detection in Several ChIP-Seq Samples

tests/testthat/test-OTHER-real-data.R
In tdhock/PeakSegJoint: Joint Peak Detection in Several ChIP-Seq Samples

R Package Documentation

Browse R Packages

We want your feedback!

tdhock/PeakSegJoint Joint Peak Detection in Several ChIP-Seq Samples

tests/testthat/test-OTHER-real-data.R In tdhock/PeakSegJoint: Joint Peak Detection in Several ChIP-Seq Samples

R Package Documentation

Browse R Packages

We want your feedback!

tdhock/PeakSegJoint
Joint Peak Detection in Several ChIP-Seq Samples

tests/testthat/test-OTHER-real-data.R
In tdhock/PeakSegJoint: Joint Peak Detection in Several ChIP-Seq Samples