PeakSegJointFasterOne: PeakSegJointFasterOne
In tdhock/PeakSegJoint: Joint Peak Detection in Several ChIP-Seq Samples

PeakSegJointFasterOne

R Documentation

PeakSegJointFasterOne

Description

Run the PeakSegJointFaster heuristic optimization algorithm, which gives an approximate solution to a multi-sample Poisson maximum likelihood segmentation problem. Given S samples, this function computes a sequence of S+1 PeakSegJoint models, with 0, ..., S samples with an overlapping peak (maximum of one peak per sample).

Usage

PeakSegJointFasterOne(profiles, 
    bin.factor = 2L)

Arguments

`profiles`	List of data.frames with columns chromStart, chromEnd, count, or single data.frame with additional column sample.id.
`bin.factor`	Size of bin pyramid. Bigger values result in slower computation.

Value

List of model fit results, see examples to see how to use it.

Author(s)

Toby Dylan Hocking <toby.hocking@r-project.org> [aut, cre]

Examples


library(PeakSegJoint)
data(H3K36me3.TDH.other.chunk1, envir=environment())

some.counts <- subset(
  H3K36me3.TDH.other.chunk1$counts,
  43000000 < chromEnd &
  chromStart < 43200000 &
  sample.id %in% c("McGill0023", "McGill0022", "McGill0016", "McGill0013"))

id.df <- unique(some.counts[, c("cell.type", "sample.id")])
group.list <- split(paste(id.df$sample.id), id.df$cell.type, drop=TRUE)

loss.df.list <- list()
fit.list <- list()
for(bin.factor in 2:7){
  fit.fast <- PeakSegJointFasterOne(some.counts, bin.factor)
  fit.fast$min.loss <- sum(fit.fast$peak_loss_vec)
  fit.fast$sample.loss.diff.vec <- sort(with(fit.fast, structure(
    peak_loss_vec-flat_loss_vec, names=sample.id)))
  fit.fast$group.loss.diff.vec <- sort(sapply(group.list, function(sid.vec){
    sum(fit.fast$sample.loss.diff.vec[sid.vec])
  }))
  fit.fast$sample.loss.vec <- with(fit.fast, structure(
    sum(flat_loss_vec)+cumsum(c(0, sample.loss.diff.vec)),
    names=paste0(0:length(sample.loss.diff.vec), "samples")))
  fit.fast$group.loss.vec <- with(fit.fast, structure(
    sum(flat_loss_vec)+cumsum(c(0, group.loss.diff.vec)),
    names=paste0(0:length(group.loss.diff.vec), "groups")))
  loss.df.list[[paste(bin.factor)]] <- with(fit.fast, data.frame(
    bin.factor,
    loss=sample.loss.vec,
    peaks=0:length(sample.loss.diff.vec)))
  fit.list[[paste(bin.factor)]] <- fit.fast
}
loss.df <- do.call(rbind, loss.df.list)
fit.best <- fit.list[[which.min(sapply(fit.list, "[[", "min.loss"))]]

norm.list <- list()
profile.list <- split(some.counts, some.counts$sample.id, drop=TRUE)
for(sample.id in names(profile.list)){
  one <- profile.list[[sample.id]]
  max.count <- max(one$count)
  one$count.norm <- one$count/max.count
  norm.list[[sample.id]] <- one
}
norm.df <- do.call(rbind, norm.list)

if(interactive() && require(ggplot2)){
  
  peaks.df.list <- list()
  for(n.samples in 1:length(fit.best$sample.loss.diff.vec)){
    peaks.df.list[[paste(n.samples)]] <- with(fit.best, data.frame(
      samples=n.samples,
      sample.id=names(sample.loss.diff.vec)[1:n.samples],
      chromStart=peak_start_end[1],
      chromEnd=peak_start_end[2]))
  }
  peaks <- do.call(rbind, peaks.df.list)
  best.peaks <- transform(peaks, y=samples*-0.1, what="peaks")
  ggplot()+
    ggtitle("model for each sample")+
    scale_color_manual(values=c(data="grey50",
                         peaks="deepskyblue",
                         bins="black", segments="green"))+
    geom_step(aes(chromStart/1e3, count.norm, color=what),
              data=data.frame(norm.df, what="data"))+
    geom_segment(aes(chromStart/1e3, y,
                     xend=chromEnd/1e3, yend=y,
                     color=what),
                 size=1,
                 data=best.peaks)+
    geom_text(aes(chromStart/1e3, y,
                  label=paste0(samples, " sample",
                    ifelse(samples==1, "", "s"), " "),
                  color=what),
              hjust=1,
              size=3,
              vjust=0.5,
              data=best.peaks)+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id ~ ., scales="free")

  ## same thing but for each group.
  peaks.df.list <- list()
  for(n.groups in 1:length(fit.best$group.loss.diff.vec)){
    group.vec <- names(fit.best$group.loss.diff.vec[1:n.groups])
    meta.df <- do.call(rbind, lapply(group.vec, function(cell.type){
      data.frame(cell.type, sample.id=group.list[[cell.type]])
    }))
    peaks.df.list[[paste(n.groups)]] <- with(fit.best, data.frame(
      groups=n.groups,
      meta.df,
      chromStart=peak_start_end[1],
      chromEnd=peak_start_end[2]))
  }
  peaks <- do.call(rbind, peaks.df.list)
  best.peaks <- transform(peaks, y=groups*-0.1, what="peaks")
  ggplot()+
    ggtitle("model for each group")+
    scale_color_manual(values=c(data="grey50",
                         peaks="deepskyblue",
                         bins="black", segments="green"))+
    geom_step(aes(chromStart/1e3, count.norm, color=what),
              data=data.frame(norm.df, what="data"))+
    geom_segment(aes(chromStart/1e3, y,
                     xend=chromEnd/1e3, yend=y,
                     color=what),
                 size=1,
                 data=best.peaks)+
    geom_text(aes(chromStart/1e3, y,
                  label=paste0(groups, " group",
                    ifelse(groups==1, "", "s"), " "),
                  color=what),
              hjust=1,
              size=3,
              vjust=0.5,
              data=best.peaks)+
    theme_bw()+
    theme(panel.margin=grid::unit(0, "cm"))+
    facet_grid(sample.id + cell.type ~ ., scales="free")

  min.df <- subset(loss.df, peaks==max(peaks))
  ggplot()+
    geom_line(aes(peaks, loss, group=bin.factor), data=loss.df)+
    geom_text(aes(peaks, loss, label=bin.factor), data=min.df, hjust=0)

  if(require(microbenchmark)){

    N.samples.vec <- 10^seq(1, 3, by=0.5)
    max.N <- max(N.samples.vec)
    N.bases <- 10
    rmat <- function(Nr, Nc, mu){
      matrix(rpois(Nr*Nc, mu), Nr, Nc)
    }
    set.seed(1)
    big.mat <- cbind(
      rmat(max.N, N.bases, 5),
      rmat(max.N, N.bases, 10),
      rmat(max.N, N.bases, 5))
    big.df <- data.frame(
      sample.id=as.integer(row(big.mat)),
      chromStart=as.integer(col(big.mat)-1),
      chromEnd=as.integer(col(big.mat)),
      count=as.integer(big.mat))
    full.list <- ProfileList(big.df)
    time.df.list <- list()
    for(N.samples in N.samples.vec){
      partial.list <- full.list[1:N.samples]
      result <- microbenchmark(
        Heuristic=PeakSegJointHeuristicStep2(partial.list, 2L),
        Faster=PeakSegJointFasterOne(partial.list, 2L),
        times=2L)
      time.df.list[[paste(N.samples)]] <- data.frame(
        N.samples,
        result)
    }
    time.df <- do.call(rbind, time.df.list)

    ggplot()+
      geom_point(aes(
        N.samples, time/1e9, color=expr),
        data=time.df)+
      scale_x_log10()+
      scale_y_log10("seconds")
    
  }

}

tdhock/PeakSegJoint documentation built on Dec. 9, 2024, 4:22 a.m.