knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) options(width=100)
The time complexity of the algorithm depends on the number of intervals (candidate changepoints stored). Here we compute the mean number of intervals for real Mono27ac data, and synthetic count data which are always increasing.
data(Mono27ac, package="PeakSegDisk", envir=environment()) library(data.table) loss.list <- list() N.data.vec <- 10^seq(1, 3) for(penalty in c(0, 1e2, 1e4, 1e6)){ for(N.data in N.data.vec){ some.cov <- Mono27ac$coverage[1:N.data] some.inc <- data.table(some.cov) some.inc[, count := 1:.N] data.list <- list( real=some.cov, synthetic=some.inc) for(short.type in names(data.list)){ df <- data.list[[short.type]] fit <- PeakSegDisk::PeakSegFPOP_df(df, penalty) loss.list[[paste(penalty, N.data, short.type)]] <- data.table( N.data, short.type, fit$loss) } } } (loss <- do.call(rbind, loss.list))[, .( penalty, short.type, N.data, mean.intervals, max.intervals, megabytes, seconds)][order(penalty, short.type, N.data)]
Theoretically the most intervals that could be stored is $O(i)$ for
each data point $i\in{1, ..., N}$. Therefore the largest total
number of intervals is sum(1:N)
, which can also be computed by
N*(N+1)/2
. The largest mean is mean(1:N)
, which can be computed
via sum(1:N)/N
= (N+1)/2
.
(worst.dt <- data.table( N.data=N.data.vec, mean.intervals=(N.data.vec+1)/2, short.type="theoretical"))
The plot below shows that the algorithm achieves the theoretical worst case time complexity for the synthetic increasing data, when the penalty is large. But the number of intervals is always much smaller for real Mono27ac ChIP-seq data.
one <- function(short.type, data.type, color){ data.table(short.type, data.type, color) } type.dt <- rbind( one("theoretical", "Theoretical\nworst case", "grey"), one("synthetic", "Synthetic\nIncreasing", "red"), one("real", "Real ChIP-seq", "black")) loss.types <- type.dt[loss, on=list(short.type)] worst.types <- type.dt[worst.dt, on=list(short.type)] (type.colors <- type.dt[, structure(color, names=data.type)]) if(require(ggplot2)){ ggplot()+ guides( color=guide_legend(keyheight=3) )+ geom_blank(aes( N.data, 1), data=data.table(N.data=c(5, 2000)))+ theme_bw()+ theme(panel.spacing=grid::unit(0, "lines"))+ facet_grid(. ~ penalty, labeller=label_both)+ geom_line(aes( N.data, mean.intervals, color=data.type), data=worst.types)+ scale_color_manual( values=type.colors, breaks=names(type.colors))+ geom_line(aes( bedGraph.lines, mean.intervals, color=data.type), data=loss.types)+ scale_x_log10("N data")+ scale_y_log10("Mean intervals (candidate changepoints)") }
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.