Nothing
binseg <- structure(function # Binary segmentation
### Efficient C++ implementation of the classic binary segmentation
### algorithm for finding changepoints in a sequence of N data. Output
### includes columns which can be used to compute parameters for a
### single model in log-linear time, using coef method.
(distribution.str,
### String indicating distribution, use get_distribution_info to see
### possible values.
data.vec,
### Vector of numeric data to segment.
max.segments=NULL,
### Maximum number of segments to compute, default=NULL which means to
### compute the largest number possible, given is.validation.vec and
### min.segment.length. Note that the returned number of segments may
### be less than this, if there are min segment length constraints.
is.validation.vec=rep(FALSE, length(data.vec)),
### logical vector indicating which data are to be used in validation
### set, default=all FALSE (no validation set).
position.vec=seq_along(data.vec),
### integer vector of positions at which data are measured,
### default=1:length(data.vec).
weight.vec=rep(1, length(data.vec)),
### Numeric vector of non-negative weights for each data point.
min.segment.length=NULL,
### Positive integer, minimum number of data points per
### segment. Default NULL means to use min given distribution.str.
container.str="multiset"
### C++ container to use for storing breakpoints/cost. Most users
### should leave this at the default "multiset" for efficiency but you
### could use "list" if you want to study the time complexity of a
### slower implementation of binary segmentation.
){
##alias<< binsegRcpp
if(!(
is.logical(is.validation.vec) &&
length(is.validation.vec)==length(data.vec) &&
sum(is.na(is.validation.vec))==0)){
stop("is.validation.vec must be logical vector with no missing values, same length as data.vec")
}
set.value.vec <- c(validation=TRUE, subtrain=FALSE)
for(set.name in names(set.value.vec)){
set.value <- set.value.vec[[set.name]]
set.data <- data.vec[is.validation.vec == set.value]
L <- rle(set.data)
if(any(1 < L$lengths)){
warning(sprintf("some consecutive data values are identical in set=%s, so you could get speedups by converting your data to use a run length encoding, for example L=rle(data.vec);binseg(data.vec=L$values, weight.vec=L$lengths)", set.name))
}
}
if(is.null(min.segment.length)){
dist.df <- binsegRcpp::get_distribution_info()
dist.row <- dist.df[dist.df$dist==distribution.str,]
dist.params <- if(nrow(dist.row))dist.row$parameters[[1]] else c()
min.segment.length <- if(length(dist.params)==2)2L else 1L
}
if(!(
is.integer(min.segment.length) &&
length(min.segment.length)==1 &&
0<min.segment.length)){
stop("min.segment.length must be a positive integer")
}
if(length(data.vec) < min.segment.length){
stop("number of data must be at least min segment length")
}
if(is.null(max.segments)){
max.segments <- floor(sum(!is.validation.vec)/min.segment.length)
}
##details<< Each iteration involves first computing and storing the
## best split point on one or two segments, then looking up the
## segment with the best split so far. The best case time complexity
## occurs when splits are equal (N data split into two segments of
## size N/2), and the worst case is when splits are unequal (N data
## split into one big segment with N-1 data and one small segment
## with 1 data point). Looking up the segment with the best split so
## far is a constant O(1) time operation using C++ multimap, so O(K)
## overall for K iterations/segments. Storage of a new best split
## point/cost involves the multimap insert method which is
## logarithmic time in the size of the multimap, overall O(K log K)
## for equal splits and O(K) for unequal splits. Computing the cost
## values, and overall time complexity, depends on the loss. For
## normal and poisson distributions the best case O(N log K) time
## for equal splits and worst case O(N K) time for unequal
## splits. For l1/laplace distributions the best case is O(N log N
## log K) time for equal splits and worst case is O(N log N K) time
## for unequal splits.
result <- binseg_interface(
data.vec, weight.vec, max.segments,
min.segment.length,
distribution.str,
container.str,
is.validation.vec, position.vec)
na <- function(x)ifelse(x<0, NA, x)
##value<< list of class binsegRcpp with elements min.segment.length,
##distribution.str, param.names, subtrain.borders and splits, which
##is a data.table with columns:
dt <- with(result, list(
min.segment.length=min.segment.length,
distribution.str=distribution.str,
param.names=colnames(before.param.mat),
subtrain.borders=subtrain.borders,
splits=data.table(
segments=1:max.segments,##<< number of segments
loss,##<< subtrain loss
validation.loss,##<< validation loss
end=end+1L,##<< index of last data point per segment
depth=depth,##<< number of splits to reach segment
before=before.param.mat,##<< params before changepoint
after=ifelse(after.param.mat==Inf, NA, after.param.mat),##<< params after changepoint
before.size,##<< number of data before changepoint
after.size=na(after.size),##<< number of data after changepoint
invalidates.index=na(invalidates.index+1L),##<< index of param invalidated by this split.
invalidates.after=na(invalidates.after)##<< indicates if before/after params invalidated by this split.
)[loss < Inf]
))
class(dt) <- c("binsegRcpp", class(dt))
dt
##end<<
}, ex=function(){
data.table::setDTthreads(1)
x <- c(0.1, 0, 1, 1.1, 0.1, 0)
## Compute full path of binary segmentation models from 1 to 6
## segments.
(models <- binsegRcpp::binseg("mean_norm", x))
## Plot loss values using base graphics.
plot(models)
## Same loss values using ggplot2.
if(require("ggplot2")){
ggplot()+
geom_point(aes(
segments, loss),
data=models$splits)
}
## Compute data table of segments to plot.
(segs.dt <- coef(models, 2:4))
## Plot data, segments, changepoints.
if(require("ggplot2")){
ggplot()+
theme_bw()+
theme(panel.spacing=grid::unit(0, "lines"))+
facet_grid(segments ~ ., labeller=label_both)+
geom_vline(aes(
xintercept=start.pos),
color="green",
data=segs.dt[1<start])+
geom_segment(aes(
start.pos, mean,
xend=end.pos, yend=mean),
data=segs.dt,
color="green")+
xlab("Position/index")+
ylab("Data/mean value")+
geom_point(aes(
pos, x),
data=data.frame(x, pos=seq_along(x)))
}
## Use min.segment.length to constrain segment sizes.
(constrained.models <- binsegRcpp::binseg("mean_norm", x, min.segment.length = 2L))
## Demonstration of model selection using cross-validation in
## simulated data.
seg.mean.vec <- 1:5
data.mean.vec <- rep(seg.mean.vec, each=20)
set.seed(1)
n.data <- length(data.mean.vec)
data.vec <- rnorm(n.data, data.mean.vec, 0.2)
plot(data.vec)
library(data.table)
loss.dt <- data.table(seed=1:10)[, {
set.seed(seed)
is.valid <- sample(rep(c(TRUE,FALSE), l=n.data))
bs.model <- binsegRcpp::binseg("mean_norm", data.vec, is.validation.vec=is.valid)
bs.model$splits[, data.table(
segments,
validation.loss)]
}, by=seed]
loss.stats <- loss.dt[, .(
mean.valid.loss=mean(validation.loss)
), by=segments]
plot(
mean.valid.loss ~ segments, loss.stats,
col=ifelse(
mean.valid.loss==min(mean.valid.loss),
"black",
"red"))
selected.segments <- loss.stats[which.min(mean.valid.loss), segments]
full.model <- binsegRcpp::binseg("mean_norm", data.vec, selected.segments)
(segs.dt <- coef(full.model, selected.segments))
if(require("ggplot2")){
ggplot()+
theme_bw()+
theme(panel.spacing=grid::unit(0, "lines"))+
geom_vline(aes(
xintercept=start.pos),
color="green",
data=segs.dt[1<start])+
geom_segment(aes(
start.pos, mean,
xend=end.pos, yend=mean),
data=segs.dt,
color="green")+
xlab("Position/index")+
ylab("Data/mean value")+
geom_point(aes(
pos, data.vec),
data=data.frame(data.vec, pos=seq_along(data.vec)))
}
## Demo of poisson loss, weights.
data.vec <- c(3,4,10,20)
(fit1 <- binsegRcpp::binseg("poisson", data.vec, weight.vec=c(1,1,1,10)))
coef(fit1, 2L)
(fit2 <- binsegRcpp::binseg("poisson", data.vec, weight.vec=c(1,1,10,1)))
coef(fit2, 2L)
})
print.binsegRcpp <- function
### Print method for binsegRcpp.
(x,
### data.table from binseg.
...
### ignored.
){
. <- segments <- end <- loss <- validation.loss <- NULL
## Above to avoid CRAN NOTE.
cat("binary segmentation model:\n")
print(x$splits[, .(segments, end, loss, validation.loss)])
}
plot.binsegRcpp <- function
### Plot loss values from binary segmentation.
(x,
### data.table from binseg.
...
### ignored.
){
plot(loss ~ segments, x$splits)
}
coef.binsegRcpp <- function
### Compute a data table of segment start/end/mean values for all
### models given by segments.
(object,
### data.table from binseg.
segments=1:min(nrow(object$splits), 10),
### integer vector, model sizes in number of segments.
...
### ignored.
){
before.mean <- after.mean <- end <-
invalidates.after <- invalidates.index <- NULL
kmax <- nrow(object$splits)
if(!(
is.integer(segments) &&
0<length(segments) &&
all(is.finite(segments) & 0<segments & segments <= kmax)
)){
stop(sprintf("segments must be a vector of unique integers between 1 and %d", kmax))
}
data.table(segments)[, {
i <- 1:segments
cum.fit <- object$splits[i]
ord <- order(cum.fit$end)
seg.dt <- cum.fit[ord, {
start <- c(1L, end[-.N]+1L)
data.table(
start, end,
start.pos=object$subtrain.borders[start],
end.pos=object$subtrain.borders[end+1])
}]
for(param.name in object$param.names){
p <- function(b.or.a)cum.fit[[paste0(b.or.a, ".", param.name)]]
params <- c(p("before"),p("after"))
params[
cum.fit[, .N*invalidates.after+invalidates.index]
] <- NA
param.mat <- matrix(params, 2, byrow=TRUE)[, ord]
set(seg.dt, j=param.name, value=param.mat[!is.na(param.mat)])
}
seg.dt
}, by="segments"]
### data.table with one row for each segment.
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.